A custom Lucene synonym analyzer


1. To write a custom analyzer, first understand how Analyzer, Tokenizer and TokenFilter relate to each other

An Analyzer is built from two kinds of core components: a Tokenizer and TokenFilters. The difference is that the Tokenizer works on the raw character stream, while a TokenFilter works on the stream of tokens. The Tokenizer is the first stage of an Analyzer and its constructor takes a Reader; a TokenFilter behaves like an interceptor wrapped around it, and its constructor takes a TokenStream (a Tokenizer is itself a TokenStream).

      For more detail, see Lucene源码解析 – Analyzer之Tokenizer (a Lucene source-code walkthrough of Tokenizer inside Analyzer); a minimal sketch of the pattern follows.
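To make the relationship concrete, here is a minimal sketch (my own illustration, not taken from the referenced article) of an Analyzer that chains a Tokenizer and a TokenFilter with the Lucene 3.6 API. WhitespaceTokenizer reads characters from the Reader, and LowerCaseFilter wraps it and rewrites the tokens it emits:

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

public class LowerCaseWhitespaceAnalyzer extends Analyzer {
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // the Tokenizer consumes raw characters from the Reader...
        Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_36, reader);
        // ...and the TokenFilter wraps it, transforming the tokens it produces
        return new LowerCaseFilter(Version.LUCENE_36, source);
    }
}

The MyTokenSameWordAnalyzer we build in section 2 follows exactly this pattern, with MMSegTokenizer as the source and MyTokenSameWordFilter as the wrapping filter.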

2. Defining our own Analyzer: MyTokenSameWordAnalyzer

  • A note first: I am using the Lucene 3.6.2 jars, which do not ship a Chinese tokenizer, so I added mmseg4j-all-1.8.5.jar; its MMSegAnalyzer provides Chinese word segmentation (a quick direct-usage sketch follows this list).
  • We therefore model our own MyTokenSameWordAnalyzer on that MMSegAnalyzer class.
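For reference, using MMSegAnalyzer directly, before any customization, looks roughly like the sketch below. This is my own illustration: the class name MMSegQuickTest and the sample sentence are made up, and the no-arg constructor is assumed to fall back to the dictionary bundled with mmseg4j.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

public class MMSegQuickTest {
    public static void main(String[] args) throws Exception {
        // no-arg constructor: assumed to use the dictionary bundled with mmseg4j;
        // pass a path to the constructor if you keep the dictionary elsewhere
        Analyzer analyzer = new MMSegAnalyzer();
        TokenStream ts = analyzer.tokenStream("content", new StringReader("我叫玉朝阳,来自中国"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.toString());   // print each segmented Chinese word
        }
    }
}

With that as the model, MyTokenSameWordAnalyzer is written as follows: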
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

public class MyTokenSameWordAnalyzer extends Analyzer {

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // the word dictionary shipped in the mmseg4j source package ("data" directory); point this at your own copy
        Dictionary dic = Dictionary.getInstance("F:\\jar包\\java各种jar包集合\\mmseg\\mmseg4j-1.8.5\\data");
        // MMSegTokenizer's newSeg() is protected, so we construct the Seg ourselves with new MaxWordSeg(dic);
        // this also means the dictionary location has to be given explicitly instead of relying on the default
        return new MyTokenSameWordFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
    }
}

3. Defining our own Filter: MyTokenSameWordFilter

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

public class MyTokenSameWordFilter extends TokenFilter {

    // word -> its synonyms, loaded once from the dictionary files
    private static Map<String, String[]> mp = new HashMap<String, String[]>();

    private CharTermAttribute cta = null;
    private PositionIncrementAttribute pia = null;
    private Stack<String> stack = null;      // synonyms of the current token waiting to be emitted (Stack is a LIFO subclass of Vector)
    private AttributeSource.State current;   // captured state of the original token

    static {
        File file = new File("D:\\LuceneData\\local05\\samewords"); // my own synonym dictionary directory
        if (file.exists()) {
            File[] files = file.listFiles();
            try {
                for (File file2 : files) {
                    if (file2.isFile()) {
                        List<String> readLines = FileUtils.readLines(file2, "GBK");
                        if (readLines != null) {
                            for (String line : readLines) {
                                // each line has the form  word=>synonym1,synonym2
                                String head = StringUtils.substring(line, 0, StringUtils.indexOf(line, "=>"));
                                String tail = StringUtils.substring(line, StringUtils.indexOf(line, "=>") + 2, line.length());
                                String[] end = StringUtils.split(tail, ",");
                                mp.put(head, end);
                            }
                        }
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    public MyTokenSameWordFilter(TokenStream input) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);           // the term text
        pia = this.addAttribute(PositionIncrementAttribute.class);  // the position increment
        stack = new Stack<String>();
    }

    @Override
    public boolean incrementToken() throws IOException {
        // first drain any synonyms queued for the previous token
        while (stack.size() > 0) {
            String sw = stack.pop();        // pop one synonym of the current term
            restoreState(current);          // restore the captured state of the original token
            cta.setEmpty();
            cta.append(sw);
            pia.setPositionIncrement(0);    // place the synonym at the same position as the original token
            return true;                    // emit the synonym as an extra token
        }
        if (!input.incrementToken()) return false;
        if (getSamesWords(cta.toString())) {
            current = captureState();       // remember the original token so its state can be restored for each synonym
        }
        return true;
    }

    public boolean getSamesWords(String name) {
        String[] obj = mp.get(name);
        if (obj != null) {
            for (String str : obj) {
                stack.push(str);
            }
            return true;
        }
        return false;
    }
}
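From the static initializer above, each dictionary file under D:\LuceneData\local05\samewords is a GBK text file in which every line has the form word=>synonym1,synonym2. The entries below are only an illustration (the original dictionary is not shown in this post); note that the mapping is one-directional, so for the query "俺叫" in step 4 to match the indexed text "我叫…", the file needs an entry that maps 俺 back to 我:

我=>俺,咱
俺=>我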

4. Defining our own TestSearcher

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;

import com.bjxy.lucene5.util.AnalyzerUtil;

public class TestSearcher {

    private MyIndexMain searcher = null;
    private Directory directory = null;
    private Analyzer a1 = null;
    private Analyzer a2 = null;
    private Analyzer a3 = null;
    private Analyzer a4 = null;
    String field = "this is xy , welcome to my house,friends my qq is 1229396220 and mail is yzuchaoyang@foxmail.com";
    private static String pathname = "D:\\LuceneIndex\\Index05";

    @Before
    public void createDirectory() {
        try {
            directory = FSDirectory.open(new File(pathname));
            searcher = new MyIndexMain();
            // the stock analyzers below do not handle Chinese well
            a1 = new StandardAnalyzer(Version.LUCENE_36);   // drops stop words such as this/that, punctuation and special symbols
            a2 = new StopAnalyzer(Version.LUCENE_36);       // additionally drops numbers
            a3 = new SimpleAnalyzer(Version.LUCENE_36);     // splits on non-letters, so numbers, punctuation and special symbols are dropped
            a4 = new WhitespaceAnalyzer(Version.LUCENE_36); // splits on whitespace only
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Test
    public void testSameWordsAnalyzer01() throws Exception {
        try {
            field = "我叫玉朝阳,来自中国的一个保定市的小农村里";
            AnalyzerUtil.displayAllTokenInfo(field, new MyTokenSameWordAnalyzer());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Test
    public void testMyIndexWirter() throws Exception {
        try {
            searcher.myIndexWirter(directory, true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Test
    public void testMyIndexReader() throws Exception {
        try {
            searcher.myIndexReader(directory, "俺叫", 1, 5); // with the synonym analyzer this query now matches "我叫…"
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

5. Defining our own AnalyzerUtil

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class AnalyzerUtil {

    public static void displayToken(String field, Analyzer a) {
        try {
            TokenStream tstream = a.tokenStream("content", new StringReader(field));
            CharTermAttribute cta = tstream.addAttribute(CharTermAttribute.class);
            while (tstream.incrementToken()) {
                System.out.println("cta: " + cta);
            }
            System.out.println("-----------------------------");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void displayAllTokenInfo(String field, Analyzer a) {
        try {
            TokenStream tstream = a.tokenStream("content", new StringReader(field));
            // position increment: the distance between this token and the previous one
            PositionIncrementAttribute pia = tstream.addAttribute(PositionIncrementAttribute.class);
            // start/end character offsets of the token
            OffsetAttribute oa = tstream.addAttribute(OffsetAttribute.class);
            // token type information reported by the tokenizer
            TypeAttribute ta = tstream.addAttribute(TypeAttribute.class);
            // the token's term text
            CharTermAttribute cta = tstream.addAttribute(CharTermAttribute.class);
            while (tstream.incrementToken()) {
                System.out.println("cta: " + cta + " ta: " + ta.type() + " pia: " + pia.getPositionIncrement()
                        + " [" + oa.startOffset() + "-" + oa.endOffset() + "]");
            }
            System.out.println("-----------------------------");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
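Running displayAllTokenInfo with MyTokenSameWordAnalyzer on the sentence from testSameWordsAnalyzer01 is the quickest way to see the filter at work: given a dictionary entry such as 我=>俺, the synonym 俺 is printed immediately after 我 with pia: 0 and the same [start-end] offsets, because the filter restores the original token's captured state and only overwrites the term text and position increment.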

6. Defining our own MyIndexMain

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

import cn.bjxy.lucene2.util.LuceneUtil1;
import com.bjxy.lucene3.util.ReaderUtil;
import com.bjxy.lucene4.util.SearcherUtil;

public class MyIndexMain {

    public void myIndexWirter(Directory directory, boolean hasNew) {
        IndexWriter indexWriter = null;
        try {
            indexWriter = LuceneUtil1.createIndexWriter(directory, Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
            if (hasNew) {
                indexWriter.deleteAll(); // wipe the existing index before rebuilding it
            }
            // create one Document per file in the data directory
            File file = new File("D:\\LuceneData\\local05\\data\\");
            Document doc = null;
            for (File eFl : file.listFiles()) {
                // add the Fields for each Document
                String readFileToString = FileUtils.readFileToString(eFl, "GBK");
                System.out.println(readFileToString);
                doc = new Document();
                // "content" is indexed but not stored, so doc.get("content") returns null at search time;
                // switch to Field.Store.YES if you want to print the content from the hit
                doc.add(new Field("content", readFileToString, Field.Store.NO, Field.Index.ANALYZED));
                doc.add(new Field("path", eFl.getAbsolutePath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("filename", eFl.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(eFl.lastModified()));
                doc.add(new NumericField("size", Field.Store.YES, true).setIntValue((int) (eFl.length() / 1024))); // bytes -> KB
                // add the document to the index via the IndexWriter
                indexWriter.addDocument(doc);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (CorruptIndexException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public static ScoreDoc getLastScoreDoc(int pageIndex, int pageSize, Query query, IndexSearcher searcher) throws Exception {
        if (pageIndex == 1) return null; // first page: there is no previous ScoreDoc
        int num = (pageIndex - 1) * pageSize;
        TopDocs docs = searcher.search(query, num);
        ScoreDoc last = (docs.scoreDocs)[num - 1];
        return last;
    }

    public void myIndexReader(Directory directory, String queryName, int pageIndex, int pageSize) {
        IndexSearcher searcher = null;
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        try {
            // parse the query with the synonym analyzer; the default search field is "content"
            QueryParser paser = new QueryParser(Version.LUCENE_36, "content", new MyTokenSameWordAnalyzer());
            Query query = paser.parse(queryName); // whitespace between query terms defaults to OR
            searcher = ReaderUtil.getIndexSearcher(directory);
            ScoreDoc lastScoreDoc = SearcherUtil.getLastScoreDoc(pageIndex, pageSize, query, searcher);
            TopDocs searchAfter = searcher.searchAfter(lastScoreDoc, query, pageSize);
            ScoreDoc[] afters = searchAfter.scoreDocs;
            for (ScoreDoc sc : afters) {
                Document doc = searcher.doc(sc.doc);
                System.out.println(doc.get("filename") + "---->" + doc.get("content") + "---->"
                        + format.format(Long.valueOf(doc.get("date"))));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
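LuceneUtil1.createIndexWriter is a small project helper that is not shown in this post. Below is a hedged sketch of what it might look like with the Lucene 3.6 API; the method name and signature are taken from the call above, but the body is my own assumption:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

public class LuceneUtil1 {
    // assumed implementation: build an IndexWriter from a Directory, a Lucene Version and an Analyzer
    public static IndexWriter createIndexWriter(Directory directory, Version version, Analyzer analyzer) throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(version, analyzer);
        return new IndexWriter(directory, config);
    }
}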

7. Defining our own ReaderUtil

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;

public class ReaderUtil {

    private static IndexReader indexReader = null;

    private ReaderUtil() {}

    public static IndexSearcher getIndexSearcher(Directory directory) throws Exception {
        if (indexReader == null) {
            synchronized (ReaderUtil.class) {
                if (indexReader == null) {
                    indexReader = IndexReader.open(directory);
                } else {
                    // another thread opened the reader first; pick up any index changes
                    IndexReader iR2 = IndexReader.openIfChanged(indexReader);
                    if (iR2 != null) {
                        indexReader.close();
                        indexReader = iR2;
                    }
                }
            }
        } else {
            // reuse the cached reader, reopening it only if the index has changed
            IndexReader iR2 = IndexReader.openIfChanged(indexReader);
            if (iR2 != null) indexReader = iR2;
        }
        return new IndexSearcher(indexReader);
    }
}

8. The data source

My data source is D:\LuceneData\local05\data\company.txt, whose content is: 我是小玉来自一个神奇的国度,这个地方叫中国,在这里的人们每天都过得很嗨皮!
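Putting the pieces together: testMyIndexWirter indexes company.txt with a plain StandardAnalyzer, so no synonyms are stored in the index; the expansion happens only at query time, where testMyIndexReader parses "俺叫" with MyTokenSameWordAnalyzer, which adds the dictionary synonyms of each query term as extra terms and therefore matches the document.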