自定义lucene的同义词分词器
来源:互联网 发布:pinyinime 源码 编辑:程序博客网 时间:2024/05/14 14:41
1.自定义分词器首先要了解Analyzer、Tokenizer和TokenFilter三者之间的联系
Analyzer包含两个核心组件,Tokenizer以及TokenFilter。两者的区别在于,前者在字符级别处理流,而后者则在词语级别处理流。Tokenizer是Analyzer的第一步,其构造函数接收一个Reader作为参数,而TokenFilter则是一个类似的拦截器,其参数可以是TokenStream、Tokenizer。
详细介绍请看 Lucene源码解析–Analyzer之Tokenizer
2.定义自己的Analyzer——MyTokenSameWordAnalyzer
- 这里我要说明一下我用的lucene的包是3.6.2的,因为没有汉字的分词器所以我是用了mmseg4j-all-1.8.5.jar,MMSegAnalyzer是提供了汉字分词
- 所以我们就仿照这个MMSegAnalyzer类来写我们的MyTokenSameWordAnalyzer类
import java.io.IOException;import java.io.Reader;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.store.Directory;import com.chenlb.mmseg4j.Dictionary;import com.chenlb.mmseg4j.MaxWordSeg;import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;import com.chenlb.mmseg4j.analysis.MMSegTokenizer;public class MyTokenSameWordAnalyzer extends Analyzer { public MyTokenSameWordAnalyzer() { // TODO Auto-generated constructor stub } @Override public TokenStream tokenStream(String fieldName, Reader reader) { // TODO Auto-generated method stub Dictionary dic = Dictionary.getInstance("F:\\jar包\\java各种jar包集合\\mmseg\\mmseg4j-1.8.5\\data"); //这个是mmseg4j的源码包下边的dictionary,自己的分词字典 return new MyTokenSameWordFilter(new MMSegTokenizer(new MaxWordSeg(dic),reader));//通过MMSegTokenizer源码我们发现newSeg()是受保护的,所以我们继续网上找到了new MaxWordSeg(dic),我们也由此得知dic是个默认地址我们需要手工指定一个dictionary }}
3.定义自己的Filter——MyTokenSameWordFilter
import java.io.File;import java.io.IOException;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.Stack;import org.apache.commons.io.FileUtils;import org.apache.commons.lang3.StringUtils;import org.apache.lucene.analysis.TokenFilter;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;import org.apache.lucene.util.AttributeSource;public class MyTokenSameWordFilter extends TokenFilter { private static Map<String,String[]> mp = new HashMap<String,String[]>(); private CharTermAttribute cta = null; private PositionIncrementAttribute pia = null; private Stack<String> stack = null; // 栈是Vector的一个子类,它实现了一个标准的后进先出的栈。 private AttributeSource.State current; static{ File file = new File("D:\\LuceneData\\local05\\samewords");//我自己定义的同义词字典 if (file.exists()) { File[] files = file.listFiles(); try { for (File file2 : files) { if(file2.isFile()){ List readLines = FileUtils.readLines(file2,"GBK"); if(readLines!=null){ Object[] array = readLines.toArray(); for(int i=0;i<array.length;i++ ){ String head = StringUtils.substring(array[i].toString(),0,StringUtils.indexOf(array[i].toString(), "=>")); String weibu = StringUtils.substring(array[i].toString(),StringUtils.indexOf(array[i].toString(), "=>")+2,array[i].toString().length()); String[] end = StringUtils.split(weibu,","); mp.put(head, end); } } } } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } public MyTokenSameWordFilter(TokenStream input) { super(input); cta = this.addAttribute(CharTermAttribute.class); //元素 pia = this.addAttribute(PositionIncrementAttribute.class); stack = new Stack<String>();//每次要取第一个元素是重新生成一下stack } @Override public boolean incrementToken() throws IOException { //获取分词结果 while(stack.size()>0){ //将当前的cta的同义词出栈 String sw = stack.pop(); //还原为当前状态 restoreState(current); // cta.setEmpty(); cta.append(sw); 
pia.setPositionIncrement(pia.getPositionIncrement()); return true; //return之后输出新的元素 } if(!input.incrementToken()) return false; if(getSamesWords(cta.toString())){ current = captureState(); }// System.out.println(cta); return true; } public boolean getSamesWords(String name){ String[] obj = mp.get(name); if(obj!=null){ for(String str:obj){ stack.push(str); } return true; } return false; }}
4.定义自己的TestSearcher
import java.io.File;import java.io.IOException;import org.apache.commons.io.FileUtils;import org.apache.commons.io.FilenameUtils;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.SimpleAnalyzer;import org.apache.lucene.analysis.StopAnalyzer;import org.apache.lucene.analysis.WhitespaceAnalyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.store.RAMDirectory;import org.apache.lucene.util.Version;import org.junit.Before;import org.junit.Test;import com.bjxy.lucene4.util.SearcherUtil;import com.bjxy.lucene5.analyzer.MyStopAnalyzer;import com.bjxy.lucene5.util.AnalyzerUtil;import com.chenlb.mmseg4j.MMSeg;import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;public class TestSearcher { private MyIndexMain searcher = null; private Directory directory = null; private Analyzer a1 = null; private Analyzer a2 = null; private Analyzer a3 = null; private Analyzer a4 = null; String field = "this is xy , welcome to my house,friends my qq is 1229396220 and mail is yzuchaoyang@foxmail.com"; private static String pathname = "D:\\LuceneIndex\\Index05"; @Before public void createDirectory(){ try { directory = FSDirectory.open(new File(pathname)); searcher = new MyIndexMain(); //当前分词器不适用于汉字 a1 = new StandardAnalyzer(Version.LUCENE_36 );//不拆介词和this,that,标点,特殊符号这些词 a2 = new StopAnalyzer(Version.LUCENE_36 ); //不拆分数字和介词和this,that,标点,特殊符号这些词 a3 = new SimpleAnalyzer(Version.LUCENE_36 ); //不拆分数字,和标点,特殊符号 a4 = new WhitespaceAnalyzer(Version.LUCENE_36 );//以空格进行拆分 } catch (IOException e) { e.printStackTrace(); } } @Test public void testSameWordsAnalyzer01() throws Exception{ try { field = "我叫玉朝阳,来自中国的一个保定市的小农村里"; AnalyzerUtil.displayAllTokenInfo(field,new MyTokenSameWordAnalyzer()); } catch (Exception e) { // TODO: handle exception e.printStackTrace(); } } @Test public void testMyIndexWirter() throws Exception{ try { 
searcher.myIndexWirter(directory,true);// AnalyzerUtil.displayAllTokenInfo(field,new MyTokenSameWordAnalyzer()); } catch (Exception e) { // TODO: handle exception e.printStackTrace(); } } @Test public void testMyIndexReader() throws Exception{ try { searcher.myIndexReader(directory,"俺叫",1,5); //通过同义词分词器就能查询了// AnalyzerUtil.displayAllTokenInfo(field,new MyTokenSameWordAnalyzer()); } catch (Exception e) { // TODO: handle exception e.printStackTrace(); } }}
5.定义自己的AnalyzerUtil
import java.io.StringReader;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;import org.apache.lucene.analysis.tokenattributes.TypeAttribute;public class AnalyzerUtil { public static void displayToken(String field,Analyzer a){ try { TokenStream tstream = a.tokenStream("content", new StringReader(field)); CharTermAttribute cta = tstream .addAttribute(CharTermAttribute.class); while (tstream.incrementToken()) { System.out.println("cta: " + cta); } System.out.println("-----------------------------"); } catch (Exception e) { e.printStackTrace(); } } public static void displayAllTokenInfo(String field,Analyzer a){ try { TokenStream tstream = a.tokenStream("content", new StringReader(field)); //位置增量属性,存储语汇单元之间的距离 PositionIncrementAttribute pia = tstream.addAttribute(PositionIncrementAttribute.class); //存储语汇单元的位偏移量 OffsetAttribute oa = tstream.addAttribute(OffsetAttribute.class); //使用的分词器的类型信息 TypeAttribute ta = tstream.addAttribute(TypeAttribute.class); //存储每个语汇单元的信息(分词单元信息) CharTermAttribute cta = tstream.addAttribute(CharTermAttribute.class);// while (tstream.incrementToken()) {// System.out.println("cta: "+cta+" ta: "+ta.type()+" pia: " + pia.getPositionIncrement()+" ["+oa.startOffset()+"-"+oa.endOffset()+"]");// } for(;tstream.incrementToken();){ System.out.println("cta: "+cta+" ta: "+ta.type()+" pia: " + pia.getPositionIncrement()+" ["+oa.startOffset()+"-"+oa.endOffset()+"]"); } System.out.println("-----------------------------"); } catch (Exception e) { e.printStackTrace(); } }}
6.定义自己的MyIndexMain
import java.io.File;import java.io.FileReader;import java.io.IOException;import java.text.SimpleDateFormat;import org.apache.commons.io.FileUtils;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.NumericField;import org.apache.lucene.index.CorruptIndexException;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.queryParser.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.util.Version;import cn.bjxy.lucene2.util.LuceneUtil1;import com.bjxy.lucene3.util.ReaderUtil;import com.bjxy.lucene4.util.SearcherUtil;public class MyIndexMain { public void myIndexWirter(Directory directory,boolean hasNew) { IndexWriter indexWriter = null; try{ indexWriter = LuceneUtil1.createIndexWriter(directory, Version.LUCENE_36 ,new StandardAnalyzer(Version.LUCENE_36)); if(hasNew){ indexWriter.deleteAll(); //创建索引之前,先把文档清空掉 } //3.创建Document对象 File file = new File("D:\\LuceneData\\local05\\data\\"); Document doc = null; for(File eFl:file.listFiles()){ //4.创建Document对应的Field信息 String readFileToString = FileUtils.readFileToString(eFl,"GBK"); System.out.println(readFileToString); doc = new Document();// doc.add(new Field("content",new FileReader(eFl)));// doc.add(new Field("content",readFileToString,Field.Store.YES,Field.Index.ANALYZED)); doc.add(new Field("content",readFileToString,Field.Store.NO,Field.Index.ANALYZED)); doc.add(new Field("path",eFl.getAbsolutePath(),Field.Store.YES,Field.Index.NOT_ANALYZED)); doc.add(new Field("filename",eFl.getName(),Field.Store.YES,Field.Index.NOT_ANALYZED)); doc.add(new NumericField("date",Field.Store.YES,true).setLongValue(eFl.lastModified())); doc.add(new 
NumericField("size",Field.Store.YES,true).setIntValue((int)(eFl.length()/1024)));//字节转化为k //5.通过IndexWriter将文档添加到索引中 indexWriter.addDocument(doc); } }catch (Exception e) { e.printStackTrace(); }finally{ if(indexWriter!=null) try { indexWriter.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } public static ScoreDoc getLastScoreDoc(int pageIndex,int pageSize,Query query,IndexSearcher searcher) throws Exception{ if(pageIndex == 1 ) return null; //如果是第一页就返回空 int num = (pageIndex-1)*pageSize; TopDocs docs = searcher.search(query, num); ScoreDoc last = (docs.scoreDocs)[num-1]; return last; } public void myIndexReader(Directory directory, String queryName, int pageIndex, int pageSize) { // TODO Auto-generated method stub IndexSearcher searcher = null; SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); try{ QueryParser paser = new QueryParser(Version.LUCENE_36,"content",new MyTokenSameWordAnalyzer()); //默认搜索域 content Query query = paser.parse(queryName); //有tom或者jerry的,默认空格为OR[对字段内容进行判断] 有两条 searcher = ReaderUtil.getIndexSearcher(directory); ScoreDoc lastScoreDoc = SearcherUtil.getLastScoreDoc(pageIndex, pageSize, query, searcher); TopDocs searchAfter = searcher.searchAfter(lastScoreDoc, query,pageSize); ScoreDoc[] afters = searchAfter.scoreDocs; for(ScoreDoc sc:afters){ Document doc = searcher.doc(sc.doc); System.out.println(doc.get("filename")+"---->"+doc.get("content")+"---->"+format.format(Long.valueOf(doc.get("date")))); } }catch (Exception e) { e.printStackTrace(); } }}
7.定义自己的ReaderUtil
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;

import cn.bjxy.lucene2.util.LuceneUtil1;

/**
 * Caches one shared IndexReader and hands out fresh IndexSearchers over it,
 * reopening the reader when the index has changed on disk.
 */
public class ReaderUtil {

    // volatile is required for the cross-thread publication of the reader
    // (the original plain field made the lazy init unsafe).
    private static volatile IndexReader indexReader = null;

    private ReaderUtil() {} // utility class — no instances

    /**
     * Returns a searcher over the (possibly just refreshed) shared reader.
     * The whole open/refresh path is synchronized: the original refreshed the
     * reader outside the lock (race) and, in one branch, unconditionally
     * assigned the possibly-null result of openIfChanged, nulling the cache.
     */
    public static IndexSearcher getIndexSearcher(Directory directory) throws Exception {
        synchronized (ReaderUtil.class) {
            if (indexReader == null) {
                indexReader = IndexReader.open(directory);
            } else {
                // openIfChanged returns null when the index is unchanged.
                IndexReader newer = IndexReader.openIfChanged(indexReader);
                if (newer != null) {
                    indexReader.close(); // BUGFIX: close the stale reader (the old unsynchronized path leaked it)
                    indexReader = newer;
                }
            }
            return new IndexSearcher(indexReader);
        }
    }
}
8.数据源
我的数据源为D:\LuceneData\local05\data\company.txt内容为:我是小玉来自一个神奇的国度,这个地方叫中国,在这里的人们每天都过得很嗨皮!
0 0
- 自定义lucene的同义词分词器
- Lucene自定义同义词分词器
- Lucene实现自定义中文同义词分词器
- Lucene 5.3 自定义同义词分词器
- Lucene实现自定义分词器(同义词查询与高亮)
- lucene构建同义词分词器
- Lucene 3.6.2入门:自定义停用词分词器和同义词分词器
- MMSegAnalyzer 自定义 同义词分词器
- Lucene同义词分词器简单实现
- Lucene 自定义分词器
- lucene3.5实现自定义同义词分词器
- 04_java Lucene学习——分词Analyzer(02):lucene4.0_学写简单的中文同义词分词器
- 自定义lucene分词器,单字分词
- Lucene采用自定义分词器
- Lucene实现自定义分词器
- 自定义Lucene分词器示例
- 基于Lucene的同义词分词器
- lucene 自定义分词器小程序
- Archive Data DSO Demo 1
- LinkedList,ArrayList和Vector
- java文件流处理jd-gui反编译后文件中每行的注释符
- 查找(1)---顺序查找
- java中gc()与finalize()
- 自定义lucene的同义词分词器
- android L平台增加来电翻转静音菜单、功能
- java.lang.IllegalStateException: Expected BEGIN_OBJECT but was STRING at line 1 column 1
- 292. Nim Game
- X86汇编 通用寄存器总结
- 解压版MySQL安装
- 17.定义栈的数据结构,请在该类型中实现一个能够得到栈最小元素的min函数。
- Python将txt文件输入到MySQL数据库中
- pyinstaller打包问题,关于skleran