Lucene 同义词搜索

来源:互联网 发布:sql anywhere 12 下载 编辑:程序博客网 时间:2024/05/18 01:52

1、自定义TokenFilter过滤器

package com.lkt.analyzer;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Synonym filter: after every token that has an entry in the synonym table,
 * emits each of its synonyms as an extra token at the same position
 * (position increment 0), so a search for a synonym matches the original word.
 *
 * @author lkt
 */
public class MyMmsegFilter extends TokenFilter {

    /** Synonym table: term -> its synonyms. Built once and only read afterwards. */
    private static final Map<String, String[]> SYNONYMS = new HashMap<String, String[]>();

    static {
        SYNONYMS.put("中国", new String[]{"兲朝", "大陸", "China"});
        SYNONYMS.put("北京", new String[]{"首都", "燕京", "Beijing"});
        SYNONYMS.put("南京", new String[]{"六朝古都", "建業", "Nanjing"});
    }

    /** Synonyms of the most recently emitted token that still have to be emitted. */
    private final Stack<String> sameStack = new Stack<String>();

    /** Attribute state captured at the token whose synonyms are currently queued. */
    private AttributeSource.State currState;

    private final CharTermAttribute cta;
    private final PositionIncrementAttribute pia;

    /**
     * Creates a synonym filter on top of the given stream.
     *
     * @param input the token stream to decorate
     */
    protected MyMmsegFilter(TokenStream input) {
        super(input);
        cta = addAttribute(CharTermAttribute.class);
        pia = addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Advances to the next token. Queued synonyms are emitted first, one per
     * call; only when none are pending is the wrapped stream advanced.
     *
     * <p>Declared {@code final} because Lucene 3.x asserts that a TokenStream
     * class, or at least its {@code incrementToken()}, is final.
     *
     * @return {@code true} if a token was produced, {@code false} at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!sameStack.isEmpty()) {
            String synonym = sameStack.pop();
            // Restore the attributes of the original token, then swap only the term text.
            restoreState(currState);
            cta.setEmpty();
            cta.append(synonym);
            // Increment 0 stacks the synonym on the same position as the original token.
            pia.setPositionIncrement(0);
            return true;
        }

        if (!input.incrementToken()) {
            return false;
        }

        if (queueSynonyms(cta.toString())) {
            // Remember the current attributes so each synonym can be emitted from them.
            currState = captureState();
        }
        return true;
    }

    /**
     * Resets the wrapped stream and drops any synonyms still queued, so a
     * reused filter does not leak tokens from the previous text.
     *
     * @throws IOException if the wrapped stream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        sameStack.clear();
        currState = null;
    }

    /**
     * Queues the synonyms of {@code word}, if it has any.
     *
     * @param word the term text of the current token
     * @return {@code true} if at least one synonym was queued
     */
    private boolean queueSynonyms(String word) {
        String[] synonyms = SYNONYMS.get(word);
        if (synonyms == null || synonyms.length == 0) {
            return false;
        }
        for (String synonym : synonyms) {
            sameStack.push(synonym);
        }
        return true;
    }
}
2、自定义分析器（Analyzer）：在 MMSeg 分词结果上套用上面的同义词过滤器

package com.lkt.analyzer;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.store.Directory;

import com.chenlb.mmseg4j.Chunk;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.Sentence;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

/**
 * Analyzer that segments text with mmseg4j's {@link MaxWordSeg} and then runs
 * the tokens through {@link MyMmsegFilter} to inject synonyms.
 *
 * <p>The class is {@code final} because Lucene 3.x asserts that an Analyzer
 * class, or at least its {@code tokenStream()} and
 * {@code reusableTokenStream()}, is final.
 */
public final class MyMmsegAnalyzer extends Analyzer {

    /** Dictionary directory used by the no-argument constructor. */
    private static final String DEFAULT_DICTIONARY_PATH =
            "F:\\学习资料\\Lucene\\mmseg4j-1.8.5\\data";

    /** Directory holding the mmseg4j dictionary files. */
    private final String dictionaryPath;

    /** Creates an analyzer that reads the dictionary from the default location. */
    public MyMmsegAnalyzer() {
        this(DEFAULT_DICTIONARY_PATH);
    }

    /**
     * Creates an analyzer that reads the dictionary from the given directory.
     *
     * @param dictionaryPath directory containing the mmseg4j dictionary files
     * @throws IllegalArgumentException if {@code dictionaryPath} is {@code null}
     */
    public MyMmsegAnalyzer(String dictionaryPath) {
        if (dictionaryPath == null) {
            throw new IllegalArgumentException("dictionaryPath must not be null");
        }
        this.dictionaryPath = dictionaryPath;
    }

    /**
     * Builds the analysis chain for one field: MMSeg tokenizer followed by the
     * synonym filter.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    source of the text to analyze
     * @return the synonym-expanding token stream
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        Dictionary dic = Dictionary.getInstance(dictionaryPath);
        return new MyMmsegFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
    }
}


3、测试

package com.lkt.lucene;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
import com.lkt.analyzer.MyMmsegAnalyzer;
import com.lkt.analyzer.MyStopAnalyzer;
import com.lkt.util.AnalyzerUtil;

/**
 * Indexes one document with {@link MyMmsegAnalyzer} and then searches for a
 * synonym term, printing the stored content of every hit.
 */
public class TestAnalyzerUtil {

    /**
     * Indexes a sentence into an in-memory directory and queries it for the
     * term "首都", printing the "content" field of each matching document.
     *
     * @throws IOException if indexing or searching fails; propagated so the
     *                     test fails instead of only printing a stack trace
     */
    @Test
    public void testDisplayAnalyzer() throws IOException {
        String str = "北京上海南京江苏南京北京中国重庆天津";
        // To inspect the token stream instead of searching:
        // new AnalyzerUtil().displayAnalyzer(str, new MyMmsegAnalyzer());

        Directory dir = new RAMDirectory();

        IndexWriter writer = new IndexWriter(dir,
                new IndexWriterConfig(Version.LUCENE_35, new MyMmsegAnalyzer()));
        try {
            Document doc = new Document();
            doc.add(new Field("content", str, Store.YES, Index.ANALYZED));
            writer.addDocument(doc);
        } finally {
            // Close even when addDocument fails so the writer is not left open.
            writer.close();
        }

        IndexReader reader = IndexReader.open(dir);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                TermQuery tq = new TermQuery(new Term("content", "首都"));
                TopDocs td = searcher.search(tq, 10);
                for (ScoreDoc sd : td.scoreDocs) {
                    Document dd = searcher.doc(sd.doc);
                    System.out.println(dd.get("content"));
                }
            } finally {
                searcher.close();
            }
        } finally {
            // The reader was opened here, so it is closed here as well as the searcher.
            reader.close();
        }
    }
}



0 0
原创粉丝点击