Lucene 同义词搜索
来源:互联网 发布:sql anywhere 12 下载 编辑:程序博客网 时间:2024/05/18 01:52
1、自定义TokenFilter过滤器
package com.lkt.analyzer;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Synonym filter: after each token that has an entry in the synonym table,
 * emits every synonym of that token with a position increment of 0, so the
 * synonyms are stacked on the same position as the original term.
 *
 * @author lkt
 */
public class MyMmsegFilter extends TokenFilter {

    /** Synonym table: original term -> its synonyms. Shared, never modified after class init. */
    private static final Map<String, String[]> SAME_MAP = new HashMap<String, String[]>();

    static {
        SAME_MAP.put("中国", new String[]{"兲朝","大陸","China"});
        SAME_MAP.put("北京", new String[]{"首都","燕京","Beijing"});
        SAME_MAP.put("南京", new String[]{"六朝古都","建業","Nanjing"});
    }

    /** Synonyms of the most recently read token that still have to be emitted. */
    private final Stack<String> sameStack = new Stack<String>();

    /** Attribute state captured at the token whose synonyms are pending. */
    private AttributeSource.State currState;

    private final CharTermAttribute cta;
    private final PositionIncrementAttribute pia;

    /**
     * Wraps {@code input} and registers the term and position-increment
     * attributes this filter reads and writes.
     *
     * @param input the upstream token stream
     */
    protected MyMmsegFilter(TokenStream input) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Emits a pending synonym if there is one, otherwise advances the
     * wrapped stream and queues the synonyms of the new token.
     *
     * <p>Declared {@code final}: Lucene requires a non-final TokenStream
     * subclass to have a final {@code incrementToken()} and checks this
     * when Java assertions are enabled.
     *
     * @return {@code true} if a token was produced, {@code false} at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!sameStack.isEmpty()) {
            String synonym = sameStack.pop();
            // Restore the attributes captured at the original token, then
            // overwrite only the term text and the position increment.
            restoreState(currState);
            cta.setEmpty();
            cta.append(synonym);
            // Increment 0 places the synonym on the original token's position.
            pia.setPositionIncrement(0);
            return true;
        }

        if (!input.incrementToken()) {
            return false;
        }

        if (getSameWord(cta.toString())) {
            // Remember the current attributes so each synonym can start from them.
            currState = captureState();
        }
        return true;
    }

    /**
     * Resets the wrapped stream and discards pending synonyms and captured
     * state, so a reused filter does not emit leftovers from a previous run.
     *
     * @throws IOException if the wrapped stream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        sameStack.clear();
        currState = null;
    }

    /**
     * Pushes all synonyms of {@code word} onto the pending stack. Because the
     * stack is LIFO, they are later emitted in reverse table order.
     *
     * @param word the term to look up
     * @return {@code true} if at least one synonym was queued
     */
    private boolean getSameWord(String word) {
        String[] sm = SAME_MAP.get(word);
        if (sm == null || sm.length == 0) {
            return false;
        }
        for (String s : sm) {
            sameStack.push(s);
        }
        return true;
    }
}
// 2、自定义分词器 (article section 2: custom analyzer)
package com.lkt.analyzer;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.store.Directory;

import com.chenlb.mmseg4j.Chunk;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.Sentence;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

/**
 * Analyzer that tokenizes with mmseg4j's {@link MaxWordSeg} and then passes
 * the tokens through {@link MyMmsegFilter}.
 *
 * <p>The class is {@code final}: Lucene requires Analyzer subclasses to be
 * final (or to have final tokenStream/reusableTokenStream methods) and checks
 * this when Java assertions are enabled.
 */
public final class MyMmsegAnalyzer extends Analyzer {

    /** Dictionary directory used by the no-argument constructor. */
    private static final String DEFAULT_DICTIONARY_PATH = "F:\\学习资料\\Lucene\\mmseg4j-1.8.5\\data";

    /** Directory passed to {@code Dictionary.getInstance}. */
    private final String dictionaryPath;

    /** Creates an analyzer that uses the default dictionary directory. */
    public MyMmsegAnalyzer() {
        this(DEFAULT_DICTIONARY_PATH);
    }

    /**
     * Creates an analyzer that uses the dictionary found at the given path.
     *
     * @param dictionaryPath path of the mmseg4j dictionary directory; must not be {@code null}
     * @throws IllegalArgumentException if {@code dictionaryPath} is {@code null}
     */
    public MyMmsegAnalyzer(String dictionaryPath) {
        if (dictionaryPath == null) {
            throw new IllegalArgumentException("dictionaryPath must not be null");
        }
        this.dictionaryPath = dictionaryPath;
    }

    /**
     * Builds the token stream: MMSegTokenizer (MaxWordSeg) wrapped by MyMmsegFilter.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    source of the text to tokenize
     * @return the filtered token stream
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        Dictionary dic = Dictionary.getInstance(dictionaryPath);
        return new MyMmsegFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
    }
}
package com.lkt.lucene;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
import com.lkt.analyzer.MyMmsegAnalyzer;
import com.lkt.analyzer.MyStopAnalyzer;
import com.lkt.util.AnalyzerUtil;

/**
 * Indexes one document with {@link MyMmsegAnalyzer} into an in-memory
 * directory and searches it for a term that only appears as a synonym.
 */
public class TestAnalyzerUtil {

    /**
     * Indexes a sentence of city names, then runs a TermQuery for "首都" on
     * the "content" field and prints the stored content of every hit.
     *
     * @throws IOException if indexing or searching fails; propagated so the
     *                     test fails instead of passing silently
     */
    @Test
    public void testDisplayAnalyzer() throws IOException {
        String str = "北京上海南京江苏南京北京中国重庆天津";
        //new AnalyzerUtil().displayAnalyzer(str,new StandardAnalyzer(Version.LUCENE_35) );
        //new AnalyzerUtil().displayAnalyzer(str,new StopAnalyzer(Version.LUCENE_35) );
        //new AnalyzerUtil().displayAnalyzer(str,new SimpleAnalyzer(Version.LUCENE_35) );
        //new AnalyzerUtil().displayAnalyzer(str,new WhitespaceAnalyzer(Version.LUCENE_35) );
        //new AnalyzerUtil().displayAnalyzer(str,new MyStopAnalyzer(new String[]{"my","dog"}) );
        //new AnalyzerUtil().displayAnalyzer(str,new MyMmsegAnalyzer());

        Directory dir = new RAMDirectory();

        IndexWriter writer = new IndexWriter(dir,
                new IndexWriterConfig(Version.LUCENE_35, new MyMmsegAnalyzer()));
        try {
            Document doc = new Document();
            doc.add(new Field("content", str, Store.YES, Index.ANALYZED));
            writer.addDocument(doc);
        } finally {
            // Always close the writer, even if addDocument throws.
            writer.close();
        }

        IndexReader reader = IndexReader.open(dir);
        try {
            IndexSearcher sercher = new IndexSearcher(reader);
            try {
                TermQuery tq = new TermQuery(new Term("content", "首都"));
                TopDocs td = sercher.search(tq, 10);
                for (ScoreDoc sd : td.scoreDocs) {
                    Document dd = sercher.doc(sd.doc);
                    System.out.println(dd.get("content"));
                }
            } finally {
                sercher.close();
            }
        } finally {
            // The reader is closed explicitly in addition to the searcher.
            reader.close();
        }
    }
}
0 0
- Lucene 同义词搜索
- lucene+ikanalyzer实现中文同义词搜索
- Lucene 同义词
- lucene-同义词分析器
- Lucene同义词(一)
- Lucene自定义同义词分词器
- lucene构建同义词分词器
- elasticSearch 同义词搜索
- lucene(三) lucene搜索
- lucene搜索
- lucene 搜索
- Lucene搜索
- lucene 搜索
- Lucene 搜索
- lucene搜索
- Lucene--搜索
- lucene搜索
- lucene--同义词简单的实现方式
- EXCEL文本转数值方法---我找的好苦啊
- 常用的正则表达式
- C++primer plus第六版课后编程题答案17.1
- 访问https 绕过证书验证的方法
- 黑马程序员_Java基础[4]_if、switch、while
- Lucene 同义词搜索
- JS随机生成密码
- Linux下进程间传递描述符
- [MOC062066]背景建模资料收集整理
- 链表初识之功能不完善的链表
- 单机运行环境搭建之 --CentOS-6.4安装MySQL 5.6.10并修改MySQL的root用户密码
- 深入浅出 Javascript API(一)--基本框架
- ubuntu 锐捷认证 rjsupplicant.sh 无法运行启动脚本
- 黑马程序员_Java基础[4]_函数\方法