基于Lucene的同义词分词器

来源：互联网发布：windows安装caffe 编辑：程序博客网时间：2024/06/05 19:00

package org.lucene.util;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

/**
 * Synonym-aware analyzer: tokenizes text with the MMSeg4j tokenizer and then
 * passes the tokens through {@link MySameTokenFilter}, which adds the
 * synonyms supplied by a {@link SamewordContext}.
 */
public class MySameAnalyzer extends Analyzer {
    /** Synonym lookup handed to every filter this analyzer creates. */
    private final SamewordContext samewordContext;

    /**
     * @param swc synonym lookup used by the token filter
     */
    public MySameAnalyzer(SamewordContext swc) {
        this.samewordContext = swc;
    }

    // An earlier, disabled variant overrode tokenStream(String, Reader) and
    // loaded the MMSeg dictionary from a hard-coded local directory;
    // createComponents below is the implementation in use.

    /**
     * Builds the analysis chain: MMSeg tokenizer (max-word segmentation)
     * followed by the synonym filter.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    source of the text to tokenize
     * @return components whose source is the tokenizer and whose sink is the
     *         synonym filter wrapping it
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
            Reader reader) {
        // Tokenizer over the dictionary returned by Dictionary.getInstance().
        Tokenizer source = new MMSegTokenizer(
                new MaxWordSeg(Dictionary.getInstance()), reader);
        // Wrap the tokenizer with the custom synonym filter.
        TokenStream withSynonyms = new MySameTokenFilter(source, samewordContext);
        return new TokenStreamComponents(source, withSynonyms);
    }

}

package org.lucene.util;

import java.io.IOException;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Token filter that emits, after every token that has synonyms, one extra
 * token per synonym at the same position (position increment 0).
 *
 * <p>Synonyms are looked up through the supplied {@link SamewordContext}.
 * They are buffered on a stack, so they are emitted in the reverse of the
 * order in which the context returned them.
 */
public class MySameTokenFilter extends TokenFilter {
    private final CharTermAttribute cta;
    private final PositionIncrementAttribute pia;
    /** Captured attribute state of the token whose synonyms are pending. */
    private AttributeSource.State current;
    /** Synonyms of the most recent original token that are still to be emitted. */
    private final Stack<String> sames;
    private final SamewordContext samewordContext;

    /**
     * @param input           upstream token stream
     * @param samewordContext synonym lookup consulted for every upstream token
     */
    protected MySameTokenFilter(TokenStream input, SamewordContext samewordContext) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
        sames = new Stack<String>();
        this.samewordContext = samewordContext;
    }

    /**
     * Advances to the next token: a pending synonym if there is one,
     * otherwise the next token from the upstream stream.
     *
     * @return {@code true} if a token was produced, {@code false} once the
     *         upstream stream is exhausted and no synonyms are pending
     * @throws IOException if the upstream stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!sames.isEmpty()) {
            // Pop the next pending synonym.
            String str = sames.pop();
            // Restore the attributes of the original token, then replace
            // only its term text with the synonym.
            restoreState(current);
            cta.setEmpty();
            cta.append(str);
            // Increment 0 places the synonym at the original token's position.
            pia.setPositionIncrement(0);
            return true;
        }
        if (!this.input.incrementToken()) {
            return false;
        }
        if (addSames(cta.toString())) {
            // Synonyms exist: remember the current token's state so it can
            // be restored for each synonym emitted afterwards.
            current = captureState();
        }
        return true;
    }

    /**
     * Clears the pending synonyms and the captured state. Without this,
     * synonyms left over from a stream that was not consumed to the end
     * would be emitted at the start of the next stream when the filter is
     * reused.
     *
     * @throws IOException if resetting the upstream stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        sames.clear();
        current = null;
    }

    /**
     * Queues the synonyms of a word for emission.
     *
     * @param name the term text of the current token
     * @return {@code true} if the context returned a non-null synonym array
     *         for {@code name}, {@code false} otherwise
     */
    private boolean addSames(String name) {
        String[] sws = samewordContext.getSamewords(name);
        if (sws == null) {
            return false;
        }
        for (String str : sws) {
            sames.push(str);
        }
        return true;
    }

}

package org.lucene.util;

/**
 * Lookup that maps a word to its synonyms.
 */
public interface SamewordContext {
    /**
     * Returns the synonyms of a word.
     *
     * @param name the word to look up
     * @return the synonyms of {@code name}; the consumer in this file
     *         ({@code MySameTokenFilter}) treats {@code null} as "no synonyms"
     */
    String[] getSamewords(String name);
}

package org.lucene.util;

import java.util.HashMap;
import java.util.Map;

/**
 * In-memory {@link SamewordContext} backed by a small, fixed synonym table
 * that is filled in by the constructor.
 */
public class SimpleSamewordContext implements SamewordContext {
    /** Synonym table: word -> its synonyms. */
    Map<String, String[]> maps = new HashMap<String, String[]>();

    /** Populates the synonym table with the built-in entries. */
    public SimpleSamewordContext() {
        register("数据", "信息", "知识", "智慧");
        register("当今", "现在", "目前", "当前", "眼下");
        register("采用", "使用", "利用");
    }

    /** Stores {@code synonyms} as the synonym list of {@code word}. */
    private void register(String word, String... synonyms) {
        maps.put(word, synonyms);
    }

    /**
     * Returns the synonyms registered for a word.
     *
     * @param name the word to look up
     * @return the registered synonym array, or {@code null} if the word has
     *         no entry in the table
     */
    @Override
    public String[] getSamewords(String name) {
        String[] synonyms = maps.get(name);
        return synonyms;
    }

}

/**
 * Indexes one document with {@link MySameAnalyzer} and searches it for the
 * term "眼下". That term does not occur in the text; it is registered in
 * {@link SimpleSamewordContext} as a synonym of "当今", which does. A hit
 * therefore shows that the synonym filter indexed the synonym. Finally the
 * token stream produced for the text is printed.
 */
@Test
public void test05() {
    Analyzer a2 = new MySameAnalyzer(new SimpleSamewordContext());
    String txt = "系统建设采用代表当今云计算、大数据和互联网主流并成熟的技术进行架构设计，相应的软件开发和产品选型应充分考虑未来发展方向，同时保证平台在技术先进和可靠性。";
    Directory dir = new RAMDirectory();
    try {
        // try-with-resources: the writer is closed (and the index committed)
        // even if adding the document fails.
        try (IndexWriter writer = new IndexWriter(dir,
                new IndexWriterConfig(Version.LUCENE_4_9, a2))) {
            Document doc = new Document();
            doc.add(new TextField("content", txt, Field.Store.YES));
            writer.addDocument(doc);
        }

        // The reader is closed once the search is done.
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // TermQuery is not analyzed, so it matches only if the exact
            // term "眼下" was written to the index.
            Query query = new TermQuery(new Term("content", "眼下"));
            TopDocs tds = searcher.search(query, 10);
            System.out.println(tds.scoreDocs.length);
            if (tds.scoreDocs.length == 0) {
                // Fail with a clear message instead of an
                // ArrayIndexOutOfBoundsException on scoreDocs[0].
                throw new AssertionError("no document matched the synonym term 眼下");
            }
            Document d = searcher.doc(tds.scoreDocs[0].doc);
            System.out.println("content=" + d.get("content"));
        }

        AnalyzerUtils.displayAllTokenInfo(txt, a2);
    } catch (IOException e) {
        // CorruptIndexException and LockObtainFailedException are
        // IOExceptions too. Rethrow so the test fails instead of silently
        // passing after printing a stack trace.
        throw new IllegalStateException("test05 failed while indexing or searching", e);
    }
}

0 0