近义词搜索
来源:互联网 发布:win7 apache php 编辑:程序博客网 时间:2024/04/28 17:18
“西语”是“西班牙语”的简称,当我搜“西语”时,我希望搜索结果也包含“西班牙语”。
所以我要为分词器,加上一层过滤器,用于处理近义词。
分词的结果用TokenStream表示,一个TokenStream包含一串Token,每个Token表示一个分词,包含词的内容,在句子中的位置等。
近义词过滤器,要实现的是将同义词加入到TokenStream中,并且和原词是相同的位置。要实现的结果如下所示:
/**
 * Demo entry point: tokenizes a sample phrase with {@code SynonymAnalyzer} and prints each
 * token grouped by position. A token whose position increment is 0 (a synonym injected by
 * the filter) is printed under the same position number as the token before it.
 *
 * @param args ignored
 * @throws IOException if the token stream fails while being consumed or closed
 */
public static void main(String[] args) throws IOException {
    Analyzer analyzer = new SynonymAnalyzer();
    try {
        TokenStream stream = analyzer.tokenStream("", "西班牙语学习");
        try {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
            int position = 0;

            stream.reset();
            while (stream.incrementToken()) {
                int increment = posIncr.getPositionIncrement();
                // Only a positive increment starts a new position; synonyms carry 0.
                if (increment > 0) {
                    position = position + increment;
                    System.out.println(position + ": ");
                }
                System.out.println("[" + termAtt.toString() + "]");
            }
            // Lucene's consumer workflow requires end() after the last incrementToken().
            stream.end();
        } finally {
            // Release the stream even when tokenization throws.
            stream.close();
        }
    } finally {
        analyzer.close();
    }
}
结果为:
1:
[西班牙语]
[西语]
2:
[学习]
实现代码如下:
package com.analyzer.test;

import java.io.IOException;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Token filter that injects synonyms into a token stream.
 *
 * <p>Every token from the wrapped stream is passed through unchanged. If the
 * {@link SynonymEngine} knows synonyms for that token, each synonym is then emitted as an
 * extra token with a position increment of 0, i.e. at the same position as the original.
 */
public class SynonymFilter extends TokenFilter {
    /** Token type label for synonyms; not referenced inside this class. */
    public static final String TOKEN_TYPE_SYNONYM = "SYNONYM";

    /** Synonyms of the most recently read token that are still waiting to be emitted. */
    private final Stack<String> synonymStack;
    /** Source of synonyms. */
    private final SynonymEngine engine;
    /** Attribute snapshot of the original token, restored before emitting each synonym. */
    private AttributeSource.State current;
    /** Term text attribute. */
    private final CharTermAttribute termAtt;
    /** Position increment attribute. */
    private final PositionIncrementAttribute posIncrAtt;

    /**
     * @param input  the token stream to decorate
     * @param engine provides the synonyms for each term
     */
    protected SynonymFilter(TokenStream input, SynonymEngine engine) {
        super(input);
        this.synonymStack = new Stack<String>();
        this.engine = engine;
        this.termAtt = addAttribute(CharTermAttribute.class);
        this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Emits a pending synonym if there is one, otherwise advances the wrapped stream.
     *
     * <p>Declared {@code final} because Lucene requires a TokenStream implementation class,
     * or at least its {@code incrementToken()}, to be final (checked by an assertion in the
     * TokenStream constructor).
     *
     * @return {@code true} if a token was produced, {@code false} at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!synonymStack.isEmpty()) {
            String syn = synonymStack.pop();
            // Start from the original token's attributes, then overwrite only the term
            // text and the position increment.
            restoreState(current);
            termAtt.setEmpty();
            termAtt.append(syn);
            posIncrAtt.setPositionIncrement(0);
            return true;
        }

        if (!input.incrementToken()) {
            return false;
        }

        // Snapshot the original token only when synonyms will actually follow it.
        if (addAliasesToStack()) {
            current = captureState();
        }
        return true;
    }

    /**
     * Drops pending synonyms and the captured state so that a stream that was abandoned
     * part-way does not leak tokens into the next input when the filter is reused.
     *
     * @throws IOException if resetting the wrapped stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        synonymStack.clear();
        current = null;
    }

    /**
     * Looks up the current term and queues every synonym that differs from the term itself.
     *
     * @return {@code true} if at least one synonym was queued
     */
    private boolean addAliasesToStack() {
        String term = termAtt.toString();
        String[] synonyms = engine.getSynonyms(term);
        if (synonyms == null) {
            return false;
        }
        boolean queued = false;
        for (String synonym : synonyms) {
            if (!term.equals(synonym)) {
                synonymStack.push(synonym);
                queued = true;
            }
        }
        return queued;
    }
}
package com.analyzer.test;public interface SynonymEngine { String[] getSynonyms(String s);}
package com.analyzer.test;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.lucene.IKTokenizer;

/**
 * Analyzer that segments text with {@link IKTokenizer} and then runs the tokens through
 * {@link SynonymFilter}, backed by a {@link TestSynonymEngine}.
 */
public class SynonymAnalyzer extends Analyzer {

    /**
     * Builds the tokenizer-plus-filter chain for one field.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    the text to tokenize
     * @return components whose source is the IK tokenizer and whose sink is the synonym filter
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // NOTE(review): the 'true' flag is presumably IK's smart-segmentation switch — confirm.
        Tokenizer source = new IKTokenizer(reader, true);
        SynonymEngine engine = new TestSynonymEngine();
        TokenStream sink = new SynonymFilter(source, engine);
        return new TokenStreamComponents(source, sink);
    }
}
package com.analyzer.test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.HashMap;

/**
 * {@link SynonymEngine} backed by the classpath resource {@code synonym.txt}.
 *
 * <p>Each non-blank line of the resource is one synonym group: words separated by
 * whitespace. Every word of a group maps to the whole group (including itself).
 * The dictionary is loaded once, when the class is initialized.
 */
public class TestSynonymEngine implements SynonymEngine {
    /** Classpath name of the synonym dictionary. */
    private static final String RESOURCE_NAME = "synonym.txt";

    /** Word -> its full synonym group. The same array instance is shared by all group members. */
    private static final HashMap<String, String[]> map = new HashMap<String, String[]>();

    static {
        loadSynonyms();
    }

    /**
     * Reads the dictionary resource into {@link #map}. Loading is best-effort: a missing
     * resource or a read error is reported on stderr and leaves the dictionary (partially)
     * empty instead of failing class initialization.
     */
    private static void loadSynonyms() {
        // The context class loader may be null in some environments; fall back to our own.
        ClassLoader loader = Thread.currentThread().getContextClassLoader();
        if (loader == null) {
            loader = TestSynonymEngine.class.getClassLoader();
        }
        InputStream in = loader == null
                ? ClassLoader.getSystemResourceAsStream(RESOURCE_NAME)
                : loader.getResourceAsStream(RESOURCE_NAME);
        if (in == null) {
            System.err.println("Synonym resource not found on classpath: " + RESOURCE_NAME);
            return;
        }

        // Decode with a fixed charset rather than the platform default, which varies per
        // machine. Assumes synonym.txt is saved as UTF-8 — TODO confirm.
        BufferedReader br = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.length() == 0) {
                    continue;
                }
                // Split on runs of whitespace so repeated spaces do not create "" entries.
                String[] words = trimmed.split("\\s+");
                for (String word : words) {
                    map.put(word, words);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                br.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * @param s the word to look up
     * @return the synonym group containing {@code s} (including {@code s} itself), or
     *         {@code null} if the word is not in the dictionary
     */
    @Override
    public String[] getSynonyms(String s) {
        return map.get(s);
    }
}
package com.pyc.search.searchservice.lucene;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.analyzer.test.SynonymAnalyzer;

/**
 * Splits text into terms using {@link SynonymAnalyzer}, so the result contains both the
 * segmented words and any synonyms the analyzer injects.
 */
public class SynonymSegmenter {
    private static final Logger logger = Logger.getLogger(SynonymSegmenter.class);

    /**
     * Tokenizes {@code input} and returns the term text of every token, in stream order.
     *
     * @param input the text to segment
     * @return the terms produced by the analyzer, or {@code null} if tokenization failed
     *         with an {@link IOException} (the error is logged)
     */
    public String[] analysis(String input) {
        Analyzer analyzer = new SynonymAnalyzer();
        try {
            TokenStream stream = analyzer.tokenStream("", input);
            try {
                CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
                List<String> terms = new ArrayList<String>();

                stream.reset();
                while (stream.incrementToken()) {
                    terms.add(termAtt.toString());
                }
                // Lucene's consumer workflow requires end() after the last incrementToken().
                stream.end();
                return terms.toArray(new String[0]);
            } finally {
                // Release the stream even when tokenization throws.
                stream.close();
            }
        } catch (IOException e) {
            // Pass the exception as the throwable so the stack trace is logged too.
            logger.error("Failed to tokenize input", e);
            return null;
        } finally {
            analyzer.close();
        }
    }
}
如有错误,欢迎指正。
参考:
Lucene in Action(第二版)4.5节 Synonyms, aliases, and words that mean the same 131
0 0
- 近义词搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- 搜索
- Android开发中应用版本更新功能
- 成绩评定程序C++——方便——快捷————好用
- 【三】CC2541 Central 与 Peripheral 主从机实验 小结
- Java Object 之hashCode
- python3中split分割字符串出现错误 TypeError‘str’ does not support the buffer interface
- 近义词搜索
- MySQL的增删改查
- JDBC数据库连接 1)通过Driver连接数据库
- tiny-dnn import caffe's model
- Java虚拟机类加载
- Linux学习——基础命令总结(1)
- 套接字的I/O模型(一)
- geoserver 源码编译问题
- 开源中国源码学习UI篇(一)之FragmentTabHost的使用分析