lucene--同音词

来源:互联网 发布:大数据利用的过程是 编辑:程序博客网 时间:2024/05/02 00:41


如果 Lucene 要在查询时匹配同音词,就必须在创建索引的时候引入相应的编码算法,这必然会损耗索引性能,往往得不偿失。

/**
 * 2013.06.06
 *
 * Phonetic ("sounds-like") matching experiment: a TokenFilter that replaces
 * each token's text with its Metaphone encoding, so words that sound alike
 * index to the same term.
 *
 * @author Zhao Hongzhi (赵洪志)
 */
package com.zhao.lucene.analysis.codec;

import java.io.IOException;

import org.apache.commons.codec.language.Metaphone;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class MetaphoneReplacementFilter extends TokenFilter {

    /** Token type assigned to every emitted (phonetically encoded) token. */
    public static final String METAPHONE = "metaphone";

    // Single encoder instance reused for every token in the stream.
    private final Metaphone metaphoner = new Metaphone();
    private final TermAttribute termAttr;
    private final TypeAttribute typeAttr;

    public MetaphoneReplacementFilter(TokenStream input) {
        super(input);
        termAttr = addAttribute(TermAttribute.class);
        typeAttr = addAttribute(TypeAttribute.class);
    }

    /**
     * Pulls the next token from the wrapped stream and overwrites its text
     * with the Metaphone phonetic code, tagging the token type as
     * {@link #METAPHONE}.
     *
     * @return {@code false} once the underlying stream is exhausted
     * @throws IOException propagated from the wrapped stream
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) { // #A no more tokens upstream
            return false;              // #A
        }
        String encoded = metaphoner.encode(termAttr.term()); // #B phonetic code of the raw term
        termAttr.setTermBuffer(encoded);                     // #C replace token text with the code
        typeAttr.setType(METAPHONE);                         // #D mark token as encoded
        return true;
    }
}

/**
 * 2013.06.06
 *
 * Analyzer for the phonetic-matching experiment: splits input into runs of
 * letters, then replaces each token with its Metaphone encoding via
 * {@link MetaphoneReplacementFilter}.
 *
 * @author Zhao Hongzhi (赵洪志)
 */
package com.zhao.lucene.analysis.codec;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.TokenStream;

public class MetaphoneReplacementAnalyzer extends Analyzer {

    /**
     * Builds the analysis chain: a LetterTokenizer feeding the
     * Metaphone replacement filter.
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream letterTokens = new LetterTokenizer(reader);
        return new MetaphoneReplacementFilter(letterTokens);
    }
}

/**
 * 2013.06.06
 *
 * Homophone / phonetic-matching test for {@link MetaphoneReplacementAnalyzer}.
 *
 * @author Zhao Hongzhi (赵洪志)
 */
package com.zhao.lucene.analysis.codec;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import junit.framework.TestCase;

public class MetaphoneAnalyzerTest extends TestCase {

    /**
     * Indexes one document with the phonetic analyzer, searches with a
     * phonetically related query, and verifies the stored content comes back.
     */
    public void testKoolKat() throws Exception {
        RAMDirectory directory = new RAMDirectory();
        Analyzer analyzer = new MetaphoneReplacementAnalyzer();

        IndexWriter writer = new IndexWriter(directory, analyzer, true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            Document doc = new Document();
            doc.add(new Field("contents", // #A index document
                    "我爱中华人民共和国", Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        } finally {
            writer.close(); // always release the index writer lock
        }

        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            Query query = new QueryParser(Version.LUCENE_30, // #B parse query text
                    "contents", analyzer)                    // #B
                    .parse("爱人");                           // #B
            TopDocs hits = searcher.search(query, 1);

            // Fail loudly when nothing matched: a query with zero hits must
            // not let the test pass silently.
            assertEquals(1, hits.totalHits);                 // #C verify match
            Document found = searcher.doc(hits.scoreDocs[0].doc);
            assertEquals("我爱中华人民共和国", found.get("contents")); // #D retrieve original value
            System.out.println(found.get("contents"));
        } finally {
            searcher.close();
        }
    }

    /*
     * #A Index document  #B Parse query text  #C Verify match  #D Retrieve
     * original value
     */

    /** Prints the Metaphone-encoded tokens produced for a sample sentence. */
    public static void main(String[] args) throws IOException {
        MetaphoneReplacementAnalyzer analyzer = new MetaphoneReplacementAnalyzer();
        TokenStream stream = analyzer.tokenStream("content", new StringReader(
                "The quick brown fox jumped over the lazy dog"));
        TermAttribute attribute = stream.addAttribute(TermAttribute.class);
        try {
            stream.reset(); // TokenStream contract: reset before first incrementToken
            while (stream.incrementToken()) {
                System.out.print(attribute.term() + "===");
            }
            stream.end();
        } finally {
            stream.close(); // release analysis resources
        }
        System.out.println();
    }
}