Lucene synonym search with exact extraction of custom keywords (Lucene 5.3.0)


This post targets Lucene 5.3.0. If you are on Lucene 3.x, see http://write.blog.csdn.net/postedit/78291868 instead (that version only extracts keywords and does not cover synonym search).


This article covers two features:

1. Exact extraction of custom keywords

2. Synonym search and extraction


Enough preamble; on to the code.


First, define the synonym analyzer class:

package com.daelly.sample.lucene.analyzer.synonyms;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.wltea.analyzer.lucene.IKTokenizer;

public class SynonymsAnalyzer extends Analyzer {

    private final String synonymsPath;

    public SynonymsAnalyzer(String synonymsPath) {
        if (synonymsPath == null || synonymsPath.isEmpty()) {
            throw new IllegalArgumentException("synonymsPath must be provided!");
        }
        this.synonymsPath = synonymsPath;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        SynonymFilterFactory factory = null;
        try {
            factory = getSynonymFilterFactory();
        } catch (IOException e) {
            e.printStackTrace();
        }
        // IKTokenizer(true) enables IK's smart (coarse-grained) segmentation
        Tokenizer tokenizer = new IKTokenizer(true);
        if (factory != null) {
            // Wrap the tokenizer with a SynonymFilter so synonyms are injected into the token stream
            TokenStream tokenStream = factory.create(tokenizer);
            return new TokenStreamComponents(tokenizer, tokenStream);
        }
        return new TokenStreamComponents(tokenizer);
    }

    private SynonymFilterFactory getSynonymFilterFactory() throws IOException {
        if (synonymsPath.contains("classpath:")) {
            // Load the synonyms file from the classpath
            String path = synonymsPath.replace("classpath:", "");
            Map<String, String> args = new HashMap<>();
            args.put("synonyms", path);
            SynonymFilterFactory factory = new SynonymFilterFactory(args);
            factory.inform(new ClasspathResourceLoader());
            return factory;
        }
        // Otherwise treat the path as a file-system location: split it into
        // a base directory and a file name for the FilesystemResourceLoader
        int index = synonymsPath.lastIndexOf(File.separator);
        String dir = synonymsPath.substring(0, index);
        String name = synonymsPath.substring(index + 1);
        Map<String, String> args = new HashMap<>();
        args.put("synonyms", name);
        SynonymFilterFactory factory = new SynonymFilterFactory(args);
        Path baseDirectory = Paths.get(dir);
        FilesystemResourceLoader loader = new FilesystemResourceLoader(baseDirectory);
        factory.inform(loader);
        return factory;
    }
}
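Before wiring the analyzer into an index, you can sanity-check the synonym expansion with a quick token dump. This is a minimal sketch, not part of the original post; the "classpath:synonyms.txt" path and the sample text are placeholders you would swap for your own:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SynonymsAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        // "classpath:synonyms.txt" is a placeholder; point it at your own synonyms file
        SynonymsAnalyzer analyzer = new SynonymsAnalyzer("classpath:synonyms.txt");
        TokenStream ts = analyzer.tokenStream("content", new StringReader("你好世界"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // Synonyms show up as additional tokens at the same position
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
        analyzer.close();
    }
}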


Next, the indexing class. INDEXDIR is the index directory, DATADIR is the path of the file to index, and ACTIONDIR is the path of the synonyms file:

package com.apache.luence;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.daelly.sample.lucene.analyzer.synonyms.SynonymsAnalyzer;

public class AddIndex {

    private static final String INDEXDIR = "D:\\TestSolr\\Index\\Test";
    private static final String DATADIR = "D:\\TestSolr\\src\\resource\\node.dic";
    private static final String ACTIONDIR = "D:\\TestSolr\\src\\resource\\data\\action.txt";

    public AddIndex() {
        try {
            Directory directory = FSDirectory.open(Paths.get(INDEXDIR));
            // Index with the synonym-aware analyzer so synonyms are expanded at index time
            IndexWriterConfig config = new IndexWriterConfig(new SynonymsAnalyzer(ACTIONDIR));
            IndexWriter iwriter = new IndexWriter(directory, config);
            File files = new File(DATADIR);
            List<String> contents = this.getContent(files);
            // One document per line of the data file
            for (String content : contents) {
                Document doc = new Document();
                doc.add(new TextField("content", content, Field.Store.YES));
                iwriter.addDocument(doc);
            }
            iwriter.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Reads the data file line by line as UTF-8
    private List<String> getContent(File files) {
        List<String> strList = new ArrayList<String>();
        try {
            InputStream stream = new FileInputStream(files);
            BufferedReader br = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            String str = br.readLine();
            while (str != null) {
                strList.add(str);
                str = br.readLine();
            }
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return strList;
    }

    public static void main(String[] args) {
        new AddIndex();
    }
}
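For reference, DATADIR points at a plain text file that is read line by line, one keyword per line. The real node.dic only appears as a screenshot in the original post; judging from the test further down, a hypothetical version might look like this (illustrative guesses, not the actual file):

这是
一整个
关键词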


The search class:
package com.apache.luence;

import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Reads documents from the index by field.
 */
public class IKUtil {

    private static final String INDEXDIR = "D:\\TestSolr\\Index\\Test";

    /**
     * Returns true if the given keyword matches at least one indexed document.
     */
    private boolean search(String keyword) {
        boolean flag = false;
        Directory directory = null;
        DirectoryReader ireader = null;
        TopDocs hits = null;
        try {
            directory = FSDirectory.open(Paths.get(INDEXDIR));
            ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);
            // A plain TermQuery suffices here: synonyms were already expanded at index time
            TermQuery query = new TermQuery(new Term("content", keyword));
            hits = isearcher.search(query, 10);
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (hits != null && hits.totalHits > 0) {
            flag = true;
        }
        try {
            if (ireader != null) ireader.close();
            if (directory != null) directory.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return flag;
    }

    /**
     * Extracts from the input text the tokens that exist in the index,
     * i.e. the custom keywords.
     */
    public String[] getKeyWords(String sInput) {
        List<String> result = new ArrayList<String>();
        Map<String, String> map = new HashMap<String, String>();
        int i = 0;
        try {
            IKAnalyzer analyzer = new IKAnalyzer(true);
            TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(sInput));
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                // Keep only tokens that hit the index, deduplicated
                if (this.search(term.toString()) && !map.containsValue(term.toString())) {
                    map.put("key" + i, term.toString());
                    i++;
                }
            }
            tokenStream.end();
        } catch (Exception e) {
            e.printStackTrace();
        }
        for (int j = 0; j < map.size(); j++) {
            result.add(map.get("key" + j));
        }
        return result.toArray(new String[result.size()]);
    }

    /**
     * Returns the indexed entries that match the given term via the synonym list.
     */
    public List<String> getSynonyms(String src) {
        List<String> results = new ArrayList<String>();
        try {
            Term term = new Term("content", src);
            Query query = new TermQuery(term);
            Directory directory = FSDirectory.open(Paths.get(INDEXDIR));
            IndexReader reader = DirectoryReader.open(directory);
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs docs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                String synonyms = doc.get("content");
                // Skip empty entries (the original guard used && and could never fire)
                if (synonyms == null || synonyms.isEmpty()) {
                    continue;
                }
                results.add(synonyms);
            }
            reader.close();
            directory.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return results;
    }

    public static void main(String[] args) {
        String input = "这是一整个关键词哈哈";
        String[] results = new IKUtil().getKeyWords(input);
        // List<String> results = new IKUtil().getSynonyms(input);
        for (String result : results) {
            System.out.println(result);
        }
    }
}

First, let's test the exact keyword extraction feature.

The index file (see figure):

The run output (see figure):



As the figure shows, "哈哈" was filtered out: it is not in the indexed keyword dictionary, so search() finds no hit for it and getKeyWords drops it.


Now let's test the synonym feature.

The synonyms file (see figure):


Note that the file needs to end with a blank line (honestly, I don't know why...), and it must be saved as UTF-8.
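For context, SynonymFilterFactory parses the Solr synonyms format by default: comma-separated groups of equivalent terms, one group per line (the format also supports one-way mappings with =>). The real action.txt is only shown as a screenshot in the post; a hypothetical file consistent with the "hello" test below might be:

你好,hello,哈喽

followed by the trailing blank line mentioned above. The entries are illustrative, not the actual file contents.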

Let's verify, still using the index built earlier.


As the figure shows, if you recall the index file, it contains no keyword "hello", yet "hello" is still recognized. This works because SynonymsAnalyzer expands synonyms at index time: the index holds "hello" as a synonym token even though the source file never mentions it.

Finally, let's test the synonym lookup feature.

For this, swap which call in main is commented out: comment out the getKeyWords line and uncomment the getSynonyms line below it, as shown in the snippet that follows. Running it gives the result in the figure.
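Concretely, the swap in main amounts to this:

    String input = "这是一整个关键词哈哈";
    // String[] results = new IKUtil().getKeyWords(input);    // feature 1: keyword extraction
    List<String> results = new IKUtil().getSynonyms(input);   // feature 2: synonym lookup
    for (String result : results) {
        System.out.println(result);
    }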



bingo

From here, you can adapt the code to your own needs.


