lucene整合中文分词器mmseg4j和高亮highlighter

来源:互联网 发布:淘宝里api什么意思 编辑:程序博客网 时间:2024/06/05 10:46

最近在研究lucene,其实很简单,可是在整合中文分词器mmseg4j时,总是会报一些异常,这主要是版本兼容问题,在此做一个记录。

环境:

lucene:4.3.1

mmseg4j:1.9.1

主要jar包,如下图:

因为我只需要mmseg4j的分词器,所以不需要引入solr相关的jar包

直接上代码:

package com.chenlb.mmseg4j.example;import java.io.File;import java.io.StringReader;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.TextField;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.IndexWriterConfig.OpenMode;import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;import org.apache.lucene.queryparser.classic.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TopScoreDocCollector;import org.apache.lucene.search.highlight.Highlighter;import org.apache.lucene.search.highlight.QueryScorer;import org.apache.lucene.search.highlight.SimpleFragmenter;import org.apache.lucene.search.highlight.SimpleHTMLFormatter;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;import com.chenlb.mmseg4j.analysis.ComplexAnalyzer;public class Test {private static final String INDEXPATH = "D:\\index";private static Analyzer analyzer = new ComplexAnalyzer();public static void main(String[] args) {try {indexCreate();search();} catch (Exception e) {e.printStackTrace();}}public static void indexCreate() throws Exception {// 建立索引对象Directory directory = FSDirectory.open(new File(INDEXPATH));IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_43,analyzer);iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);IndexWriter writer = new IndexWriter(directory, iwConfig);String content = "京华时报2008年1月23日报道 昨天,受一股来自中西伯利亚的强冷空气影响,本市出现大风降温天气,白天最高气温只有零下7摄氏度,同时伴有6到7级的偏北风。";Document doc = new Document();TextField textField = new TextField("title", content, 
Field.Store.YES);doc.add(textField);writer.addDocument(doc);writer.close();}public static void search() throws Exception {File indexDir = new File(INDEXPATH);// 索引目录Directory dir = FSDirectory.open(indexDir);// 根据索引目录创建读索引对象IndexReader reader = DirectoryReader.open(dir);// 搜索对象创建IndexSearcher searcher = new IndexSearcher(reader);// 创建查询解析对象QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_43,new String[] { "title" }, analyzer);parser.setDefaultOperator(QueryParser.AND_OPERATOR);String word = "中西伯利亚 ";// 根据域和目标搜索文本创建查询器Query query = parser.parse(word);System.out.println("搜索关键词: " + query.toString(word));// 对结果进行相似度打分排序TopScoreDocCollector collector = TopScoreDocCollector.create(5 * 10,true);searcher.search(query, collector);// 获取结果ScoreDoc[] hits = collector.topDocs().scoreDocs;int numTotalHits = collector.getTotalHits();System.out.println("一共匹配" + numTotalHits + "个网页");// 设置高亮显示格式SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'><strong>", "</strong></font>");/* 语法高亮显示设置 */Highlighter highlighter = new Highlighter(simpleHTMLFormatter,new QueryScorer(query));highlighter.setTextFragmenter(new SimpleFragmenter(100));// 显示搜索结果for (int i = 0; i < hits.length; i++) {Document doc = searcher.doc(hits[i].doc);String title = doc.get("title");TokenStream titleTokenStream = analyzer.tokenStream(title,new StringReader(title));String highLightTitle = highlighter.getBestFragment(titleTokenStream, title);System.out.println((i + 1) + "." + title);System.out.println(highLightTitle);}}}

运行结果:

这就完成了.....

0 0
原创粉丝点击