14-使用自定义的高亮标签和搜索title和content中包含搜索关键字的内容

来源:互联网 发布:相似度矩阵 聚类算法 编辑:程序博客网 时间:2024/06/05 22:56

TestIndex.java

package org.lucene.test;

import java.io.File;

import org.junit.Test;
import org.lucene.util.FileIndexUtil;
import org.lucene.util.IndexUtil;
import org.lucene.util.SearcherUtil;

/**
 * JUnit driver for the Lucene/Tika indexing and highlighting utilities.
 *
 * <p>Each test simply invokes one utility entry point; results are printed to
 * stdout rather than asserted (these are demo tests, not regression tests).
 */
public class TestIndex {

    /** Builds the basic single-file index via {@link IndexUtil#index()}. */
    @Test
    public void testIndex() {
        new IndexUtil().index();
    }

    /** Parses a .doc file with the Tika AutoDetectParser path and prints the text. */
    @Test
    public void testTika01() {
        IndexUtil util = new IndexUtil();
        System.out.println(util.fileToTxt(new File("D:/lucene/example2/职位JD.doc")));
    }

    /** Parses the same .doc file through the simpler Tika facade. */
    @Test
    public void testTika02() {
        IndexUtil util = new IndexUtil();
        System.out.println(util.tikaTool(new File("D:/lucene/example2/职位JD.doc")));
    }

    /** Rebuilds the directory index from scratch using Tika extraction. */
    @Test
    public void testIndex03() {
        FileIndexUtil.index(true);
    }

    /** Runs a plain TermQuery search over the Tika-built index. */
    @Test
    public void testSearcher01() {
        new SearcherUtil().searcher01();
    }

    /** Exercises the basic highlighter demo on an in-memory string. */
    @Test
    public void testLighter01() {
        new SearcherUtil().lighter01();
    }

    /** Searches title+content and highlights the matched keyword with custom tags. */
    @Test
    public void testLighter02() {
        new SearcherUtil().searcherByHighlighter("content:职位");
    }
}

FileIndexUtil.java

package org.lucene.util;import java.io.File;import java.io.FileInputStream;import java.io.IOException;import org.apache.commons.io.FilenameUtils;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.NumericField;import org.apache.lucene.index.CorruptIndexException;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;import org.apache.tika.Tika;import org.apache.tika.metadata.Metadata;import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;public class FileIndexUtil {private static Directory directory = null;static {try {directory = FSDirectory.open(new File("d:/lucene/files"));} catch (Exception e) {e.printStackTrace();}}public static Directory getDirectory() {return directory;}/** *@MethodName:generatorDocument *@Description:获取文件的页数 *@param f *@return *@author:半仙儿 *@return Document * @throws IOException *@date:2015-4-21下午02:05:48 */public static Document generatorDocument(File f) throws IOException {Document doc = new Document();Metadata metadata = new Metadata();doc.add(new Field("content", new Tika().parse(new FileInputStream(f),metadata)));doc.add(new Field("title", FilenameUtils.getBaseName(f.getName()),Field.Store.YES, Field.Index.ANALYZED));doc.add(new Field("filename", f.getName(), Field.Store.YES,Field.Index.NOT_ANALYZED));// 类型doc.add(new Field("type", FilenameUtils.getExtension(f.getName()),Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));int page = 0;doc.add(new Field("path", f.getAbsolutePath(), Field.Store.YES,Field.Index.NOT_ANALYZED));try {page = Integer.parseInt(metadata.get("xmpTPg:NPage"));} catch (Exception e) {}// 存储页码doc.add(new NumericField("page", Field.Store.YES, true).setIntValue(page));doc.add(new NumericField("date", Field.Store.YES, 
true).setLongValue(f.lastModified()));doc.add(new NumericField("size", Field.Store.YES, true).setIntValue((int) f.length() / 1024));return doc;}/** *@MethodName:index *@Description:创建索引 *@param hasNew是否要新建索引 *@author:半仙儿 *@return void *@date:2015-4-15下午04:05:04 */public static void index(boolean hasNew) {IndexWriter writer = null;try {writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new MMSegAnalyzer()));if (hasNew) {writer.deleteAll();}File file = new File("d:/lucene/example2");Document doc = null;for (File f : file.listFiles()) {doc = generatorDocument(f);// 通过tika直接存储doc.add(new Field("content", new Tika().parse(f)));doc.add(new Field("title", FilenameUtils.getBaseName(f.getName()), Field.Store.YES, Field.Index.ANALYZED));doc.add(new Field("filename", f.getName(), Field.Store.YES,Field.Index.NOT_ANALYZED));// 类型doc.add(new Field("type", FilenameUtils.getExtension(f.getName()), Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));doc.add(new Field("path", f.getAbsolutePath(), Field.Store.YES,Field.Index.NOT_ANALYZED));doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(f.lastModified()));doc.add(new NumericField("size", Field.Store.YES, true).setIntValue((int) f.length() / 1024));writer.addDocument(doc);}} catch (Exception e) {e.printStackTrace();} finally {try {if (writer != null)writer.close();} catch (CorruptIndexException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}}}}

IndexUtil.java

package org.lucene.util;import java.io.File;import java.io.FileInputStream;import java.io.FileReader;import java.io.IOException;import java.io.InputStream;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;import org.apache.tika.Tika;import org.apache.tika.exception.TikaException;import org.apache.tika.metadata.Metadata;import org.apache.tika.parser.AutoDetectParser;import org.apache.tika.parser.ParseContext;import org.apache.tika.parser.Parser;import org.apache.tika.sax.BodyContentHandler;import org.xml.sax.ContentHandler;import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;public class IndexUtil {/** *  *@MethodName:index *@Description:创建索引 *@author:半仙儿 *@return void *@date:2015-4-21上午11:36:54 */public void index() {try {File f = new File("D:/lucene/example2/职位JD.doc");Directory dir = FSDirectory.open(new File("d:/lucene/file2"));IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_35, new MMSegAnalyzer()));writer.deleteAll();Document doc = new Document();doc.add(new Field("content", new FileReader(f)));writer.addDocument(doc);writer.close();} catch (Exception e) {e.printStackTrace();}}/** *@MethodName:fileToTxt *@Description:使用tika进行doc文件的解析 *@param f *@return *@author:半仙儿 *@return String *@date:2015-4-21下午01:08:32 */public String fileToTxt(File f) {Parser parser = new AutoDetectParser();InputStream is = null;try {Metadata metadata = new Metadata();metadata.set(Metadata.AUTHOR, "空号");is = new FileInputStream(f);ContentHandler handler = new BodyContentHandler();ParseContext context = new ParseContext();context.set(Parser.class, parser);parser.parse(is, handler, metadata, context);for (String name : metadata.names()) {System.out.println(name + ":" + metadata.get(name));}return handler.toString();} 
catch (Exception e) {e.printStackTrace();} finally {if (is != null)try {is.close();} catch (IOException e) {e.printStackTrace();}}return null;}/** *@MethodName:tikaTool *@Description:封装工具类 *@param f *@return *@author:半仙儿 *@return String *@date:2015-4-21下午01:09:27 */public String tikaTool(File f) {Tika tika=new Tika();try {return tika.parseToString(f);} catch (IOException e) {e.printStackTrace();} catch (TikaException e) {e.printStackTrace();}return null;}}

SearcherUtil.java

package org.lucene.util;import java.io.File;import java.io.IOException;import org.apache.commons.io.FileUtils;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.document.Document;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.Term;import org.apache.lucene.queryParser.MultiFieldQueryParser;import org.apache.lucene.queryParser.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TermQuery;import org.apache.lucene.search.TopDocs;import org.apache.lucene.search.highlight.Formatter;import org.apache.lucene.search.highlight.Fragmenter;import org.apache.lucene.search.highlight.Highlighter;import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;import org.apache.lucene.search.highlight.QueryScorer;import org.apache.lucene.search.highlight.SimpleHTMLFormatter;import org.apache.lucene.search.highlight.SimpleSpanFragmenter;import org.apache.lucene.util.Version;import org.apache.tika.Tika;import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;public class SearcherUtil {public void searcher01() {try {IndexSearcher searcher = new IndexSearcher(IndexReader.open(FileIndexUtil.getDirectory()));TermQuery query = new TermQuery(new Term("content", "强"));TopDocs tds = searcher.search(query, 20);for (ScoreDoc sd : tds.scoreDocs) {Document doc = searcher.doc(sd.doc);System.out.println(doc.get("title"));}} catch (Exception e) {e.printStackTrace();}}/** *@MethodName:searcherByHighlighter *@Description:高亮搜索2 *@param name *@author:半仙儿 *@return void *@date:2015-4-22上午09:57:53 */public void searcherByHighlighter(String name) {try {Analyzer a = new MMSegAnalyzer();IndexSearcher searcher = new IndexSearcher(IndexReader.open(FileIndexUtil.getDirectory()));// QueryParser parser = new QueryParser(Version.LUCENE_35, "title",// a);MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35, new String[] { "title", 
"content" }, a);Query query = parser.parse(name);TopDocs tds = searcher.search(query, 20);for (ScoreDoc sd : tds.scoreDocs) {Document doc = searcher.doc(sd.doc);String title = doc.get("title");title = ligterStr(a, query, title, "title");System.out.println("标题--->"+title);System.out.println("**************************************************************");String content = new Tika().parseToString(new File(doc.get("path")));content = ligterStr(a, query, content, "content");System.out.println("内容--->"+content);System.out.println("--------------------------------------------------------------");}searcher.close();} catch (Exception e) {}}private String ligterStr(Analyzer a, Query query, String txt,String fieldname) throws IOException, InvalidTokenOffsetsException {String str = null;QueryScorer scorer = new QueryScorer(query);Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);Formatter fmt = new SimpleHTMLFormatter("<b>", "</b>");Highlighter lighter = new Highlighter(fmt, scorer);lighter.setTextFragmenter(fragmenter);str = lighter.getBestFragment(a, fieldname, txt);if (str == null)return txt;return str;}/** *@MethodName:lighter01 *@Description:高亮基础 *@author:半仙儿 *@return void *@date:2015-4-21下午04:48:49 */public void lighter01() {try {String txt = "我爱北京天安门,天安门上彩旗飞,伟大领袖毛主席,指引我们向前进";// 只加粗北京// TermQuery query = new TermQuery(new Term("f", "北京"));// 加粗所有北京和伟大Query query = new QueryParser(Version.LUCENE_35, "f",new MMSegAnalyzer()).parse("北京 伟大");QueryScorer scorer = new QueryScorer(query);Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);// 要想使用自定义的标签Formatter formatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");Highlighter highlighter = new Highlighter(formatter, scorer);highlighter.setTextFragmenter(fragmenter);String str = highlighter.getBestFragment(new MMSegAnalyzer(), "f",txt);System.out.println(str);} catch (Exception e) {e.printStackTrace();}}}




0 0
原创粉丝点击