Lucene 的多字段搜索、分页及高亮显示

来源:互联网 发布:谷歌程序员常用插件 编辑:程序博客网 时间:2024/05/22 00:47

package mutilSearch;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class MutilLucene {
 // lucene 的 多字段搜索
 public static void main(String args[]) throws CorruptIndexException,
   LockObtainFailedException, IOException, ParseException {
  try {
   MutilLucene.doSearch("校内 伙一举一动往往 前", 1, 100);
  } catch (Exception ex) {
   // TODO
  }
 }
 @SuppressWarnings("deprecation")
 // 把Warning去掉
 public static void doSearch(String keyword, int page, int pageSize)
   throws Exception {
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);

  // Store the index in memory:
  Directory directory = new RAMDirectory();
  // To store an index on disk, use this instead:
  // Directory directory = FSDirectory.open("/tmp/testindex");
  IndexWriter iwriter = new IndexWriter(directory, analyzer, true,
    new IndexWriter.MaxFieldLength(25000));
  Post post = new Post();
  post.setPostId(10001);
  post.setTitle("校内八大俗");
  post.setContent("有网友总结校内网类网站的八大俗: 生活常识化妆技,十二星座小秘密; 不看后悔成功录,论文大全雅思题。 恋爱金句传送门,男默女泪蛋疼文; 读到哪句心痛了?不顶不是中国人。 相关日志推荐不成熟男人的十个标志中国大学排行榜2010版金融危机十项注意2008十大网络公敌将从我们生活里 ");
  iwriter.addDocument(MutilLucene.buildDocument(post));

  post.setPostId(10002);
  post.setTitle("天使的眼神:一个摄影家镜头中的孩子(上)");
  post.setContent("校内:之前已经有很多关于“天使”的文章了,这些小家伙一举一动往往最能打动我们。今天又收集了一些“天使的眼神”与大家分享,那清澈的眼神是否会直达你的心底,让你的心也一片清澈?   另外,由于图片数量较多,就作两期发布,希望大家喜欢……");
  iwriter.addDocument(MutilLucene.buildDocument(post));

  post.setPostId(10003);
  post.setTitle("冷组总是能出这么伟大的冷笑话");
  post.setContent("鹅李卡|蘑菇蘑菇分享 我的某位友邻说:据说大地震前有三个明显征兆: 1.井水异常;2.牲畜反应异常;3.专家出来辟谣。 但是细心网友指出,第二条和第三条重复了。 然后底下有人回应说:可能是喝了异常的井水。。。 其实专家的嘴就像屁股一样,有图有真相!!!!");

  iwriter.addDocument(MutilLucene.buildDocument(post));
  iwriter.close();

  // Now search the index:
  IndexSearcher isearcher = new IndexSearcher(directory, true); // read-only=true
  // Parse a simple query that searches for "text":
  QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_CURRENT,
    new String[] { "title", "content" }, analyzer);
  Query query = parser.parse(keyword);
  Sort sort = new Sort(); // 排序
  // "postId":排序的字段 SortField.INT:排序方式 false:降序排列 true:升序排列
  sort.setSort(new SortField("postId", SortField.INT, true));
  // 相似度最高的是 1000
  ScoreDoc[] hits = isearcher.search(query, null, 1000, sort).scoreDocs;

  // 关键字高亮显示
  Formatter formatter = new SimpleHTMLFormatter("<font color=/"red/">",
    "</font>"); // 前缀和后缀
  Scorer scorer = new QueryScorer(query);
  Highlighter highlighter = new Highlighter(formatter, scorer);
  highlighter.setTextFragmenter(new SimpleFragmenter(200)); // 字长度
  // 高亮结束
  System.out.println("length="+hits.length);
  
  //分页测试
  List list = MutilLucene.processHits(hits, isearcher, 1, 1);
   Map map  =new HashMap();
   for (int i=0; i<list.size(); i++){
    map=(Map) list.get(i);
    System.out.println(map.get("postId")+map.get("title").toString());
   }
  System.out.println("size="+list.size());
  
  
  // Iterate through the results:
  for (int i = 0; i < hits.length; i++) {
   Document hitDoc = isearcher.doc(hits[i].doc);
   System.out.println(hitDoc.get("postId") + ":" + hitDoc.get("title"));
   // 高亮显示 关键字 中的内容
//   try {
//    String title = highlighter.getBestFragment(analyzer, "title",hitDoc.get("title"));
//    // String content = highlighter.getBestFragment(analyzer,
//    // "content", hitDoc.get("content"));
//    System.out.println("title=" + title);
//    // System.out.println("content"+content);
//   } catch (InvalidTokenOffsetsException e) {
//    e.printStackTrace();
//   }
  }
  isearcher.close();
  directory.close();
 }

 private static Document buildDocument(Post post) {
  Document doc = new Document();
  doc.add(new Field("postId", String.valueOf(post.getPostId()),
    Field.Store.YES, Field.Index.NOT_ANALYZED));
  doc.add(new Field("title", String.valueOf(post.getTitle()),
    Field.Store.YES, Field.Index.ANALYZED));
  doc.add(new Field("content", String.valueOf(post.getContent()),
    Field.Store.YES, Field.Index.ANALYZED));
  return doc;
 }

 // lucence 分页 方法  Hits中保存的并不是真正的Document  将所查到的内容重新装入 list
 private static  List processHits(ScoreDoc[] hits,IndexSearcher isearcher, int startIndex, int endIndex)
   throws Exception {
  if (endIndex >= hits.length)
   endIndex = hits.length - 1;
  List docs = new ArrayList();
  for (int i = startIndex; i <= endIndex; i++) {
   Document document = isearcher.doc(hits[i].doc);
   Map docMap = new HashMap();
   docMap.put("postId", document.getField("postId").stringValue());
   docMap.put("title", document.getField("title").stringValue());
   docMap.put("content", document.getField("content").stringValue());
   docs.add(docMap);
  }
  return docs;
 }

}

原创粉丝点击