Lucene 的多字段搜索、分页以及高亮显示

来源：互联网 | 发布：谷歌程序员常用插件 | 编辑：程序博客网 | 时间：2024/05/22 00:47

package mutilSearch;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class MutilLucene {
// lucene 的多字段搜索
public static void main(String args[]) throws CorruptIndexException,
   LockObtainFailedException, IOException, ParseException {
  try {
   MutilLucene.doSearch("校内伙一举一动往往前", 1, 100);
  } catch (Exception ex) {
   // TODO
  }
}
@SuppressWarnings("deprecation")
// 把Warning去掉
public static void doSearch(String keyword, int page, int pageSize)
   throws Exception {
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);

  // Store the index in memory:
  Directory directory = new RAMDirectory();
  // To store an index on disk, use this instead:
  // Directory directory = FSDirectory.open("/tmp/testindex");
  IndexWriter iwriter = new IndexWriter(directory, analyzer, true,
    new IndexWriter.MaxFieldLength(25000));
  Post post = new Post();
  post.setPostId(10001);
  post.setTitle("校内八大俗");
  post.setContent("有网友总结校内网类网站的八大俗：生活常识化妆技，十二星座小秘密；不看后悔成功录，论文大全雅思题。恋爱金句传送门，男默女泪蛋疼文；读到哪句心痛了？不顶不是中国人。相关日志推荐不成熟男人的十个标志中国大学排行榜2010版金融危机十项注意2008十大网络公敌将从我们生活里 ");
  iwriter.addDocument(MutilLucene.buildDocument(post));

  post.setPostId(10002);
  post.setTitle("天使的眼神：一个摄影家镜头中的孩子（上）");
  post.setContent("校内：之前已经有很多关于“天使”的文章了，这些小家伙一举一动往往最能打动我们。今天又收集了一些“天使的眼神”与大家分享，那清澈的眼神是否会直达你的心底，让你的心也一片清澈？　　另外，由于图片数量较多，就作两期发布，希望大家喜欢……");
  iwriter.addDocument(MutilLucene.buildDocument(post));

  post.setPostId(10003);
  post.setTitle("冷组总是能出这么伟大的冷笑话");
  post.setContent("鹅李卡|蘑菇蘑菇分享我的某位友邻说：据说大地震前有三个明显征兆： 1.井水异常；2.牲畜反应异常；3.专家出来辟谣。但是细心网友指出，第二条和第三条重复了。然后底下有人回应说：可能是喝了异常的井水。。。其实专家的嘴就像屁股一样,有图有真相!!!!");

iwriter.addDocument(MutilLucene.buildDocument(post));
iwriter.close();

  // Now search the index:
  IndexSearcher isearcher = new IndexSearcher(directory, true); // read-only=true
  // Parse a simple query that searches for "text":
  QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_CURRENT,
    new String[] { "title", "content" }, analyzer);
  Query query = parser.parse(keyword);
  Sort sort = new Sort(); // 排序
  // "postId":排序的字段 SortField.INT：排序方式 false:降序排列 true：升序排列
  sort.setSort(new SortField("postId", SortField.INT, true));
  // 相似度最高的是 1000
  ScoreDoc[] hits = isearcher.search(query, null, 1000, sort).scoreDocs;

  // 关键字高亮显示
  Formatter formatter = new SimpleHTMLFormatter("<font color=/"red/">",
    "</font>"); // 前缀和后缀
  Scorer scorer = new QueryScorer(query);
  Highlighter highlighter = new Highlighter(formatter, scorer);
  highlighter.setTextFragmenter(new SimpleFragmenter(200)); // 字长度
  // 高亮结束
  System.out.println("length="+hits.length);

  //分页测试
  List list = MutilLucene.processHits(hits, isearcher, 1, 1);
   Map map =new HashMap();
   for (int i=0; i<list.size(); i++){
    map=(Map) list.get(i);
    System.out.println(map.get("postId")+map.get("title").toString());
   }
  System.out.println("size="+list.size());


  // Iterate through the results:
  for (int i = 0; i < hits.length; i++) {
   Document hitDoc = isearcher.doc(hits[i].doc);
   System.out.println(hitDoc.get("postId") + ":" + hitDoc.get("title"));
   // 高亮显示关键字中的内容
//   try {
//    String title = highlighter.getBestFragment(analyzer, "title",hitDoc.get("title"));
//    // String content = highlighter.getBestFragment(analyzer,
//    // "content", hitDoc.get("content"));
//    System.out.println("title=" + title);
//    // System.out.println("content"+content);
//   } catch (InvalidTokenOffsetsException e) {
//    e.printStackTrace();
//   }
  }
  isearcher.close();
  directory.close();
}

private static Document buildDocument(Post post) {
  Document doc = new Document();
  doc.add(new Field("postId", String.valueOf(post.getPostId()),
    Field.Store.YES, Field.Index.NOT_ANALYZED));
  doc.add(new Field("title", String.valueOf(post.getTitle()),
    Field.Store.YES, Field.Index.ANALYZED));
  doc.add(new Field("content", String.valueOf(post.getContent()),
    Field.Store.YES, Field.Index.ANALYZED));
  return doc;
}

// lucence 分页方法 Hits中保存的并不是真正的Document 将所查到的内容重新装入 list
private static List processHits(ScoreDoc[] hits,IndexSearcher isearcher, int startIndex, int endIndex)
   throws Exception {
  if (endIndex >= hits.length)
   endIndex = hits.length - 1;
  List docs = new ArrayList();
  for (int i = startIndex; i <= endIndex; i++) {
   Document document = isearcher.doc(hits[i].doc);
   Map docMap = new HashMap();
   docMap.put("postId", document.getField("postId").stringValue());
   docMap.put("title", document.getField("title").stringValue());
   docMap.put("content", document.getField("content").stringValue());
   docs.add(docMap);
  }
  return docs;
}

}