Nutch1.2标题关键字高亮的正确方法

来源：互联网发布：16年总决赛数据编辑：程序博客网时间：2024/06/01 18:43

最近在弄Nutch1.2，想实现关键字高亮，却发现国内网站上给出的标题关键字高亮方法都是错的。最终在nutch.apache.org网站上找到了相近的代码，修改之后终于成功实现。
关键字的高亮需要自己再创建一个分词器,关键的类是TokenStream,lucene3.0以上需要用到TermAttribute。

一、内容关键字高亮很简单，修改include/style.html即可：

 .highlight {
 color:#FF0000;
 }

二、标题关键字高亮的方法：

 我们从内容关键字高亮的方法可以得到启发：

 首先来看这一句：

 String summary = summaries[i].toHtml(true);

 这里调用的是org.apache.nutch.searcher.Summary类的toHtml方法：
 public String toHtml(boolean encode) {...}

 这是标题的获取方法

 String title = detail.getValue("title");

 我们可不可以也像summary一样调用呢？答案是肯定的。但是，nutch本身并未提供标题关键字高亮的方法，这里需要我们自己编写类和方法。

 新建Titler.java

package org.apache.nutch.searcher;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.searcher.Summary.Fragment;

public class Titler implements Configurable {

 private int maxLength = 40;
 private Analyzer analyzer = null;
 private Configuration conf = null;

 public Titler(Configuration conf) {
 setConf(conf);
 }

 public Configuration getConf() {
 return conf;
 }

 public void setConf(Configuration conf) {
 this.conf = conf;
 this.analyzer = new NutchDocumentAnalyzer(conf);
 this.maxLength = conf.getInt("searcher.title.maxlength", 40);
 }

 public Summary getSummary(String text, Query query) {
 Token[] tokens = getTokens(text); // parse text to token array

 if (tokens.length == 0)
 return new Summary();

 String[] terms = query.getTerms();
 HashSet highlight = new HashSet(); // put query terms in table
 for (int i = 0; i < terms.length; i++)
 highlight.add(terms[i]);

 Summary s = new Summary();

 for (int i = 0; i < tokens.length && i < maxLength; i++) {
 Token token = tokens[i];
 //
 // If we find a term that's in the query...
 //
 if (highlight.contains(token.term())) {
 s.add(new Highlight(token.term()));
 }else{
 s.add(new Fragment(token.term()));
 }
 }
 return s;
 }

 /** A highlighted fragment of text within a summary. */
 public static class Highlight extends Fragment {
 /** Constructs a highlighted fragment for the given text. */
 public Highlight(String text) {
 super(text);
 }
 /** Returns true. */
 public boolean isHighlight() {
 return true;
 }
 }

 private Token[] getTokens(String text) {
 ArrayList result = new ArrayList();
 TokenStream ts = analyzer.tokenStream("title", new StringReader(text));

 TermAttribute termAtt = (TermAttribute) ts
 .getAttribute(TermAttribute.class);
 TypeAttribute typeAtt = (TypeAttribute) ts
 .getAttribute(TypeAttribute.class);

 try {
 while (ts.incrementToken()) {
 Token token = new Token();
 token.setTermBuffer(termAtt.term());
 result.add(token);
 }
 } catch (IOException e) {
 e.printStackTrace();
 }
 return (Token[]) result.toArray(new Token[result.size()]);
 }
}

 然后在NutchBean.java，添加

 private Titler titler;

 /**
  * Builds the title summary for a hit by handing the hit's stored "title"
  * field and the query to the {@code titler}.
  *
  * @param hit   details of the hit whose title is wanted
  * @param query the query that produced the hit
  * @return the summary returned by {@code titler.getSummary}
  * @throws IOException declared for callers; not thrown by the visible code itself
  */
 public Summary getTitle(HitDetails hit, Query query) throws IOException {
   final String storedTitle = hit.getValue("title");
   return titler.getSummary(storedTitle, query);
 }

 public NutchBean(Configuration conf, Path dir) throws IOException {
 ...
 this.titler = new Titler(conf);
 }

我这里测试的JSP页面是新建s.jsp.如果要在原来的search.jsp页面内调用，需要修改相应的代码。

<%@ page
session="false"
contentType="text/html; charset=UTF-8"
pageEncoding="UTF-8"

import="java.io.*"
import="java.util.*"
import="java.net.*"
import="javax.servlet.http.*"
import="javax.servlet.*"

import="org.apache.nutch.html.Entities"
import="org.apache.nutch.metadata.Nutch"
import="org.apache.nutch.searcher.*"
import="org.apache.nutch.plugin.*"
import="org.apache.nutch.clustering.*"
import="org.apache.hadoop.conf.*"
import="org.apache.nutch.util.NutchConfiguration"
%>
<jsp:include page="/show/include/style.html"/>
<%
 // Demo page: runs one fixed query against a local crawl directory and prints,
 // for each of the first (at most 10) hits, the highlighted title, the
 // highlighted summary and the URL.

 String keyword = "贵阳pep艺术中心― 中心简介";
 String crawl = "/home/961a/workspace/test/Nutch1.2Web/ROOT/crawl";
 String summarylength = "120";

 final Configuration conf = NutchConfiguration.create();
 conf.set("searcher.dir", crawl);
 conf.set("searcher.summary.length", summarylength);

 final NutchBean bean = new NutchBean(conf);
 try {
   final Query query = Query.parse(keyword, conf);
   query.getParams().setMaxHitsPerDup(0);
   final Hits hits = bean.search(query);

   // keyword is a constant of this page; it must be HTML-encoded before being
   // echoed if it is ever taken from the request.
   out.println("Total hits: " + hits.getTotal() + " keyword:" + keyword + "; ");

   final int length = (int) Math.min(hits.getLength(), 10);
   final Hit[] show = hits.getHits(0, length);
   final HitDetails[] details = bean.getDetails(show);
   final Summary[] summaries = bean.getSummary(details, query);

   // details/summaries were fetched for "length" hits only, so iterate over
   // that count rather than over every hit returned by the search.
   for (int i = 0; i < length; i++) {
     String url = Entities.encode(details[i].getValue("url"));
     String title = bean.getTitle(details[i], query).toHtml(true);
     String summary = summaries[i].toHtml(true);
 %>
 
 <a href="<%=url%>"><%=title%></a>
 
 <%=summary%>
 
 <%=url%>
 
 <%
   }
 } catch (Throwable t) {
   // Keep the page best-effort, but record the failure instead of hiding it.
   application.log("s.jsp: search for '" + keyword + "' failed", t);
 } finally {
   // A NutchBean is created for every request of this page, so release it.
   try {
     bean.close();
   } catch (IOException e) {
     application.log("s.jsp: failed to close NutchBean", e);
   }
 }%>