LuceneCrawler.java

package com.jeecms.jspgou.lucene;


import java.io.IOException;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.List;


import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import com.jeecms.common.page.Pagination;




/**
 * Crawler data bean used for full-text indexing and search.
 * 
 * @author liufang
 * 
 */
public class LuceneCrawler {
    private static final Logger log = LoggerFactory
            .getLogger(LuceneCrawler.class);


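    /**
     * Builds the combined search query: a full-text clause on the content
     * field (INDEXLIB), an optional clause on the source identifier, and an
     * optional inclusive date range on INFODATE.
     *
     * @param queryString full-text query string
     * @param shrink      source identifier; skipped when null or empty
     * @param startTime   lower bound of the info-date range, may be null
     * @param endTime     upper bound of the info-date range, may be null
     * @param analyzer    analyzer used to parse the query strings
     * @return a BooleanQuery combining all active clauses with MUST
     */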
    public static Query createQuery(String queryString, String shrink,
            String startTime, String endTime, Analyzer analyzer)
            throws ParseException {
        BooleanQuery typeNegativeSearch = new BooleanQuery();
        QueryParser queryParser = new QueryParser(Version.LUCENE_35,
                LuceneCrawler.INDEXLIB, analyzer);
        queryParser.setDefaultOperator(QueryParser.OR_OPERATOR);
        Query query = queryParser.parse(queryString);
        if (shrink != null && !shrink.equals("")) {
            QueryParser queryParser1 = new QueryParser(Version.LUCENE_35,
                    LuceneCrawler.SOURCEPATH_SHRINK, analyzer);
            // Bug fix: the original set the operator on queryParser, which had
            // already parsed its input and is not used again.
            queryParser1.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query1 = queryParser1.parse(shrink);
            typeNegativeSearch.add(query1, Occur.MUST);
        }
        if (startTime != null || endTime != null) {
            TermRangeQuery rangeQuery = new TermRangeQuery(
                    LuceneCrawler.INFODATE, startTime, endTime, true, true);
            typeNegativeSearch.add(rangeQuery, Occur.MUST);
        }
        typeNegativeSearch.add(query, Occur.MUST);
        return typeNegativeSearch;
    }


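    /**
     * Extracts one page of hits from the collector and maps each stored
     * Document back to a LuceneCrawler bean.
     */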
    public static Pagination getResult(IndexReader reader,
            TopScoreDocCollector res, int pageNo, int pageSize)
            throws CorruptIndexException, IOException {
        List<LuceneCrawler> list = new ArrayList<LuceneCrawler>(pageSize);
        TopDocs tds = res.topDocs((pageNo - 1) * pageSize, pageSize);
        ScoreDoc[] sd = tds.scoreDocs;
        for (int i = 0; i < sd.length; i++) {
            Document hitDoc = reader.document(sd[i].doc);
            list.add(createDoc(hitDoc));
        }
        return new Pagination(pageNo, pageSize, res.getTotalHits(), list);
    }
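
    /**
     * Pages through already-sorted TopDocs, clamping the page window to the
     * total hit count.
     */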
    public static Pagination getResultSort(TopDocs topDocs,
            IndexSearcher searcher, TopScoreDocCollector res, int pageNo,
            int pageSize) throws CorruptIndexException, IOException {
        int tempHit = topDocs.totalHits;
        int count = pageSize * pageNo; // total number of records to fetch
        if (count > tempHit) {
            count = tempHit;
        }
        List<LuceneCrawler> list = new ArrayList<LuceneCrawler>(pageSize);
        ScoreDoc[] docHits = topDocs.scoreDocs;
        for (int i = (pageNo - 1) * pageSize; i < count; i++) {
            Document hitDoc = searcher.doc(docHits[i].doc);
            list.add(createDoc(hitDoc));
        }
        return new Pagination(pageNo, pageSize, tempHit, list);
    }
    /**
     * Builds a Lucene Document from a crawler bean.
     * 
     * @param p crawler data
     * @return the indexable Document
     */
    public static Document createDocument(LuceneCrawler p) {
        Document doc = new Document();
        if (!StringUtils.isBlank(p.getUrlpath())) {
            doc.add(new Field(URLPATH, p.getUrlpath(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS)); // URL, not tokenized
        }
        if (!StringUtils.isBlank(p.getUrlHashCode())) {
            doc.add(new Field(URLHASHCODE, p.getUrlHashCode(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS)); // URL hash, not tokenized
        }
        if (!StringUtils.isBlank(p.getSourcepath())) {
            doc.add(new Field(SOURCEPATH, p.getSourcepath(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS)); // source
        }
        if (!StringUtils.isBlank(p.getSourcepathShrink())) {
            doc.add(new Field(SOURCEPATH_SHRINK, p.getSourcepathShrink(),
                    Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); // source identifier
        }
        if (!StringUtils.isBlank(p.getLocalPath())) {
            doc.add(new Field(LOCALPATH, p.getLocalPath(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS)); // local file path
        }
        if (!StringUtils.isBlank(p.getLocalFileName())) {
            doc.add(new Field(LOCALFILENAME, p.getLocalFileName(),
                    Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); // local file name
        }
        if (!StringUtils.isBlank(p.getTitle())) {
            doc.add(new Field(TITLE, p.getTitle(), Field.Store.YES,
                    Field.Index.ANALYZED)); // title, tokenized
        }
        doc.add(new Field(CRAWLERDATE, DateTools.dateToString(
                p.getCrawlerdate(), DateTools.Resolution.MILLISECOND),
                Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        doc.add(new Field(INFODATE, DateTools.dateToString(
                p.getInfodate(), DateTools.Resolution.MILLISECOND),
                Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        if (!StringUtils.isBlank(p.getKeyWord())) {
            doc.add(new Field(KEYWORD, p.getKeyWord(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS)); // keyword, not tokenized
        }
        if (!StringUtils.isBlank(p.getIndexlib())) {
            doc.add(new Field(INDEXLIB, p.getIndexlib(), Field.Store.YES,
                    Field.Index.ANALYZED)); // content, tokenized
        }
        return doc;
    }


    /**
     * Converts a stored Lucene Document back into a crawler bean.
     * 
     * @param d the stored Document
     * @return the reconstructed LuceneCrawler
     */
    public static LuceneCrawler createDoc(Document d) {
        LuceneCrawler p = new LuceneCrawler();
        p.setIndexlib(d.get(INDEXLIB));
        p.setUrlpath(d.get(URLPATH));
        p.setSourcepath(d.get(SOURCEPATH));
        try {
            // Bug fix: the original read the INFODATE field here, although
            // the crawl date is stored under CRAWLERDATE.
            p.setCrawlerdate(new Timestamp(DateTools.stringToDate(
                    d.get(CRAWLERDATE)).getTime()));
        } catch (java.text.ParseException e) {
            log.error("failed to parse crawler date", e);
        }
        p.setCrawlerDateS(d.get(CRAWLERDATE));
        p.setLocalPath(d.get(LOCALPATH));
        p.setLocalFileName(d.get(LOCALFILENAME));
        p.setTitle(d.get(TITLE));
        return p;
    }
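
    // Lucene index field names and default multi-field query settings.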
    public static final String TYPE = "type";
    public static final String TYPEV = "D";
    public static final String SID = "sid";
    public static final String ID = "id";
    public static final String INDEXLIB = "indexlib";
    public static final String URLPATH = "urlpath";
    public static final String LOCALPATH = "localpath";
    public static final String SOURCEPATH = "sourcepath";
    public static final String SOURCEPATH_SHRINK = "sourcepathShrink";
    public static final String CRAWLERDATE = "crawlerdate";
    public static final String LOCALFILENAME = "localFileName";
    public static final String TITLE = "title";
    public static final String KEYWORD = "keyWord";
    public static final String URLHASHCODE = "urlHashCode";
    public static final String INFODATE = "infodate";

    public static final String[] QUERY_FIELD = { INDEXLIB, URLPATH };
    public static final BooleanClause.Occur[] QUERY_FLAGS = {
            BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
    /**
     * Index library (analyzed content)
     */
    private String indexlib;
    /**
     * Web URL
     */
    private String urlpath;
    /**
     * Information source
     */
    private String sourcepath;
    /**
     * Information source identifier
     */
    private String sourcepathShrink;
    /**
     * Local file path
     */
    private String localPath;
    /**
     * Local file name
     */
    private String localFileName;
    /**
     * Crawl date
     */
    private java.sql.Timestamp crawlerdate;
    /**
     * Crawl date as string
     */
    private String crawlerDateS;
    /**
     * Title
     */
    private String title;
    /**
     * URL hash code
     */
    private String urlHashCode;
    /**
     * Keyword
     */
    private String keyWord;
    /**
     * Information date
     */
    private java.sql.Timestamp infodate;


    public String getIndexlib() {
        return indexlib;
    }

    public void setIndexlib(String indexlib) {
        this.indexlib = indexlib;
    }

    public String getUrlpath() {
        return urlpath;
    }

    public void setUrlpath(String urlpath) {
        this.urlpath = urlpath;
    }

    public String getSourcepath() {
        return sourcepath;
    }

    public void setSourcepath(String sourcepath) {
        this.sourcepath = sourcepath;
    }

    public java.sql.Timestamp getCrawlerdate() {
        return crawlerdate;
    }

    public void setCrawlerdate(java.sql.Timestamp crawlerdate) {
        this.crawlerdate = crawlerdate;
    }

    public String getLocalPath() {
        return localPath;
    }

    public void setLocalPath(String localPath) {
        this.localPath = localPath;
    }

    public String getLocalFileName() {
        return localFileName;
    }

    public void setLocalFileName(String localFileName) {
        this.localFileName = localFileName;
    }

    public String getCrawlerDateS() {
        return crawlerDateS;
    }

    public void setCrawlerDateS(String crawlerDateS) {
        this.crawlerDateS = crawlerDateS;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getSourcepathShrink() {
        return sourcepathShrink;
    }

    public void setSourcepathShrink(String sourcepathShrink) {
        this.sourcepathShrink = sourcepathShrink;
    }

    public String getKeyWord() {
        return keyWord;
    }

    public void setKeyWord(String keyWord) {
        this.keyWord = keyWord;
    }

    public String getUrlHashCode() {
        return urlHashCode;
    }

    public void setUrlHashCode(String urlHashCode) {
        this.urlHashCode = urlHashCode;
    }

    public java.sql.Timestamp getInfodate() {
        return infodate;
    }

    public void setInfodate(java.sql.Timestamp infodate) {
        this.infodate = infodate;
    }
}
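
For reference, here is a minimal usage sketch (not part of the original class) that indexes one crawled record and then runs a paginated search through createQuery and getResult. The index path is a hypothetical placeholder, StandardAnalyzer stands in for whatever analyzer the project actually wires in, and the standard Lucene 3.5 imports (FSDirectory, StandardAnalyzer, IndexWriter, IndexWriterConfig, and the classes imported above) are assumed.

    // Index one record, then fetch page 1 with 10 hits per page.
    Directory dir = FSDirectory.open(new File("/tmp/crawler-index")); // hypothetical path
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35); // assumed analyzer

    IndexWriter writer = new IndexWriter(dir,
            new IndexWriterConfig(Version.LUCENE_35, analyzer));
    LuceneCrawler item = new LuceneCrawler();
    item.setUrlpath("http://example.com/page.html");
    item.setTitle("Example page");
    item.setIndexlib("example page body text");
    // createDocument requires both dates to be non-null.
    item.setCrawlerdate(new java.sql.Timestamp(System.currentTimeMillis()));
    item.setInfodate(new java.sql.Timestamp(System.currentTimeMillis()));
    writer.addDocument(LuceneCrawler.createDocument(item));
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    // No source filter and no date range, so only the full-text clause applies.
    Query q = LuceneCrawler.createQuery("example", null, null, null, analyzer);
    TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
    searcher.search(q, collector);
    Pagination page = LuceneCrawler.getResult(reader, collector, 1, 10);
    searcher.close();
    reader.close();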