LuceneCrawler.java

package com.jeecms.jspgou.lucene;


import java.io.IOException;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.List;


import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import com.jeecms.common.page.Pagination;




/**
 * Crawler data bean used for full-text indexing and search.
 * 
 * @author liufang
 * 
 */
public class LuceneCrawler {
    private static final Logger log = LoggerFactory
            .getLogger(LuceneCrawler.class);


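    /**
     * Builds the combined search query: a full-text clause on the content
     * field (INDEXLIB), an optional clause on the source identifier, and an
     * optional inclusive date range on INFODATE.
     *
     * @param queryString full-text query string
     * @param shrink      source identifier; skipped when null or empty
     * @param startTime   lower bound of the info-date range, may be null
     * @param endTime     upper bound of the info-date range, may be null
     * @param analyzer    analyzer used to parse the query strings
     * @return a BooleanQuery combining all active clauses with MUST
     */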
    public static Query createQuery(String queryString, String shrink,
            String startTime, String endTime, Analyzer analyzer)
            throws ParseException {
        BooleanQuery typeNegativeSearch = new BooleanQuery();
        QueryParser queryParser = new QueryParser(Version.LUCENE_35,
                LuceneCrawler.INDEXLIB, analyzer);
        queryParser.setDefaultOperator(QueryParser.OR_OPERATOR);
        Query query = queryParser.parse(queryString);
        if (shrink != null && !shrink.equals("")) {
            QueryParser queryParser1 = new QueryParser(Version.LUCENE_35,
                    LuceneCrawler.SOURCEPATH_SHRINK, analyzer);
            // Bug fix: the original set the operator on queryParser, which had
            // already parsed its input and is not used again.
            queryParser1.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query1 = queryParser1.parse(shrink);
            typeNegativeSearch.add(query1, Occur.MUST);
        }
        if (startTime != null || endTime != null) {
            TermRangeQuery rangeQuery = new TermRangeQuery(
                    LuceneCrawler.INFODATE, startTime, endTime, true, true);
            typeNegativeSearch.add(rangeQuery, Occur.MUST);
        }
        typeNegativeSearch.add(query, Occur.MUST);
        return typeNegativeSearch;
    }


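    /**
     * Extracts one page of hits from the collector and maps each stored
     * Document back to a LuceneCrawler bean.
     */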
    public static Pagination getResult(IndexReader reader,
            TopScoreDocCollector res, int pageNo, int pageSize)
            throws CorruptIndexException, IOException {
        List<LuceneCrawler> list = new ArrayList<LuceneCrawler>(pageSize);
        TopDocs tds = res.topDocs((pageNo - 1) * pageSize, pageSize);
        ScoreDoc[] sd = tds.scoreDocs;
        for (int i = 0; i < sd.length; i++) {
            Document hitDoc = reader.document(sd[i].doc);
            list.add(createDoc(hitDoc));
        }
        return new Pagination(pageNo, pageSize, res.getTotalHits(), list);
    }
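
    /**
     * Pages through already-sorted TopDocs, clamping the page window to the
     * total hit count.
     */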
    public static Pagination getResultSort(TopDocs topDocs,
            IndexSearcher searcher, TopScoreDocCollector res, int pageNo,
            int pageSize) throws CorruptIndexException, IOException {
        int tempHit = topDocs.totalHits;
        int count = pageSize * pageNo; // total number of records to fetch
        if (count > tempHit) {
            count = tempHit;
        }
        List<LuceneCrawler> list = new ArrayList<LuceneCrawler>(pageSize);
        ScoreDoc[] docHits = topDocs.scoreDocs;
        for (int i = (pageNo - 1) * pageSize; i < count; i++) {
            Document hitDoc = searcher.doc(docHits[i].doc);
            list.add(createDoc(hitDoc));
        }
        return new Pagination(pageNo, pageSize, tempHit, list);
    }
    /**
     * Builds a Lucene Document from a crawler bean.
     * 
     * @param p crawler data
     * @return the indexable Document
     */
    public static Document createDocument(LuceneCrawler p) {
        Document doc = new Document();
        if (!StringUtils.isBlank(p.getUrlpath())) {
            doc.add(new Field(URLPATH, p.getUrlpath(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS)); // URL, not tokenized
        }
        if (!StringUtils.isBlank(p.getUrlHashCode())) {
            doc.add(new Field(URLHASHCODE, p.getUrlHashCode(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS)); // URL hash, not tokenized
        }
        if (!StringUtils.isBlank(p.getSourcepath())) {
            doc.add(new Field(SOURCEPATH, p.getSourcepath(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS)); // source
        }
        if (!StringUtils.isBlank(p.getSourcepathShrink())) {
            doc.add(new Field(SOURCEPATH_SHRINK, p.getSourcepathShrink(),
                    Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); // source identifier
        }
        if (!StringUtils.isBlank(p.getLocalPath())) {
            doc.add(new Field(LOCALPATH, p.getLocalPath(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS)); // local file path
        }
        if (!StringUtils.isBlank(p.getLocalFileName())) {
            doc.add(new Field(LOCALFILENAME, p.getLocalFileName(),
                    Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); // local file name
        }
        if (!StringUtils.isBlank(p.getTitle())) {
            doc.add(new Field(TITLE, p.getTitle(), Field.Store.YES,
                    Field.Index.ANALYZED)); // title, tokenized
        }
        doc.add(new Field(CRAWLERDATE, DateTools.dateToString(
                p.getCrawlerdate(), DateTools.Resolution.MILLISECOND),
                Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        doc.add(new Field(INFODATE, DateTools.dateToString(
                p.getInfodate(), DateTools.Resolution.MILLISECOND),
                Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        if (!StringUtils.isBlank(p.getKeyWord())) {
            doc.add(new Field(KEYWORD, p.getKeyWord(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS)); // keyword, not tokenized
        }
        if (!StringUtils.isBlank(p.getIndexlib())) {
            doc.add(new Field(INDEXLIB, p.getIndexlib(), Field.Store.YES,
                    Field.Index.ANALYZED)); // content, tokenized
        }
        return doc;
    }


    /**
     * Converts a stored Lucene Document back into a crawler bean.
     * 
     * @param d the stored Document
     * @return the reconstructed LuceneCrawler
     */
    public static LuceneCrawler createDoc(Document d) {
        LuceneCrawler p = new LuceneCrawler();
        p.setIndexlib(d.get(INDEXLIB));
        p.setUrlpath(d.get(URLPATH));
        p.setSourcepath(d.get(SOURCEPATH));
        try {
            // Bug fix: the original read the INFODATE field here, although
            // the crawl date is stored under CRAWLERDATE.
            p.setCrawlerdate(new Timestamp(DateTools.stringToDate(
                    d.get(CRAWLERDATE)).getTime()));
        } catch (java.text.ParseException e) {
            log.error("failed to parse crawler date", e);
        }
        p.setCrawlerDateS(d.get(CRAWLERDATE));
        p.setLocalPath(d.get(LOCALPATH));
        p.setLocalFileName(d.get(LOCALFILENAME));
        p.setTitle(d.get(TITLE));
        return p;
    }
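
    // Lucene index field names and default multi-field query settings.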
    public static final String TYPE = "type";
    public static final String TYPEV = "D";
    public static final String SID = "sid";
    public static final String ID = "id";
    public static final String INDEXLIB = "indexlib";
    public static final String URLPATH = "urlpath";
    public static final String LOCALPATH = "localpath";
    public static final String SOURCEPATH = "sourcepath";
    public static final String SOURCEPATH_SHRINK = "sourcepathShrink";
    public static final String CRAWLERDATE = "crawlerdate";
    public static final String LOCALFILENAME = "localFileName";
    public static final String TITLE = "title";
    public static final String KEYWORD = "keyWord";
    public static final String URLHASHCODE = "urlHashCode";
    public static final String INFODATE = "infodate";

    public static final String[] QUERY_FIELD = { INDEXLIB, URLPATH };
    public static final BooleanClause.Occur[] QUERY_FLAGS = {
            BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
    /**
     * Index library (analyzed content)
     */
    private String indexlib;
    /**
     * Web URL
     */
    private String urlpath;
    /**
     * Information source
     */
    private String sourcepath;
    /**
     * Information source identifier
     */
    private String sourcepathShrink;
    /**
     * Local file path
     */
    private String localPath;
    /**
     * Local file name
     */
    private String localFileName;
    /**
     * Crawl date
     */
    private java.sql.Timestamp crawlerdate;
    /**
     * Crawl date as string
     */
    private String crawlerDateS;
    /**
     * Title
     */
    private String title;
    /**
     * URL hash code
     */
    private String urlHashCode;
    /**
     * Keyword
     */
    private String keyWord;
    /**
     * Information date
     */
    private java.sql.Timestamp infodate;


    public String getIndexlib() {
        return indexlib;
    }

    public void setIndexlib(String indexlib) {
        this.indexlib = indexlib;
    }

    public String getUrlpath() {
        return urlpath;
    }

    public void setUrlpath(String urlpath) {
        this.urlpath = urlpath;
    }

    public String getSourcepath() {
        return sourcepath;
    }

    public void setSourcepath(String sourcepath) {
        this.sourcepath = sourcepath;
    }

    public java.sql.Timestamp getCrawlerdate() {
        return crawlerdate;
    }

    public void setCrawlerdate(java.sql.Timestamp crawlerdate) {
        this.crawlerdate = crawlerdate;
    }

    public String getLocalPath() {
        return localPath;
    }

    public void setLocalPath(String localPath) {
        this.localPath = localPath;
    }

    public String getLocalFileName() {
        return localFileName;
    }

    public void setLocalFileName(String localFileName) {
        this.localFileName = localFileName;
    }

    public String getCrawlerDateS() {
        return crawlerDateS;
    }

    public void setCrawlerDateS(String crawlerDateS) {
        this.crawlerDateS = crawlerDateS;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getSourcepathShrink() {
        return sourcepathShrink;
    }

    public void setSourcepathShrink(String sourcepathShrink) {
        this.sourcepathShrink = sourcepathShrink;
    }

    public String getKeyWord() {
        return keyWord;
    }

    public void setKeyWord(String keyWord) {
        this.keyWord = keyWord;
    }

    public String getUrlHashCode() {
        return urlHashCode;
    }

    public void setUrlHashCode(String urlHashCode) {
        this.urlHashCode = urlHashCode;
    }

    public java.sql.Timestamp getInfodate() {
        return infodate;
    }

    public void setInfodate(java.sql.Timestamp infodate) {
        this.infodate = infodate;
    }
}
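
For reference, here is a minimal usage sketch (not part of the original class) that indexes one crawled record and then runs a paginated search through createQuery and getResult. The index path is a hypothetical placeholder, StandardAnalyzer stands in for whatever analyzer the project actually wires in, and the standard Lucene 3.5 imports (FSDirectory, StandardAnalyzer, IndexWriter, IndexWriterConfig, and the classes imported above) are assumed.

    // Index one record, then fetch page 1 with 10 hits per page.
    Directory dir = FSDirectory.open(new File("/tmp/crawler-index")); // hypothetical path
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35); // assumed analyzer

    IndexWriter writer = new IndexWriter(dir,
            new IndexWriterConfig(Version.LUCENE_35, analyzer));
    LuceneCrawler item = new LuceneCrawler();
    item.setUrlpath("http://example.com/page.html");
    item.setTitle("Example page");
    item.setIndexlib("example page body text");
    // createDocument requires both dates to be non-null.
    item.setCrawlerdate(new java.sql.Timestamp(System.currentTimeMillis()));
    item.setInfodate(new java.sql.Timestamp(System.currentTimeMillis()));
    writer.addDocument(LuceneCrawler.createDocument(item));
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    // No source filter and no date range, so only the full-text clause applies.
    Query q = LuceneCrawler.createQuery("example", null, null, null, analyzer);
    TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
    searcher.search(q, collector);
    Pagination page = LuceneCrawler.getResult(reader, collector, 1, 10);
    searcher.close();
    reader.close();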