java LuceneCrawler
来源:互联网 发布:广电授权的网络机顶盒 编辑:程序博客网 时间:2024/04/25 21:42
package com.jeecms.jspgou.lucene;
import java.io.IOException;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.jeecms.common.page.Pagination;
/**
 * Crawled-page record used for full-text retrieval.
 *
 * <p>Besides the plain bean properties, the class offers static helpers that
 * build the search query, convert a record to and from a Lucene
 * {@link Document}, and cut search results into {@link Pagination} pages.
 *
 * @author liufang
 */
public class LuceneCrawler {
    private static final Logger log = LoggerFactory
            .getLogger(LuceneCrawler.class);

    // ---- Lucene field names -------------------------------------------------
    public static final String TYPE = "type";
    public static final String TYPEV = "D";
    public static final String SID = "sid";
    public static final String ID = "id";
    public static final String INDEXLIB = "indexlib";
    public static final String URLPATH = "urlpath";
    public static final String LOCALPATH = "localpath";
    public static final String SOURCEPATH = "sourcepath";
    public static final String SOURCEPATH_SHRINK = "sourcepathShrink";
    public static final String CRAWLERDATE = "crawlerdate";
    public static final String LOCALFILENAME = "localFileName";
    public static final String TITLE = "title";
    public static final String KEYWORD = "keyWord";
    public static final String URLHASHCODE = "urlHashCode";
    public static final String INFODATE = "infodate";
    public static final String[] QUERY_FIELD = { INDEXLIB, URLPATH };
    public static final BooleanClause.Occur[] QUERY_FLAGS = {
            BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };

    /** Index library. */
    private String indexlib;
    /** Web URL. */
    private String urlpath;
    /** Information source. */
    private String sourcepath;
    /** Information source identifier. */
    private String sourcepathShrink;
    /** Local file path. */
    private String localPath;
    /** Local file name. */
    private String localFileName;
    /** Crawl date. */
    private java.sql.Timestamp crawlerdate;
    /** Crawl date as a string. */
    private String crawlerDateS;
    /** Title. */
    private String title;
    /** Hash code of the URL. */
    private String urlHashCode;
    /** Keyword. */
    private String keyWord;
    /** Information date. */
    private java.sql.Timestamp infodate;

    /**
     * Builds the boolean query used for searching crawled pages.
     *
     * <p>Every clause is added with {@link Occur#MUST}: the parsed
     * {@code queryString} on {@link #INDEXLIB}, optionally the parsed
     * {@code shrink} on {@link #SOURCEPATH_SHRINK}, and optionally an
     * inclusive term range on {@link #INFODATE}.
     *
     * @param queryString query text, parsed against {@link #INDEXLIB} with OR as default operator
     * @param shrink      source-identifier filter; skipped when {@code null} or empty
     * @param startTime   lower bound of the {@link #INFODATE} range, may be {@code null}
     * @param endTime     upper bound of the {@link #INFODATE} range, may be {@code null}
     * @param analyzer    analyzer handed to both query parsers
     * @return the combined query
     * @throws ParseException if {@code queryString} or {@code shrink} cannot be parsed
     */
    public static Query createQuery(String queryString, String shrink, String startTime, String endTime, Analyzer analyzer) throws ParseException {
        BooleanQuery combined = new BooleanQuery();

        QueryParser contentParser = new QueryParser(Version.LUCENE_35, LuceneCrawler.INDEXLIB, analyzer);
        contentParser.setDefaultOperator(QueryParser.OR_OPERATOR);
        Query contentQuery = contentParser.parse(queryString);

        if (shrink != null && !shrink.equals("")) {
            // NOTE(review): the original switched the *content* parser to AND_OPERATOR
            // here, after it had already parsed, which had no effect. The call was
            // probably meant for this parser; it is dropped rather than redirected so
            // the effective behaviour (parser default operator) stays unchanged —
            // confirm the intended operator for the source filter.
            QueryParser shrinkParser = new QueryParser(Version.LUCENE_35, LuceneCrawler.SOURCEPATH_SHRINK, analyzer);
            combined.add(shrinkParser.parse(shrink), Occur.MUST);
        }

        if (startTime != null || endTime != null) {
            // Only one bound may be present; the missing one is passed as null.
            TermRangeQuery rangeQuery = new TermRangeQuery(LuceneCrawler.INFODATE, startTime, endTime, true, true);
            combined.add(rangeQuery, Occur.MUST);
        }

        combined.add(contentQuery, Occur.MUST);
        return combined;
    }

    /**
     * Turns one page of collected hits into a {@link Pagination}.
     *
     * @param reader   reader used to load the stored documents
     * @param res      collector holding the hits
     * @param pageNo   1-based page number
     * @param pageSize number of records per page
     * @return the requested page; the total count is {@code res.getTotalHits()}
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           on a low-level I/O error
     */
    public static Pagination getResult(IndexReader reader,
            TopScoreDocCollector res, int pageNo, int pageSize)
            throws CorruptIndexException, IOException {
        List<LuceneCrawler> list = new ArrayList<LuceneCrawler>(pageSize);
        TopDocs page = res.topDocs((pageNo - 1) * pageSize, pageSize);
        for (ScoreDoc hit : page.scoreDocs) {
            list.add(createDoc(reader.document(hit.doc)));
        }
        return new Pagination(pageNo, pageSize, res.getTotalHits(), list);
    }

    /**
     * Cuts one page out of an already sorted {@link TopDocs}.
     *
     * <p>The slice is clamped to the hits actually present in
     * {@code topDocs.scoreDocs}, so a page beyond the fetched hits (or a
     * {@code pageNo} below 1) yields an empty list instead of an
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param topDocs  sorted search result
     * @param searcher searcher used to load the stored documents
     * @param res      not used by this method; kept for interface compatibility
     * @param pageNo   1-based page number
     * @param pageSize number of records per page
     * @return the requested page; the total count is {@code topDocs.totalHits}
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           on a low-level I/O error
     */
    public static Pagination getResultSort(TopDocs topDocs,
            IndexSearcher searcher, TopScoreDocCollector res, int pageNo, int pageSize)
            throws CorruptIndexException, IOException {
        int totalHits = topDocs.totalHits;
        ScoreDoc[] docHits = topDocs.scoreDocs;

        int from = Math.max((pageNo - 1) * pageSize, 0);
        // totalHits may exceed the number of hits that were actually fetched.
        int to = Math.min(Math.min(pageSize * pageNo, totalHits), docHits.length);

        List<LuceneCrawler> list = new ArrayList<LuceneCrawler>(Math.max(pageSize, 0));
        for (int i = from; i < to; i++) {
            list.add(createDoc(searcher.doc(docHits[i].doc)));
        }
        return new Pagination(pageNo, pageSize, totalHits, list);
    }

    /**
     * Converts crawler data into a Lucene {@link Document}.
     *
     * <p>String properties are added only when not blank; the two dates are
     * added only when not {@code null}. All fields are stored. Title and
     * index library are analyzed, everything else is indexed unanalyzed.
     *
     * @param p crawler data
     * @return the Lucene document
     */
    public static Document createDocument(LuceneCrawler p) {
        Document doc = new Document();
        addKeywordField(doc, URLPATH, p.getUrlpath());                    // URL, not tokenized
        addKeywordField(doc, URLHASHCODE, p.getUrlHashCode());            // URL hash, not tokenized
        addKeywordField(doc, SOURCEPATH, p.getSourcepath());              // source
        addKeywordField(doc, SOURCEPATH_SHRINK, p.getSourcepathShrink()); // source identifier
        addKeywordField(doc, LOCALPATH, p.getLocalPath());                // local file path
        addKeywordField(doc, LOCALFILENAME, p.getLocalFileName());        // local file name
        if (!StringUtils.isBlank(p.getTitle())) {
            doc.add(new Field(TITLE, p.getTitle(), Field.Store.YES, Field.Index.ANALYZED)); // title
        }
        addDateField(doc, CRAWLERDATE, p.getCrawlerdate());
        addDateField(doc, INFODATE, p.getInfodate());
        addKeywordField(doc, KEYWORD, p.getKeyWord());                    // keyword, not tokenized
        if (!StringUtils.isBlank(p.getIndexlib())) {
            doc.add(new Field(INDEXLIB, p.getIndexlib(), Field.Store.YES, Field.Index.ANALYZED)); // tokenized
        }
        return doc;
    }

    /** Adds a stored, unanalyzed field unless the value is blank. */
    private static void addKeywordField(Document doc, String name, String value) {
        if (!StringUtils.isBlank(value)) {
            doc.add(new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        }
    }

    /** Adds a stored, unanalyzed millisecond-resolution date field unless the value is null. */
    private static void addDateField(Document doc, String name, java.sql.Timestamp value) {
        if (value != null) {
            doc.add(new Field(name, DateTools.dateToString(
                    value, DateTools.Resolution.MILLISECOND),
                    Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        }
    }

    /**
     * Rebuilds a crawler record from a stored Lucene {@link Document}.
     *
     * @param d document returned by a search
     * @return the record; properties whose field is absent stay {@code null}
     */
    public static LuceneCrawler createDoc(Document d) {
        LuceneCrawler p = new LuceneCrawler();
        p.setIndexlib(d.get(INDEXLIB));
        p.setUrlpath(d.get(URLPATH));
        p.setSourcepath(d.get(SOURCEPATH));

        // NOTE(review): crawlerdate and crawlerDateS are filled from the INFODATE
        // field, not CRAWLERDATE. Kept as in the original because callers may
        // rely on it — confirm whether this is intended.
        String infoDate = d.get(INFODATE);
        if (infoDate != null) {
            try {
                long millis = DateTools.stringToDate(infoDate).getTime();
                p.setCrawlerdate(new Timestamp(millis));
                p.setInfodate(new Timestamp(millis));
            } catch (java.text.ParseException e) {
                log.warn("Cannot parse stored " + INFODATE + " value: " + infoDate, e);
            }
        }
        p.setCrawlerDateS(infoDate);

        p.setLocalPath(d.get(LOCALPATH));
        p.setLocalFileName(d.get(LOCALFILENAME));
        p.setTitle(d.get(TITLE));
        return p;
    }

    public String getIndexlib() {
        return indexlib;
    }

    public void setIndexlib(String indexlib) {
        this.indexlib = indexlib;
    }

    public String getUrlpath() {
        return urlpath;
    }

    public void setUrlpath(String urlpath) {
        this.urlpath = urlpath;
    }

    public String getSourcepath() {
        return sourcepath;
    }

    public void setSourcepath(String sourcepath) {
        this.sourcepath = sourcepath;
    }

    public java.sql.Timestamp getCrawlerdate() {
        return crawlerdate;
    }

    public void setCrawlerdate(java.sql.Timestamp crawlerdate) {
        this.crawlerdate = crawlerdate;
    }

    public String getLocalPath() {
        return localPath;
    }

    public void setLocalPath(String localPath) {
        this.localPath = localPath;
    }

    public String getLocalFileName() {
        return localFileName;
    }

    public void setLocalFileName(String localFileName) {
        this.localFileName = localFileName;
    }

    public String getCrawlerDateS() {
        return crawlerDateS;
    }

    public void setCrawlerDateS(String crawlerDateS) {
        this.crawlerDateS = crawlerDateS;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getSourcepathShrink() {
        return sourcepathShrink;
    }

    public void setSourcepathShrink(String sourcepathShrink) {
        this.sourcepathShrink = sourcepathShrink;
    }

    public String getKeyWord() {
        return keyWord;
    }

    public void setKeyWord(String keyWord) {
        this.keyWord = keyWord;
    }

    public String getUrlHashCode() {
        return urlHashCode;
    }

    public void setUrlHashCode(String urlHashCode) {
        this.urlHashCode = urlHashCode;
    }

    public java.sql.Timestamp getInfodate() {
        return infodate;
    }

    public void setInfodate(java.sql.Timestamp infodate) {
        this.infodate = infodate;
    }
}
import java.io.IOException;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.jeecms.common.page.Pagination;
/**
 * Crawled-page record used for full-text retrieval.
 *
 * <p>Besides the plain bean properties, the class offers static helpers that
 * build the search query, convert a record to and from a Lucene
 * {@link Document}, and cut search results into {@link Pagination} pages.
 *
 * @author liufang
 */
public class LuceneCrawler {
    private static final Logger log = LoggerFactory
            .getLogger(LuceneCrawler.class);

    // ---- Lucene field names -------------------------------------------------
    public static final String TYPE = "type";
    public static final String TYPEV = "D";
    public static final String SID = "sid";
    public static final String ID = "id";
    public static final String INDEXLIB = "indexlib";
    public static final String URLPATH = "urlpath";
    public static final String LOCALPATH = "localpath";
    public static final String SOURCEPATH = "sourcepath";
    public static final String SOURCEPATH_SHRINK = "sourcepathShrink";
    public static final String CRAWLERDATE = "crawlerdate";
    public static final String LOCALFILENAME = "localFileName";
    public static final String TITLE = "title";
    public static final String KEYWORD = "keyWord";
    public static final String URLHASHCODE = "urlHashCode";
    public static final String INFODATE = "infodate";
    public static final String[] QUERY_FIELD = { INDEXLIB, URLPATH };
    public static final BooleanClause.Occur[] QUERY_FLAGS = {
            BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };

    /** Index library. */
    private String indexlib;
    /** Web URL. */
    private String urlpath;
    /** Information source. */
    private String sourcepath;
    /** Information source identifier. */
    private String sourcepathShrink;
    /** Local file path. */
    private String localPath;
    /** Local file name. */
    private String localFileName;
    /** Crawl date. */
    private java.sql.Timestamp crawlerdate;
    /** Crawl date as a string. */
    private String crawlerDateS;
    /** Title. */
    private String title;
    /** Hash code of the URL. */
    private String urlHashCode;
    /** Keyword. */
    private String keyWord;
    /** Information date. */
    private java.sql.Timestamp infodate;

    /**
     * Builds the boolean query used for searching crawled pages.
     *
     * <p>Every clause is added with {@link Occur#MUST}: the parsed
     * {@code queryString} on {@link #INDEXLIB}, optionally the parsed
     * {@code shrink} on {@link #SOURCEPATH_SHRINK}, and optionally an
     * inclusive term range on {@link #INFODATE}.
     *
     * @param queryString query text, parsed against {@link #INDEXLIB} with OR as default operator
     * @param shrink      source-identifier filter; skipped when {@code null} or empty
     * @param startTime   lower bound of the {@link #INFODATE} range, may be {@code null}
     * @param endTime     upper bound of the {@link #INFODATE} range, may be {@code null}
     * @param analyzer    analyzer handed to both query parsers
     * @return the combined query
     * @throws ParseException if {@code queryString} or {@code shrink} cannot be parsed
     */
    public static Query createQuery(String queryString, String shrink, String startTime, String endTime, Analyzer analyzer) throws ParseException {
        BooleanQuery combined = new BooleanQuery();

        QueryParser contentParser = new QueryParser(Version.LUCENE_35, LuceneCrawler.INDEXLIB, analyzer);
        contentParser.setDefaultOperator(QueryParser.OR_OPERATOR);
        Query contentQuery = contentParser.parse(queryString);

        if (shrink != null && !shrink.equals("")) {
            // NOTE(review): the original switched the *content* parser to AND_OPERATOR
            // here, after it had already parsed, which had no effect. The call was
            // probably meant for this parser; it is dropped rather than redirected so
            // the effective behaviour (parser default operator) stays unchanged —
            // confirm the intended operator for the source filter.
            QueryParser shrinkParser = new QueryParser(Version.LUCENE_35, LuceneCrawler.SOURCEPATH_SHRINK, analyzer);
            combined.add(shrinkParser.parse(shrink), Occur.MUST);
        }

        if (startTime != null || endTime != null) {
            // Only one bound may be present; the missing one is passed as null.
            TermRangeQuery rangeQuery = new TermRangeQuery(LuceneCrawler.INFODATE, startTime, endTime, true, true);
            combined.add(rangeQuery, Occur.MUST);
        }

        combined.add(contentQuery, Occur.MUST);
        return combined;
    }

    /**
     * Turns one page of collected hits into a {@link Pagination}.
     *
     * @param reader   reader used to load the stored documents
     * @param res      collector holding the hits
     * @param pageNo   1-based page number
     * @param pageSize number of records per page
     * @return the requested page; the total count is {@code res.getTotalHits()}
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           on a low-level I/O error
     */
    public static Pagination getResult(IndexReader reader,
            TopScoreDocCollector res, int pageNo, int pageSize)
            throws CorruptIndexException, IOException {
        List<LuceneCrawler> list = new ArrayList<LuceneCrawler>(pageSize);
        TopDocs page = res.topDocs((pageNo - 1) * pageSize, pageSize);
        for (ScoreDoc hit : page.scoreDocs) {
            list.add(createDoc(reader.document(hit.doc)));
        }
        return new Pagination(pageNo, pageSize, res.getTotalHits(), list);
    }

    /**
     * Cuts one page out of an already sorted {@link TopDocs}.
     *
     * <p>The slice is clamped to the hits actually present in
     * {@code topDocs.scoreDocs}, so a page beyond the fetched hits (or a
     * {@code pageNo} below 1) yields an empty list instead of an
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param topDocs  sorted search result
     * @param searcher searcher used to load the stored documents
     * @param res      not used by this method; kept for interface compatibility
     * @param pageNo   1-based page number
     * @param pageSize number of records per page
     * @return the requested page; the total count is {@code topDocs.totalHits}
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           on a low-level I/O error
     */
    public static Pagination getResultSort(TopDocs topDocs,
            IndexSearcher searcher, TopScoreDocCollector res, int pageNo, int pageSize)
            throws CorruptIndexException, IOException {
        int totalHits = topDocs.totalHits;
        ScoreDoc[] docHits = topDocs.scoreDocs;

        int from = Math.max((pageNo - 1) * pageSize, 0);
        // totalHits may exceed the number of hits that were actually fetched.
        int to = Math.min(Math.min(pageSize * pageNo, totalHits), docHits.length);

        List<LuceneCrawler> list = new ArrayList<LuceneCrawler>(Math.max(pageSize, 0));
        for (int i = from; i < to; i++) {
            list.add(createDoc(searcher.doc(docHits[i].doc)));
        }
        return new Pagination(pageNo, pageSize, totalHits, list);
    }

    /**
     * Converts crawler data into a Lucene {@link Document}.
     *
     * <p>String properties are added only when not blank; the two dates are
     * added only when not {@code null}. All fields are stored. Title and
     * index library are analyzed, everything else is indexed unanalyzed.
     *
     * @param p crawler data
     * @return the Lucene document
     */
    public static Document createDocument(LuceneCrawler p) {
        Document doc = new Document();
        addKeywordField(doc, URLPATH, p.getUrlpath());                    // URL, not tokenized
        addKeywordField(doc, URLHASHCODE, p.getUrlHashCode());            // URL hash, not tokenized
        addKeywordField(doc, SOURCEPATH, p.getSourcepath());              // source
        addKeywordField(doc, SOURCEPATH_SHRINK, p.getSourcepathShrink()); // source identifier
        addKeywordField(doc, LOCALPATH, p.getLocalPath());                // local file path
        addKeywordField(doc, LOCALFILENAME, p.getLocalFileName());        // local file name
        if (!StringUtils.isBlank(p.getTitle())) {
            doc.add(new Field(TITLE, p.getTitle(), Field.Store.YES, Field.Index.ANALYZED)); // title
        }
        addDateField(doc, CRAWLERDATE, p.getCrawlerdate());
        addDateField(doc, INFODATE, p.getInfodate());
        addKeywordField(doc, KEYWORD, p.getKeyWord());                    // keyword, not tokenized
        if (!StringUtils.isBlank(p.getIndexlib())) {
            doc.add(new Field(INDEXLIB, p.getIndexlib(), Field.Store.YES, Field.Index.ANALYZED)); // tokenized
        }
        return doc;
    }

    /** Adds a stored, unanalyzed field unless the value is blank. */
    private static void addKeywordField(Document doc, String name, String value) {
        if (!StringUtils.isBlank(value)) {
            doc.add(new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        }
    }

    /** Adds a stored, unanalyzed millisecond-resolution date field unless the value is null. */
    private static void addDateField(Document doc, String name, java.sql.Timestamp value) {
        if (value != null) {
            doc.add(new Field(name, DateTools.dateToString(
                    value, DateTools.Resolution.MILLISECOND),
                    Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        }
    }

    /**
     * Rebuilds a crawler record from a stored Lucene {@link Document}.
     *
     * @param d document returned by a search
     * @return the record; properties whose field is absent stay {@code null}
     */
    public static LuceneCrawler createDoc(Document d) {
        LuceneCrawler p = new LuceneCrawler();
        p.setIndexlib(d.get(INDEXLIB));
        p.setUrlpath(d.get(URLPATH));
        p.setSourcepath(d.get(SOURCEPATH));

        // NOTE(review): crawlerdate and crawlerDateS are filled from the INFODATE
        // field, not CRAWLERDATE. Kept as in the original because callers may
        // rely on it — confirm whether this is intended.
        String infoDate = d.get(INFODATE);
        if (infoDate != null) {
            try {
                long millis = DateTools.stringToDate(infoDate).getTime();
                p.setCrawlerdate(new Timestamp(millis));
                p.setInfodate(new Timestamp(millis));
            } catch (java.text.ParseException e) {
                log.warn("Cannot parse stored " + INFODATE + " value: " + infoDate, e);
            }
        }
        p.setCrawlerDateS(infoDate);

        p.setLocalPath(d.get(LOCALPATH));
        p.setLocalFileName(d.get(LOCALFILENAME));
        p.setTitle(d.get(TITLE));
        return p;
    }

    public String getIndexlib() {
        return indexlib;
    }

    public void setIndexlib(String indexlib) {
        this.indexlib = indexlib;
    }

    public String getUrlpath() {
        return urlpath;
    }

    public void setUrlpath(String urlpath) {
        this.urlpath = urlpath;
    }

    public String getSourcepath() {
        return sourcepath;
    }

    public void setSourcepath(String sourcepath) {
        this.sourcepath = sourcepath;
    }

    public java.sql.Timestamp getCrawlerdate() {
        return crawlerdate;
    }

    public void setCrawlerdate(java.sql.Timestamp crawlerdate) {
        this.crawlerdate = crawlerdate;
    }

    public String getLocalPath() {
        return localPath;
    }

    public void setLocalPath(String localPath) {
        this.localPath = localPath;
    }

    public String getLocalFileName() {
        return localFileName;
    }

    public void setLocalFileName(String localFileName) {
        this.localFileName = localFileName;
    }

    public String getCrawlerDateS() {
        return crawlerDateS;
    }

    public void setCrawlerDateS(String crawlerDateS) {
        this.crawlerDateS = crawlerDateS;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getSourcepathShrink() {
        return sourcepathShrink;
    }

    public void setSourcepathShrink(String sourcepathShrink) {
        this.sourcepathShrink = sourcepathShrink;
    }

    public String getKeyWord() {
        return keyWord;
    }

    public void setKeyWord(String keyWord) {
        this.keyWord = keyWord;
    }

    public String getUrlHashCode() {
        return urlHashCode;
    }

    public void setUrlHashCode(String urlHashCode) {
        this.urlHashCode = urlHashCode;
    }

    public java.sql.Timestamp getInfodate() {
        return infodate;
    }

    public void setInfodate(java.sql.Timestamp infodate) {
        this.infodate = infodate;
    }
}
0 0
- java LuceneCrawler
- java
- JAVA
- JAVA
- JAVA
- java
- Java
- Java
- JAVA:
- java
- java
- java
- java
- Java
- java
- java
- java
- JAVA?
- 用CSS控制图片大小显示的方法
- cocos2d-x中CCSaleTo与CCScaleBy的详细区别(附加代码)
- 06-在Windows中制作os x启动盘,安装os
- java中代码的注释和快捷键
- 怎样优化长尾要害词
- java LuceneCrawler
- windowIsFloating
- java LuceneCrawlerSvcImpl
- 当装系统时遇到“选中的磁盘采用GPT分区形式”
- JHOST 邀请码
- Least Recently Used(LRU) Cache
- 在Tomcat中使用数据连接池
- LIBSVM 如何关闭 Accuracy
- 终于有自己的博客啦