Lucene Suggest


Lucene keyword search suggestions

Required JARs:

lucene-suggest-4.7.0.jar

lucene-queryparser-4.7.0.jar

lucene-misc-4.7.0.jar

lucene-memory-4.7.0.jar

lucene-highlighter-4.7.0.jar

lucene-core-4.7.0.jar

lucene-analyzers-common-4.7.0.jar

Analyzer (Chinese tokenizer):

IKAnalyzer2012FF_u1.jar

Result screenshot (not reproduced here)

Code:

package lucene;

import java.io.File;  

import java.io.IOException;  

import java.io.StringReader;  

import java.util.HashSet;  

import java.util.List;  

import java.util.Set;  

  

import org.apache.log4j.Logger;  

import org.apache.lucene.analysis.Analyzer;  

import org.apache.lucene.analysis.AnalyzerWrapper;  

import org.apache.lucene.analysis.TokenStream;  

import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;  

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;  

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;  

import org.apache.lucene.codecs.lucene46.Lucene46Codec;  

import org.apache.lucene.document.BinaryDocValuesField;  

import org.apache.lucene.document.Document;  

import org.apache.lucene.document.Field;  

import org.apache.lucene.document.Field.Index;

import org.apache.lucene.document.FieldType;  

import org.apache.lucene.document.NumericDocValuesField;  

import org.apache.lucene.document.StringField;

import org.apache.lucene.index.AtomicReader;  

import org.apache.lucene.index.DirectoryReader;  

import org.apache.lucene.index.IndexReader;  

import org.apache.lucene.index.IndexWriter;  

import org.apache.lucene.index.IndexWriterConfig;  

import org.apache.lucene.index.IndexWriterConfig.OpenMode;  

import org.apache.lucene.index.MultiDocValues;  

import org.apache.lucene.index.SlowCompositeReaderWrapper;  

import org.apache.lucene.index.Term;  

import org.apache.lucene.search.BooleanClause;  

import org.apache.lucene.search.BooleanQuery;  

import org.apache.lucene.search.IndexSearcher;  

import org.apache.lucene.search.Query;  

import org.apache.lucene.search.Sort;  

import org.apache.lucene.search.SortField;  

import org.apache.lucene.search.TermQuery;  

import org.apache.lucene.search.TopDocs;  

import org.apache.lucene.search.suggest.InputIterator;  

import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;  

import org.apache.lucene.store.Directory;  

import org.apache.lucene.util.BytesRef;  

import org.apache.lucene.util.IOUtils;  

import org.apache.lucene.util.Version;  

  

  

public class MyAnalyzingInfixSuggester extends AnalyzingInfixSuggester {  

    /** Logger. */  

    private final Logger logger = Logger.getLogger(MyAnalyzingInfixSuggester.class);  

  

    /** Field name used for the indexed text. */  

    public static final String TEXT_FIELD_NAME = "text";  

  

    /** Default minimum number of leading characters before 

     *  PrefixQuery is used (4). */  

    public static final int DEFAULT_MIN_PREFIX_CHARS = 4;  

    private final File indexPath;  

    final int minPrefixChars;  

    final Version matchVersion;  

    private final Directory dir;  

    /** Index open mode (create or append). */  

    private final OpenMode mode;  

  

    /* 
     * Overloaded constructor that initialises the suggester.

     * @param matchVersion  Lucene version 

     * @param indexPath  index directory 

     * @param analyzer  analyzer 

     * @param mode  index open mode (create or append) 

     * @throws IOException  

     */  

    public MyAnalyzingInfixSuggester(Version matchVersion, File indexPath, Analyzer analyzer, OpenMode mode) throws IOException {  

        //call the parent constructor  

        super(matchVersion, indexPath, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS);  

        this.mode = mode;  

        this.indexPath = indexPath;  

        this.minPrefixChars = DEFAULT_MIN_PREFIX_CHARS;  

        this.matchVersion = matchVersion;  

        dir = getDirectory(indexPath);  

    }  

  

    /* 
     * Override getIndexWriterConfig so that the index open mode (create or append) is configurable.

     * @see org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester#getIndexWriterConfig(org.apache.lucene.util.Version, org.apache.lucene.analysis.Analyzer) 

     */  

    @Override  

    protected IndexWriterConfig getIndexWriterConfig(Version matchVersion, Analyzer indexAnalyzer) {  

        IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, indexAnalyzer);  

        iwc.setCodec(new Lucene46Codec());  

        if (indexAnalyzer instanceof AnalyzerWrapper) {  

            //the wrapped gram analyzer is only used for the .tmp build directory, which is always opened in CREATE mode  

            iwc.setOpenMode(OpenMode.CREATE);  

        } else {  

            iwc.setOpenMode(mode);  

        }  

        return iwc;  

    }  

  

    /* 
     * Override build() and skip the sort pass the parent performs while building the index.

     * @see org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester#build(org.apache.lucene.search.suggest.InputIterator) 

     */  

    @Override  

    public void build(InputIterator iter) throws IOException {  

        if (searcher != null) {  

            searcher.getIndexReader().close();  

            searcher = null;  

        }  

        Directory dirTmp = getDirectory(new File(indexPath.toString() + ".tmp"));  

        IndexWriter w = null;  

        IndexWriter w2 = null;  

        AtomicReader r = null;  

        boolean success = false;  

        try {  

            Analyzer gramAnalyzer = new AnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {  

                @Override  

                protected Analyzer getWrappedAnalyzer(String fieldName) {  

                    return indexAnalyzer;  

                }  

  

                @Override  

                protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {  

                    if (fieldName.equals("textgrams") && minPrefixChars > 0) {  

                        return new TokenStreamComponents(components.getTokenizer(), new EdgeNGramTokenFilter(matchVersion, components.getTokenStream(), 1, minPrefixChars));  

                    } else {  

                        return components;  

                    }  

                }  

            };  

            w = new IndexWriter(dirTmp, getIndexWriterConfig(matchVersion, gramAnalyzer));  

            BytesRef text;  

            Document doc = new Document();  

            FieldType ft = getTextFieldType(); 

//              Field.Store.YES 

               

            Field textField = new Field(TEXT_FIELD_NAME, "", ft);  

            doc.add(textField);  

             

            Field textGramField = new Field("textgrams", "", ft);  

            doc.add(textGramField);  

  

            Field textDVField = new BinaryDocValuesField(TEXT_FIELD_NAME, new BytesRef());  

            doc.add(textDVField);  

            Field wordDVField = new StringField("word", "", Field.Store.YES);

            doc.add(wordDVField);

            Field weightField = new NumericDocValuesField("weight", 0);  

            doc.add(weightField);  

            Field countField = new StringField("count", "0", Field.Store.YES);

            doc.add(countField);  

            Field payloadField;  

            if (iter.hasPayloads()) {  

                payloadField = new BinaryDocValuesField("payloads", new BytesRef());  

                doc.add(payloadField);  

            } else {  

                payloadField = null;  

            }  

            long t0 = System.nanoTime();  

            while ((text = iter.next()) != null) {  

                String textString = text.utf8ToString();  

                textField.setStringValue(textString);  

                wordDVField.setStringValue(textString);

                textGramField.setStringValue(textString);  

                textDVField.setBytesValue(text);  

                weightField.setLongValue(iter.weight());  

                if (iter.hasPayloads()) {  

                    payloadField.setBytesValue(iter.payload());  

                }  

                w.addDocument(doc);  

            }  

            logger.debug("initial indexing time: " + ((System.nanoTime() - t0) / 1000000) + " msec");  

  

            r = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(w, false));  

            w.rollback();  

  

            w2 = new IndexWriter(dir, getIndexWriterConfig(matchVersion, indexAnalyzer));  

            w2.addIndexes(new IndexReader[] { r });  

            r.close();  

  

            searcher = new IndexSearcher(DirectoryReader.open(w2, false));  

            w2.close();  

  

            payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads");  

            weightsDV = MultiDocValues.getNumericValues(searcher.getIndexReader(), "weight");  

            textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME);  

            assert textDV != null;  

            success = true;  

        } finally {  

            if (success) {  

                IOUtils.close(w, w2, r, dirTmp);  

            } else {  

                IOUtils.closeWhileHandlingException(w, w2, r, dirTmp);  

            }  

        }  

    }  

  

    /* 
     * Override lookup() and change the result ordering (sort by the weight field, descending).

     * @see org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester#lookup(java.lang.CharSequence, int, boolean, boolean) 

     */  

    @Override  

    public List<LookupResult> lookup(CharSequence key, int num, boolean allTermsRequired, boolean doHighlight) {  

  

        if (searcher == null) {  

            throw new IllegalStateException("suggester was not built");  

        }  

  

        final BooleanClause.Occur occur;  

        if (allTermsRequired) {  

            occur = BooleanClause.Occur.MUST;  

        } else {  

            occur = BooleanClause.Occur.SHOULD;  

        }  

  

        TokenStream ts = null;  

        try {  

            ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));  

            ts.reset();  

            final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);  

            final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);  

            String lastToken = null;  

            BooleanQuery query = new BooleanQuery();  

            int maxEndOffset = -1;  

            final Set<String> matchedTokens = new HashSet<String>();  

            while (ts.incrementToken()) {  

                if (lastToken != null) {  

                    matchedTokens.add(lastToken);  

                    query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);  

                }  

                lastToken = termAtt.toString();  

                if (lastToken != null) {  

                    maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());  

                }  

            }  

            ts.end();  

  

            String prefixToken = null;  

            if (lastToken != null) {  

                Query lastQuery;  

                if (maxEndOffset == offsetAtt.endOffset()) {  

                    // Use PrefixQuery (or the ngram equivalent) when  

                    // there was no trailing discarded chars in the  

                    // string (e.g. whitespace), so that if query does  

                    // not end with a space we show prefix matches for  

                    // that token:  

                    lastQuery = getLastTokenQuery(lastToken);  

                    prefixToken = lastToken;  

                } else {  

                    // Use TermQuery for an exact match if there were  

                    // trailing discarded chars (e.g. whitespace), so  

                    // that if query ends with a space we only show  

                    // exact matches for that term:  

                    matchedTokens.add(lastToken);  

                    lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));  

                }  

                if (lastQuery != null) {  

                    query.add(lastQuery, occur);  

                }  

            }  

            ts.close();  

  

            

            Query finalQuery = finishQuery(query, allTermsRequired);  

  

            //sort by the weight doc value, descending  

            Sort sort = new Sort(new SortField("weight", SortField.Type.LONG, true));  

            TopDocs hits = searcher.search(finalQuery, num, sort);  

  

            List<LookupResult> results = createResults(hits, num, key, doHighlight, matchedTokens, prefixToken);  

            return results;  

        } catch (IOException ioe) {  

            throw new RuntimeException(ioe);  

        } finally {  

            IOUtils.closeWhileHandlingException(ts);  

        }  

    }  
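
    /* 
     * Lookup variant without a query key: the constant "count" field (indexed as "0"

     * for every entry) acts as a match-all term, so the top-weighted entries are

     * returned as a default hot-word list.

     */  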

    public List<LookupResult> lookup(int num, boolean allTermsRequired, boolean doHighlight) {  

        if (searcher == null) {  

            throw new IllegalStateException("suggester was not built");  

        }  

  

        final Set<String> matchedTokens = new HashSet<String>();  

        String prefixToken = null;  

        final BooleanClause.Occur occur;  

        occur = BooleanClause.Occur.SHOULD;  

  

        TokenStream ts = null;  

        try {  

          

            BooleanQuery query = new BooleanQuery();  

            Query termQuery = new TermQuery(new Term("count", "0"));

            query.add(termQuery, occur);  

            Query finalQuery = finishQuery(query, allTermsRequired);  

  

            //sort by the weight doc value, descending  

            Sort sort = new Sort(new SortField("weight", SortField.Type.LONG, true));  

            TopDocs hits = searcher.search(finalQuery, num, sort);  

  

            List<LookupResult> results = createResults(hits, num, null, doHighlight, matchedTokens, prefixToken);  

            return results;  

        } catch (IOException ioe) {  

            throw new RuntimeException(ioe);  

        } finally {  

            IOUtils.closeWhileHandlingException(ts);  

        }  

    }  

}  

package lucene;

import java.io.ByteArrayOutputStream;  

import java.io.IOException;  

import java.io.ObjectOutputStream;  

import java.io.UnsupportedEncodingException;  

import java.util.Comparator;  

import java.util.HashSet;  

import java.util.Iterator;  

import java.util.Set;  

import org.apache.lucene.search.suggest.InputIterator;  

import org.apache.lucene.util.BytesRef;  

public class ProductIterator implements InputIterator {  

 

//iterator over the source VO collection

private Iterator<VO> productIterator;  

//the VO currently being iterated

private VO currentProduct;  

public ProductIterator(Iterator<VO> productIterator) {  

      this.productIterator = productIterator;  

}  

  public boolean hasContexts() {  

      return true;  

  }  

  /** 
   * Whether payload data is supplied for each entry.

   */  

  public boolean hasPayloads() {  

      return true;  

  }  
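
  /** 
   * Returning null means the input entries do not need to be pre-sorted.

   */  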

  public Comparator<BytesRef> getComparator() {  

      return null;  

  }  

  public BytesRef next() {  

      if (productIterator.hasNext()) {  

          currentProduct = productIterator.next();  

          try {  

              //use the current VO's term value as the suggestion key  

              return new BytesRef(currentProduct.getTerm().getBytes("UTF8"));  

          } catch (UnsupportedEncodingException e) {  

              throw new RuntimeException("Couldn't convert to UTF-8",e);  

          }  

      } else {  

          return null;  

      }  

  }  

  /** 
   * Serialize the VO into the payload.

   * This is only a demo; storing a whole object in the payload is usually a bad idea,

   * because it bloats the index. Store only the data you actually need to read back later.

   */  

  

  public BytesRef payload() {  

      try {  

          ByteArrayOutputStream bos = new ByteArrayOutputStream();  

          ObjectOutputStream out = new ObjectOutputStream(bos);  

          out.writeObject(currentProduct);  

//          out.writeInt(currentProduct.getTimes());  

          out.close();  

          return new BytesRef(bos.toByteArray());  

      } catch (IOException e) {  

          throw new RuntimeException("Well that's unfortunate.");  

      }  

  }  

  /** 
   * Contexts can hold arbitrary custom data (originally the product's sales regions)

   * and are normally used for filtering. Each element of the Set becomes a TermQuery;

   * you only supply the Set, the suggester API builds the queries underneath.

   */  

  public Set<BytesRef> contexts() {  

      try {  

         Set<BytesRef> regions = new HashSet<BytesRef>();  

          //for (String region : currentProduct.getStlist()) {  

              regions.add(new BytesRef(currentProduct.getTerm().getBytes("UTF8")));  

           //}  

          return regions;  

      } catch (UnsupportedEncodingException e) {  

          throw new RuntimeException("Couldn't convert to UTF-8");  

      }    

  }  

  /** 
   * The weight value, which drives result ordering.

   * Here the term frequency (times) is used as the weight, so the hottest words rank first;

   * any other weighting scheme could be used instead.

   */  

  public long weight() {  

     return  currentProduct.getTimes();

  }  

}  

package lucene;

import java.io.ByteArrayInputStream;

import java.io.File;

import java.io.IOException;

import java.io.ObjectInputStream;

import java.util.ArrayList;

import java.util.Collections;

import java.util.Comparator;

import java.util.HashSet;

import java.util.List;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.index.Term;

import org.apache.lucene.search.FuzzyTermsEnum;

import org.apache.lucene.search.TermQuery;

import org.apache.lucene.search.suggest.Lookup.LookupResult;

import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;

import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.BytesRef;

import org.apache.lucene.util.Version;

import org.wltea.analyzer.lucene.IKAnalyzer;


public class LuceneSuggest {

private static final Version VERSION = Version.LUCENE_47;

public   void indexmake( List<VO>  lucenelist,File indexDir,Analyzer analyzer,Version VERSION, OpenMode create ) throws IOException{

MyAnalyzingInfixSuggester suggester = new MyAnalyzingInfixSuggester(VERSION, indexDir, analyzer,OpenMode.CREATE_OR_APPEND);

try {

suggester.build(new ProductIterator(lucenelist.iterator()));

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}finally {  

            //close the suggester  

suggester.close();  

        } 

}

 

public    List<VO>  lookup(String name, String region,int count,String orgno,String indexDir,Analyzer analyzer,Version VERSION ,OpenMode create ) throws IOException {

MyAnalyzingInfixSuggester suggester = new MyAnalyzingInfixSuggester(VERSION, new File(indexDir+orgno), analyzer,OpenMode.CREATE_OR_APPEND);

 

List<LookupResult> lookup = suggester.lookup( count,false,false );

List<VO> lulist=new ArrayList<VO>();

 HashSet<BytesRef> contexts = new HashSet<BytesRef>(); 

/*contexts.add(new BytesRef(region.getBytes("UTF8"))); */

// filter by contexts first, then match by name, sort by weight and return the top results

// the 3rd (boolean) argument controls whether every term must match; the 4th whether to highlight the key

 /* 
         *   Lookup arguments:

         *     name - the query keyword 

         *     count - maximum number of results 

         *     allTermsRequired - SHOULD vs MUST semantics 

         *     doHighlight - whether to highlight the key 

         */  

List<LookupResult> results =  suggester.lookup(name ,count,false,false);

System.out.println("-- \"" + name + "\" (" + region + "):");

for (LookupResult result : results) {

System.out.println(result.key);

 String str = (String) result.highlightKey; 

 str=(String) result.key;

 Integer time = null;  

// deserialize the VO from the payload

BytesRef bytesRef = result.payload;

ObjectInputStream is = new ObjectInputStream(new ByteArrayInputStream(bytesRef.bytes));

try {

VO vo   = (VO) is.readObject() ;

lulist.add(vo);

} catch (Exception e) {

e.printStackTrace();

}

}

suggester.close();

System.out.println("结束");

if(!"00".equals(orgno)){

MyAnalyzingInfixSuggester suggester1 = new MyAnalyzingInfixSuggester(VERSION, new File(indexDir+"00"), analyzer,OpenMode.CREATE_OR_APPEND);

 /* 
         *   Lookup arguments:

         *     name - the query keyword 

         *     count - maximum number of results 

         *     allTermsRequired - SHOULD vs MUST semantics 

         *     doHighlight - whether to highlight the key 

         */  

List<LookupResult> results1 =  suggester1.lookup(name ,count,false,false);

System.out.println("-- \"" + name + "\" (" + region + "):");

for (LookupResult result : results1) {

System.out.println(result.key);

 String str = (String) result.highlightKey; 

 str=(String) result.key;

 Integer time = null;  

// deserialize the VO from the payload

BytesRef bytesRef = result.payload;

ObjectInputStream is = new ObjectInputStream(new ByteArrayInputStream(bytesRef.bytes));

try {

VO vo   = (VO) is.readObject() ;

lulist.add(vo);

} catch (Exception e) {

e.printStackTrace();

}

}

suggester1.close();

}

 Collections.sort(lulist, new Comparator<VO>() {  

            @Override  

            public int compare(VO o1, VO o2) {  

                int i = o1.getTimes() - o2.getTimes();  

                return i;  

            }  

        });  

 Collections.reverse(lulist);

 if(lulist.size() > count){ 

 lulist = lulist.subList(0, count);// keep only the top count entries

 }

return lulist;

}

/**

 * 

 * @param count

 * @param orgno

 * @param indexDir

 * @param analyzer

 * @param VERSION

 * @param create

 * @return all indexed entries; the constant "count" field (always indexed as "0") is used as a match-all query, giving the default suggestion list

 * @throws IOException

 */

public    List<VO>  lookup1(  int count,String orgno,String indexDir,Analyzer analyzer,Version VERSION ,OpenMode create ) throws IOException {

MyAnalyzingInfixSuggester suggester = new MyAnalyzingInfixSuggester(VERSION, new File(indexDir+orgno), analyzer,OpenMode.CREATE_OR_APPEND);

List<VO> lulist=new ArrayList<VO>();

 

 List<LookupResult> results = suggester.lookup( count,false,false );

for (LookupResult result : results) {

System.out.println(result.key);

 String str = (String) result.highlightKey; 

 str=(String) result.key;

 Integer time = null;  

// deserialize the VO from the payload

BytesRef bytesRef = result.payload;

ObjectInputStream is = new ObjectInputStream(new ByteArrayInputStream(bytesRef.bytes));

try {

VO vo   = (VO) is.readObject() ;

lulist.add(vo);

} catch (Exception e) {

e.printStackTrace();

}

}

suggester.close();

System.out.println("结束");

 Collections.sort(lulist, new Comparator<VO>() {  

            @Override  

            public int compare(VO o1, VO o2) {  

                int i = o1.getTimes() - o2.getTimes();  

                return i;  

            }  

        });  

 Collections.reverse(lulist);

 if(lulist.size() > count){ 

 lulist = lulist.subList(0, count);// keep only the top count entries

 }

return lulist;

}

 

/**

 * Update: delete the existing entry for the word, then re-index the new VO.

 * @throws IOException 

 */

public    void  edit(String word,File indexDir,Analyzer analyzer,Version VERSION ,VO vo) throws IOException{

Directory fsDir = FSDirectory.open(indexDir);     

IndexWriter indexWriter = new IndexWriter(fsDir, new IndexWriterConfig( VERSION, analyzer));  

    //delete the matching term entry  

     TermQuery termQuery = new TermQuery(new Term("word", word));

//     indexWriter.deleteDocuments(new Term(MyAnalyzingInfixSuggester.TEXT_FIELD_NAME, word));  

     indexWriter.deleteDocuments(termQuery);

    //physically remove the deleted documents  

    indexWriter.forceMergeDeletes();  

    //commit and close the IndexWriter  

    indexWriter.commit();  

    indexWriter.close();  

    List<VO> list = new ArrayList<VO>();  

    list.add(vo);  

    //re-index the updated term (append mode)  

    this.indexmake(list, indexDir,analyzer, VERSION,OpenMode.APPEND);

}

public    void  deleteSuggert( String text,File indexDir  ) throws IOException{

Analyzer analyzer = new IKAnalyzer(false);

Directory fsDir = FSDirectory.open(indexDir);     

IndexWriter indexWriter = new IndexWriter(fsDir, new IndexWriterConfig( VERSION, analyzer));  

    //delete the matching term entry  

     TermQuery termQuery = new TermQuery(new Term("word",text));

//     indexWriter.deleteDocuments(new Term(MyAnalyzingInfixSuggester.TEXT_FIELD_NAME, word));  

     indexWriter.deleteDocuments(termQuery);

    //physically remove the deleted documents  

    indexWriter.forceMergeDeletes();  

    //commit and close the IndexWriter  

    indexWriter.commit();  

    indexWriter.close();  

  

}

}

package  lucene;

import java.io.Serializable;

public class VO implements Serializable{

 

private static final long serialVersionUID = 1L;

String term;  

    int times;  

    /** 
     * @param term  the term text 

     * @param times  the term frequency 

     */  

    public VO(String term, int times) {  

        this.term = term;  

        this.times = times;  

    }  

    public VO() {  

        super();  

    }  

    /** 

     * @return the term 

     */  

    public String getTerm() {  

        return term;  

    }  

    /** 

     * @param term the term to set 

     */  

    public void setTerm(String term) {  

        this.term = term;  

    }  

    /** 

     * @return the times 

     */  

    public int getTimes() {  

        return times;  

    }  

    /** 

     * @param times the times to set 

     */  

    public void setTimes(int times) {  

        this.times = times;  

    }  

    /* (non-Javadoc)

     * @see java.lang.Object#toString() 

     */  

    @Override  

    public String toString() {  

        return term + " " + times;  

    }  

    /* (non-Javadoc)

     * @see java.lang.Object#hashCode() 

     */  

    @Override  

    public int hashCode() {  

        final int prime = 31;  

        int result = 1;  

        result = prime * result + ((term == null) ? 0 : term.hashCode());  

        return result;  

    }  

    /* 
     * Equality is based on term only.

     * @see java.lang.Object#equals(java.lang.Object) 

     */  

    @Override  

    public boolean equals(Object obj) {  

        if (this == obj)  

            return true;  

        if (obj == null)  

            return false;  

        if (getClass() != obj.getClass())  

            return false;  

        VO other = (VO) obj;  

        if (term == null) {  

            if (other.term != null)  

                return false;  

        } else if (!term.equals(other.term))  

            return false;  

        return true;  

    }  

}  

1. MyAnalyzingInfixSuggester extends AnalyzingInfixSuggester because the stock implementation always opens the index with OpenMode.CREATE and therefore cannot append to an existing index; the subclass makes the open mode configurable.

2. The Document built in build() is the data that actually gets indexed (text, word, weight, count and the serialized payload).

Main usage

word is the search key; the second argument may be null and is only used for filtering; 10 is the maximum number of results; orgno is appended to IndexWPath to form the index directory; IKAnalyzer(false) uses fine-grained segmentation, true uses smart segmentation.

List<VO> lookup = luceneSuggest.lookup(word, null, 10,orgno,  IndexWPath , new IKAnalyzer(false),  Version.LUCENE_47,OpenMode.CREATE_OR_APPEND);
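
The lookup only returns results after the index has been built at least once with indexmake. A minimal build sketch, assuming the same imports as the classes above and reusing the IndexWPath, orgno and IKAnalyzer(false) values from the lookup call (the sample VO entries are made up for illustration):

List<VO> lucenelist = new ArrayList<VO>();
lucenelist.add(new VO("lucene suggest", 5)); // term plus frequency; times becomes the weight
lucenelist.add(new VO("lucene search", 3));

LuceneSuggest luceneSuggest = new LuceneSuggest();
// build (or rebuild) the per-orgno index directory; must run before any lookup
luceneSuggest.indexmake(lucenelist, new File(IndexWPath + orgno), new IKAnalyzer(false), Version.LUCENE_47, OpenMode.CREATE_OR_APPEND);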

Term-frequency update (increment the chosen word's count so the ranking reflects usage):

VO vo= new VO();

vo.setTerm(word);

vo.setTimes(Integer.valueOf(terms)+1); // terms is the word's current frequency (as a String)

LuceneSuggest luceneSuggest = new LuceneSuggest();

luceneSuggest.edit(word, new File(IndexWPath), new IKAnalyzer(false),Version.LUCENE_47, vo);
