关于使用Filter减少Lucene tf idf打分计算的调研
来源:互联网 发布:c语言写入txt文件 编辑:程序博客网 时间:2024/06/03 18:00
将query改成filter,lucene中有个QueryWrapperFilter性能比较差,所以基本上都需要自己写filter,包括TermFilter,ExactPhraseFilter,ConjunctionFilter,DisjunctionFilter。
这几天验证下来,还是or改善最明显,4个termfilter,4508个返回结果,在我本机上性能提高1/3。ExactPhraseFilter也有小幅提升(5%-10%)。
最令人不解的是and:原来以为性能变化跟结果数和子查询数相关,但几次测试下来,and的性能基本都是下降的。
附ExactPhraseFilter和单元测试(UT)代码:
import java.io.IOException;import java.util.ArrayList;import org.apache.lucene.index.AtomicReaderContext;import org.apache.lucene.index.DocsAndPositionsEnum;import org.apache.lucene.index.Term;import org.apache.lucene.index.TermContext;import org.apache.lucene.index.TermState;import org.apache.lucene.index.Terms;import org.apache.lucene.index.TermsEnum;import org.apache.lucene.search.DocIdSet;import org.apache.lucene.search.DocIdSetIterator;import org.apache.lucene.search.Filter;import org.apache.lucene.util.ArrayUtil;import org.apache.lucene.util.Bits;// A fake to lucene phrase query, but far simplified.public class ExactPhraseFilter extends Filter { protected final ArrayList<Term> terms = new ArrayList<Term>(); protected final ArrayList<Integer> positions = new ArrayList<Integer>(); protected String fieldName; public void add(Term term) { if (terms.size() == 0) { fieldName = term.field(); } else { assert fieldName == term.field(); } positions.add(Integer.valueOf(terms.size())); terms.add(term); } @Override public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException { return new ExactPhraseDocIdSet(context, acceptDocs); } static class PostingAndFreq implements Comparable<PostingAndFreq> { DocsAndPositionsEnum posEnum; int docFreq; int position; boolean useAdvance; int posFreq = 0; int pos = -1; int posTime = 0; public PostingAndFreq(DocsAndPositionsEnum posEnum, int docFreq, int position, boolean useAdvance) { this.posEnum = posEnum; this.docFreq = docFreq; this.position = position; this.useAdvance = useAdvance; } @Override public int compareTo(PostingAndFreq other) { if (docFreq != other.docFreq) { return docFreq - other.docFreq; } if (position != other.position) { return position - other.position; } return 0; } } protected class ExactPhraseDocIdSet extends DocIdSet { protected final AtomicReaderContext context; protected final Bits acceptDocs; protected final PostingAndFreq[] postings; protected boolean noDocs = false; public 
ExactPhraseDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException { this.context = context; this.acceptDocs = acceptDocs; Terms fieldTerms = context.reader().fields().terms(fieldName); // TermContext states[] = new TermContext[terms.size()]; postings = new PostingAndFreq[terms.size()]; TermsEnum te = fieldTerms.iterator(null); for (int i = 0; i < terms.size(); ++i) { final Term t = terms.get(i); // states[i] = TermContext.build(context, terms.get(i), true); // final TermState state = states[i].get(context.ord); if (!te.seekExact(t.bytes(), true)) { noDocs = true; return; } if (i == 0) { postings[i] = new PostingAndFreq(te.docsAndPositions(acceptDocs, null, 0), te.docFreq(), positions.get(i), false); } else { postings[i] = new PostingAndFreq(te.docsAndPositions(acceptDocs, null, 0), te.docFreq(), positions.get(i), te.docFreq() > 5 * postings[0].docFreq); } } ArrayUtil.mergeSort(postings); for (int i = 1; i < terms.size(); ++i) { postings[i].posEnum.nextDoc(); } } @Override public DocIdSetIterator iterator() throws IOException { if (noDocs) { return EMPTY_DOCIDSET.iterator(); } else { return new ExactPhraseDocIdSetIterator(context, acceptDocs); } } protected class ExactPhraseDocIdSetIterator extends DocIdSetIterator { protected int docID = -1; public ExactPhraseDocIdSetIterator(AtomicReaderContext context, Bits acceptDocs) throws IOException { } @Override public int nextDoc() throws IOException { while (true) { // first (rarest) term final int doc = postings[0].posEnum.nextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { // System.err.println("END"); return docID = doc; } // non-first terms int i = 1; while (i < postings.length) { final PostingAndFreq pf = postings[i]; int doc2 = pf.posEnum.docID(); if (pf.useAdvance) { if (doc2 < doc) { doc2 = pf.posEnum.advance(doc); } } else { int iter = 0; while (doc2 < doc) { if (++iter == 50) { doc2 = pf.posEnum.advance(doc); } else { doc2 = pf.posEnum.nextDoc(); } } } if (doc2 > doc) { break; } ++i; } if 
(i == postings.length) { // System.err.println(doc); docID = doc; // return docID; if (containsPhrase()) { return docID; } } } } @Override public int advance(int target) throws IOException { throw new IOException(); } private boolean containsPhrase() throws IOException { int index = -1; int i = 0; PostingAndFreq pf; // init. for (i = 0; i < postings.length; ++i) { postings[i].posFreq = postings[i].posEnum.freq(); postings[i].pos = postings[i].posEnum.nextPosition() - postings[i].position; postings[i].posTime = 1; } while (true) { pf = postings[0]; // first term. while (pf.pos < index && pf.posTime < pf.posFreq) { pf.pos = pf.posEnum.nextPosition() - pf.position; ++pf.posTime; } if (pf.pos >= index) { index = pf.pos; } else if (pf.posTime == pf.posFreq) { return false; } // other terms. for (i = 1; i < postings.length; ++i) { pf = postings[i]; while (pf.pos < index && pf.posTime < pf.posFreq) { pf.pos = pf.posEnum.nextPosition() - pf.position; ++pf.posTime; } if (pf.pos > index) { index = pf.pos; break; } if (pf.pos == index) { continue; } if (pf.posTime == pf.posFreq) {
return false; } } if (i == postings.length) { return true; } } } @Override public int docID() { return docID; } } } }
UT:
import java.io.IOException;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.codecs.Codec;import org.apache.lucene.document.Document;import org.apache.lucene.document.TextField;import org.apache.lucene.document.Field.Store;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.Term;import org.apache.lucene.index.IndexWriterConfig.OpenMode;import org.apache.lucene.search.ConstantScoreQuery;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.store.RAMDirectory;import org.apache.lucene.util.Version;import org.testng.annotations.AfterTest;import org.testng.annotations.BeforeTest;import org.testng.annotations.Test;import com.dp.arts.lucenex.codec.Dp10Codec;public class ExactPhraseFilterTest{ final Directory dir = new RAMDirectory(); @BeforeTest public void setUp() throws IOException { Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); iwc.setOpenMode(OpenMode.CREATE); iwc.setCodec(Codec.forName(Dp10Codec.DP10_CODEC_NAME)); IndexWriter writer = new IndexWriter(dir, iwc); addDocument(writer, "新疆烧烤"); // 0 addDocument(writer, "啤酒"); // 1 addDocument(writer, "烤烧"); // 2 addDocument(writer, "烧烧烧"); // 3 addDocument(writer, "烤烧中华烧烤"); // 4 writer.close(); } private void addDocument(IndexWriter writer, String str) throws IOException { Document doc = new Document(); doc.add(new TextField("searchkeywords", str, Store.YES)); writer.addDocument(doc, new StandardAnalyzer(Version.LUCENE_40)); } @AfterTest public void tearDown() throws IOException { this.dir.close(); } @Test public void test1() throws IOException { IndexReader 
reader = DirectoryReader.open(dir); IndexSearcher searcher = new IndexSearcher(reader); ExactPhraseFilter pf = new ExactPhraseFilter(); pf.add(new Term("searchkeywords", "烧")); pf.add(new Term("searchkeywords", "烤")); Query query = new ConstantScoreQuery(pf); TopDocs results = searcher.search(query, 20); assert results.totalHits == 2; assert results.scoreDocs[0].doc == 0; assert results.scoreDocs[1].doc == 4; searcher.getIndexReader().close(); }}
0 0
- 关于使用Filter减少Lucene tf idf打分计算的调研
- TF-IDF 的计算二
- 使用spark的TF-IDF算法计算单词的重要性
- 使用sci-kit learn计算TF-IDF
- python 使用sklearn计算TF-IDF权重
- 计算分词的Tf-idf值
- 计算分词的tf*idf算法
- TF-IDF与余弦相似性的计算
- 的Tf-idf值分词计算列举
- 计算jieba分词的Tf-idf值
- TF-IDF计算一
- TF-IDF计算三
- TF-IDF计算四
- TF-IDF计算 Python
- sklearn 计算tf-idf
- 计算TF-IDF
- 利用lucene获取tf-idf
- 利用lucene获取tf-idf
- 写在前面
- SP2010开发和VS2010专家"食谱"--第五章节—Web部件(3)--创建启用AJAX的Web部件
- 字符串逆反
- NYOJ181小明的难题
- 编译器工作流程详解
- 关于使用Filter减少Lucene tf idf打分计算的调研
- PHP URL编码与解码
- mac 访问虚拟机里的python django页面
- Linux 下从命令行打开pdf文件和html文件的命令
- 苹果ipa软件包破解笔记
- Python入门之程序执行方式
- 吞吐量和延迟、信号量和互斥锁的形象比喻
- 第三节 API设计 Box2D中文手册的学习之旅
- 黑马程序员——oc篇(三)内存管理