关于使用Filter减少Lucene tf idf打分计算的调研

来源:互联网 发布:c语言写入txt文件 编辑:程序博客网 时间:2024/06/03 18:00
将query改成filter,lucene中有个QueryWrapperFilter性能比较差,所以基本上都需要自己写filter,包括TermFilter,ExactPhraseFilter,ConjunctionFilter,DisjunctionFilter。
这几天验证下来,还是or改善最明显,4个termfilter,4508个返回结果,在我本机上性能提高1/3。ExactPhraseFilter也有小幅提升(5%-10%)。
最令人不解的是and,原来以为跟结果数和子查询数相关,但几次测试基本都是下降。

附 ExactPhraseFilter 及其单元测试(UT)代码:

import java.io.IOException;import java.util.ArrayList;import org.apache.lucene.index.AtomicReaderContext;import org.apache.lucene.index.DocsAndPositionsEnum;import org.apache.lucene.index.Term;import org.apache.lucene.index.TermContext;import org.apache.lucene.index.TermState;import org.apache.lucene.index.Terms;import org.apache.lucene.index.TermsEnum;import org.apache.lucene.search.DocIdSet;import org.apache.lucene.search.DocIdSetIterator;import org.apache.lucene.search.Filter;import org.apache.lucene.util.ArrayUtil;import org.apache.lucene.util.Bits;// A fake to lucene phrase query, but far simplified.public class ExactPhraseFilter extends Filter {    protected final ArrayList<Term> terms = new ArrayList<Term>();    protected final ArrayList<Integer> positions = new ArrayList<Integer>();        protected String fieldName;        public void add(Term term) {        if (terms.size() == 0) {            fieldName = term.field();        } else {            assert fieldName == term.field();        }        positions.add(Integer.valueOf(terms.size()));        terms.add(term);    }        @Override    public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException    {        return new ExactPhraseDocIdSet(context, acceptDocs);    }        static class PostingAndFreq implements Comparable<PostingAndFreq> {        DocsAndPositionsEnum posEnum;        int docFreq;        int position;        boolean useAdvance;        int posFreq = 0;        int pos = -1;        int posTime = 0;                public PostingAndFreq(DocsAndPositionsEnum posEnum, int docFreq, int position, boolean useAdvance) {            this.posEnum = posEnum;            this.docFreq = docFreq;            this.position = position;            this.useAdvance = useAdvance;        }             @Override        public int compareTo(PostingAndFreq other) {            if (docFreq != other.docFreq) {                return docFreq - other.docFreq;            }            if (position 
!= other.position) {                return position - other.position;            }            return 0;        }    }        protected class ExactPhraseDocIdSet extends DocIdSet {        protected final AtomicReaderContext context;        protected final Bits acceptDocs;        protected final PostingAndFreq[] postings;        protected boolean noDocs = false;                public ExactPhraseDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {            this.context = context;            this.acceptDocs = acceptDocs;                        Terms fieldTerms = context.reader().fields().terms(fieldName);            // TermContext states[] = new TermContext[terms.size()];            postings = new PostingAndFreq[terms.size()];                        TermsEnum te = fieldTerms.iterator(null);            for (int i = 0; i < terms.size(); ++i) {                final Term t = terms.get(i);                // states[i] = TermContext.build(context, terms.get(i), true);                // final TermState state = states[i].get(context.ord);                if (!te.seekExact(t.bytes(), true)) {                    noDocs = true;                    return;                }                if (i == 0) {                    postings[i] = new PostingAndFreq(te.docsAndPositions(acceptDocs, null, 0), te.docFreq(), positions.get(i), false);                } else {                    postings[i] = new PostingAndFreq(te.docsAndPositions(acceptDocs, null, 0), te.docFreq(), positions.get(i), te.docFreq() > 5 * postings[0].docFreq);                }            }                        ArrayUtil.mergeSort(postings);            for (int i = 1; i < terms.size(); ++i) {                postings[i].posEnum.nextDoc();            }        }                @Override        public DocIdSetIterator iterator() throws IOException        {            if (noDocs) {                return EMPTY_DOCIDSET.iterator();            } else {                return new 
ExactPhraseDocIdSetIterator(context, acceptDocs);            }        }                protected class ExactPhraseDocIdSetIterator extends DocIdSetIterator {            protected int docID = -1;                        public ExactPhraseDocIdSetIterator(AtomicReaderContext context, Bits acceptDocs) throws IOException {            }                        @Override            public int nextDoc() throws IOException {                while (true) {                    // first (rarest) term                    final int doc = postings[0].posEnum.nextDoc();                    if (doc == DocIdSetIterator.NO_MORE_DOCS) {                        // System.err.println("END");                        return docID = doc;                    }                                        // non-first terms                    int i = 1;                    while (i < postings.length) {                        final PostingAndFreq pf = postings[i];                        int doc2 = pf.posEnum.docID();                        if (pf.useAdvance) {                            if (doc2 < doc) {                                doc2 = pf.posEnum.advance(doc);                            }                        } else {                            int iter = 0;                            while (doc2 < doc) {                                if (++iter == 50) {                                    doc2 = pf.posEnum.advance(doc);                                } else {                                    doc2 = pf.posEnum.nextDoc();                                }                            }                        }                        if (doc2 > doc) {                            break;                        }                        ++i;                    }                                        if (i == postings.length) {                        // System.err.println(doc);                        docID = doc;                        // return docID;                        if (containsPhrase()) {          
                  return docID;                        }                    }                }            }                        @Override            public int advance(int target) throws IOException {                throw new IOException();            }                        private boolean containsPhrase() throws IOException {                int index = -1;                int i = 0;                PostingAndFreq pf;                                // init.                for (i = 0; i < postings.length; ++i) {                    postings[i].posFreq = postings[i].posEnum.freq();                    postings[i].pos = postings[i].posEnum.nextPosition() - postings[i].position;                    postings[i].posTime = 1;                }                                while (true) {                    pf = postings[0];                                        // first term.                    while (pf.pos < index && pf.posTime < pf.posFreq) {                        pf.pos = pf.posEnum.nextPosition() - pf.position;                        ++pf.posTime;                    }                    if (pf.pos >= index) {                        index = pf.pos;                    } else if (pf.posTime == pf.posFreq) {                        return false;                    }                                        // other terms.                    for (i = 1; i < postings.length; ++i) {                        pf = postings[i];                        while (pf.pos < index && pf.posTime < pf.posFreq) {                            pf.pos = pf.posEnum.nextPosition() - pf.position;                            ++pf.posTime;                        }                        if (pf.pos > index) {                            index = pf.pos;                            break;                        }                        if (pf.pos == index) {                            continue;                        }                        if (pf.posTime == pf.posFreq) {
                            return false;                        }                    }                    if (i == postings.length) {                        return true;                    }                }            }            @Override            public int docID()            {                return docID;            }        }    }    }


UT:

import java.io.IOException;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.codecs.Codec;import org.apache.lucene.document.Document;import org.apache.lucene.document.TextField;import org.apache.lucene.document.Field.Store;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.Term;import org.apache.lucene.index.IndexWriterConfig.OpenMode;import org.apache.lucene.search.ConstantScoreQuery;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.store.RAMDirectory;import org.apache.lucene.util.Version;import org.testng.annotations.AfterTest;import org.testng.annotations.BeforeTest;import org.testng.annotations.Test;import com.dp.arts.lucenex.codec.Dp10Codec;public class ExactPhraseFilterTest{    final Directory dir = new RAMDirectory();        @BeforeTest    public void setUp() throws IOException {        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);        iwc.setOpenMode(OpenMode.CREATE);        iwc.setCodec(Codec.forName(Dp10Codec.DP10_CODEC_NAME));                IndexWriter writer = new IndexWriter(dir, iwc);         addDocument(writer, "新疆烧烤");  // 0        addDocument(writer, "啤酒");  // 1        addDocument(writer, "烤烧");  // 2        addDocument(writer, "烧烧烧");  // 3        addDocument(writer, "烤烧中华烧烤"); // 4        writer.close();    }        private void addDocument(IndexWriter writer, String str) throws IOException {        Document doc = new Document();        doc.add(new TextField("searchkeywords", str, Store.YES));        writer.addDocument(doc, new StandardAnalyzer(Version.LUCENE_40));    }    
    @AfterTest    public void tearDown() throws IOException    {        this.dir.close();    }        @Test    public void test1() throws IOException    {        IndexReader reader = DirectoryReader.open(dir);        IndexSearcher searcher = new IndexSearcher(reader);                ExactPhraseFilter pf = new ExactPhraseFilter();        pf.add(new Term("searchkeywords", "烧"));        pf.add(new Term("searchkeywords", "烤"));        Query query = new ConstantScoreQuery(pf);        TopDocs results = searcher.search(query, 20);                assert results.totalHits == 2;        assert results.scoreDocs[0].doc == 0;        assert results.scoreDocs[1].doc == 4;                searcher.getIndexReader().close();    }}


0 0
原创粉丝点击