Elasticsearch源码分析二--调用Lucene查询接口之常用词查询

来源:互联网 发布:淘宝swisse官方旗舰店 编辑:程序博客网 时间:2024/05/17 06:24
  • 简介
  • 查询语法
  • 源码分析

简介

常用词查询是在没有使用停用词的情况下, Elasticsearch为了提高常用词的查询相关性和精确性而提供的一个现代解决方案。
例如,"crime and punishment"可以翻译成3个词查询,每一个都有性能上的成本(词越多,查询性能越低)。但"and"这个词非常常见,对文档得分的影响非常低。
解决办法是常用词查询,将查询分为两组。第一组包含重要的词,出现的频率较低。第二组包含较高频率的、不那么重要的词。先执行第一个查询,Elasticsearch从第一组的所有词中计算分数。这样,通常都很重要的低频词总是被列入考虑范围。然后,Elasticsearch对第二组中的词执行二次查询,但只为与第一个查询中匹配的文档计算得分。这样只计算了相关文档的得分,实现了更高的性能。

查询语法

{
  "query" : {
    "common" : {
      "title" : {
        "query" : "crime and punishment",
        "cutoff_frequency" : 0.001
      }
    }
  }
}

源码分析

/* (1) Elasticsearch side: CommonTermsQueryParser parses the "common" query DSL.
 * It reads the field name, then the query options (boost, analyzer, minimum_should_match
 * for each frequency group, cutoff frequency, coord disabling) with defaults declared
 * as constants below. NOTE: the excerpt below is elided ("....") in the original article;
 * the full option-parsing loop is not shown here. */
'''(1)Elasticsearch code'''public class CommonTermsQueryParser implements QueryParser {    public static final String NAME = "common";    static final float DEFAULT_MAX_TERM_DOC_FREQ = 0.01f;    static final Occur DEFAULT_HIGH_FREQ_OCCUR = Occur.SHOULD;    static final Occur DEFAULT_LOW_FREQ_OCCUR = Occur.SHOULD;    static final boolean DEFAULT_DISABLE_COORDS = true;    @Override    public Query parse(QueryParseContext parseContext) throws IOException, QueryParsingException {        XContentParser parser = parseContext.parser();        XContentParser.Token token = parser.nextToken();        if (token != XContentParser.Token.FIELD_NAME) {            throw new QueryParsingException(parseContext.index(), "[common] query malformed, no field");        }        String fieldName = parser.currentName();        Object value = null;        float boost = 1.0f;        String queryAnalyzer = null;        String lowFreqMinimumShouldMatch = null;        String highFreqMinimumShouldMatch = null;        boolean disableCoords = DEFAULT_DISABLE_COORDS;        Occur highFreqOccur = DEFAULT_HIGH_FREQ_OCCUR;        Occur lowFreqOccur = DEFAULT_LOW_FREQ_OCCUR;        float maxTermFrequency = DEFAULT_MAX_TERM_DOC_FREQ;        token = parser.nextToken();        ....        
/* Construct the Lucene ExtendedCommonTermsQuery:
 *   highFreqOccur    - boolean operator (SHOULD/MUST) applied to the high-frequency term group
 *   lowFreqOccur     - boolean operator (SHOULD/MUST) applied to the low-frequency term group
 *   maxTermFrequency - cutoff separating the groups; terms below it go into the low-frequency group
 *   disableCoords    - whether the coord score factor is disabled
 * (2) Lucene side: CommonTermsQuery.rewrite() splits the added terms into low/high frequency
 * groups by document frequency and rebuilds them as a BooleanQuery. Edge cases visible below:
 * an empty term list rewrites to MatchNoDocsQuery; a single term rewrites to a plain TermQuery. */
'''构造Lucene类CommonTermsQuery的子类对象           参数highFreqCccur:为高频词组构建查询时用到的布尔运算符SHOULD/MUST           参数lowFreqOccur:为低频词组构建查询时用到的布尔运算符SHOULD/MUST            参数maxTermFrequency:用来构建高、低频词组,小于设定值的词将出现在低频词组中           参数disableCoords:是否启用分数因子计算'''        ExtendedCommonTermsQuery query = new ExtendedCommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords);        query.setBoost(boost);        return parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer, lowFreqMinimumShouldMatch, highFreqMinimumShouldMatch);    }'''(2)Lucene code'''public class CommonTermsQuery extends Query {  '''对query进行重写,区分低频词和高频词,并根据Elasticsearch传递的highFreqOccur和lowFreqOccur将高频词和低频词构造成BooleanQuery'''  @Override  public Query rewrite(IndexReader reader) throws IOException {    if (this.terms.isEmpty()) {      return new MatchNoDocsQuery();    } else if (this.terms.size() == 1) {      final Query tq = newTermQuery(this.terms.get(0), null);      tq.setBoost(getBoost());      return tq;    }    final List<LeafReaderContext> leaves = reader.leaves();    final int maxDoc = reader.maxDoc();    final TermContext[] contextArray = new TermContext[terms.size()];    final Term[] queryTerms = this.terms.toArray(new Term[0]);    collectTermContext(reader, leaves, contextArray, queryTerms);    return buildQuery(maxDoc, contextArray, queryTerms);  }  '''(3)Lucene 常用词查询举例'''  public void test() throws IOException {    Directory dir = newDirectory();    MockAnalyzer analyzer = new MockAnalyzer(random());    RandomIndexWriter w = new RandomIndexWriter(random(), dir, analyzer);    String[] docs = new String[] {"this is the end of the world right",        "is this it or maybe not",        "this is the end of the universe as we know it",        "there is the famous restaurant at the end of the universe",};    for (int i = 0; i < docs.length; i++) {      Document doc = new Document();      doc.add(newStringField("id", "" + i, Field.Store.YES));      
/* NOTE(review): the original article's annotation below labels BOTH groups as 高频词
 * (high-frequency). Given the four indexed docs, this/is/end occur in most documents
 * (HIGH frequency) while world/universe/right occur rarely (LOW frequency). Consistent
 * with the intro, the first pass scores the LOW-frequency terms (world/universe/right,
 * matching docs 0, 2, 3); the second pass scores high-frequency terms only for those
 * already-matched docs. The "高频词world/universe/right" wording should read 低频词. */
doc.add(newTextField("field", docs[i], Field.Store.NO));      w.addDocument(doc);    }    IndexReader r = w.getReader();    IndexSearcher s = newSearcher(r);    {      CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,          random().nextBoolean() ? 2.0f : 0.5f);      '''this/is/end -- 高频词          world/universe/right -- 高频词 '''      query.add(new Term("field", "is"));      query.add(new Term("field", "this"));      query.add(new Term("field", "end"));      query.add(new Term("field", "world"));      query.add(new Term("field", "universe"));      query.add(new Term("field", "right"));      '''第一次查询先用高频词world/universe/right查找得到0、2、3;第二次查询只为第一次查询的结果文档0、2、3计算得分'''      TopDocs search = s.search(query, 10);      assertEquals(search.totalHits, 3);      assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));      assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));      assertEquals("3", r.document(search.scoreDocs[2].doc).get("id"));    }    { // only high freq      CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,          random().nextBoolean() ? 2.0f : 0.5f);      query.add(new Term("field", "is"));      query.add(new Term("field", "this"));      query.add(new Term("field", "end"));      TopDocs search = s.search(query, 10);      assertEquals(search.totalHits, 2);      assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));      assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));    }    { // low freq is mandatory      CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.MUST,          random().nextBoolean() ? 
/* Continuation of the test: with lowFreqOccur = Occur.MUST, every low-frequency term
 * becomes mandatory, so "world" restricts the match to doc 0, and
 * restaurant+universe (both low-frequency, both MUST) restrict to doc 3. */
2.0f : 0.5f);      query.add(new Term("field", "is"));      query.add(new Term("field", "this"));      query.add(new Term("field", "end"));      query.add(new Term("field", "world"));      TopDocs search = s.search(query, 10);      assertEquals(search.totalHits, 1);      assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));    }    { // low freq is mandatory      CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.MUST,          random().nextBoolean() ? 2.0f : 0.5f);      query.add(new Term("field", "restaurant"));      query.add(new Term("field", "universe"));      TopDocs search = s.search(query, 10);      assertEquals(search.totalHits, 1);      assertEquals("3", r.document(search.scoreDocs[0].doc).get("id"));    }    IOUtils.close(r, w, dir, analyzer);  }

0 0