Elasticsearch源码分析十三--高亮显示highlight

来源：互联网 | 发布：人工智能与知识产权 | 编辑：程序博客网 | 时间：2024/06/03 15:00

简介
查询语法
源码分析

简介

高亮显示是在结果文档中显示查询中的哪个或哪些单词被匹配的过程。
Elasticsearch底层使用Apache Lucene。 Lucene提供了三种类型的高亮实现：
第一种是标准类型（本文例子）；第二种叫FastVectorHighlighter，
它需要词向量和位置才能工作；第三种叫PostingsHighlighter。
Elasticsearch自动选择正确的高亮实现方式：如果字段的配置中，
term_vector属性设成了with_positions_offsets，则将使用FastVectorHighlighter。
使用词向量将导致索引变大，但高亮显示的执行需要更少的时间。此外，
对于存储了大量数据的字段来说，推荐使用FastVectorHighlighter。

查询语法

例如：高亮显示在title字段中匹配的单词，注意highlight部分和query部分位于JSON中的同一层，
也可以看做第一层。在Elasticsearch代码中，位于JSON第一层的query、highlight等叫做Element。

 {    "query" : {       "term" : {            "title" : "crime"        }    },    "highlight" : {        "pre_tags" : [ "<b>" ],        "post_tags" : [ "</b>" ],        "fields" : {            "title" : {}        }    }}

该查询的结果如下，结果中除标准返回信息外，还有一个highlight部分，
该部分使用<b>这个HTML标签来包含高亮部分，高亮标签由pre_tags和post_tags属性指定，
默认使用<em>标签。

{    "took" : 2,    "timed_out" : false,    "_shards" : {    "total" : 5,    "successful" : 5,    "failed" : 0},"hits" : {    "total" : 1,    "max_score" : 0.19178301,    "hits" : [ {        "_index" : "library",        "_type" : "book",        "_id" : "4",        "_score" : 0.19178301,        "_source" : {                 "title": "Crime and Punishment",                "characters": ["Raskolnikov"],                "tags": [],                "copies": 0, "available" : true},                "highlight" : {                    "title" : [ "<b>Crime</b> and Punishment" ]            }} ]    }}

源码分析

'''(1)Elasticsearch code：注册fetchPhase中元素的解析方法'''public class SearchService extends AbstractLifecycleComponent<SearchService> {    private final ImmutableMap<String, SearchParseElement> elementParsers;    public SearchService(Settings settings, ClusterService clusterService, IndicesService indicesService, IndicesLifecycle indicesLifecycle, IndicesWarmer indicesWarmer, ThreadPool threadPool,                         ScriptService scriptService, CacheRecycler cacheRecycler, DfsPhase dfsPhase, QueryPhase queryPhase, FetchPhase fetchPhase) {        super(settings);        this.threadPool = threadPool;        '''省略....'''        '''在此注册所有元素的解析方法'''        Map<String, SearchParseElement> elementParsers = new HashMap<String, SearchParseElement>();        elementParsers.putAll(dfsPhase.parseElements());        elementParsers.putAll(queryPhase.parseElements());        elementParsers.putAll(fetchPhase.parseElements());        elementParsers.put("stats", new StatsGroupsParseElement());        this.elementParsers = ImmutableMap.copyOf(elementParsers);        indicesLifecycle.addListener(indicesLifecycleListener);        this.keepAliveReaper = threadPool.scheduleWithFixedDelay(new Reaper(), keepAliveInterval);        this.indicesWarmer.addListener(new SearchWarmer());    }}'''(2)Elasticsearch code：在FetchPhase中注册highlight的解析方法'''public class FetchPhase implements SearchPhase {    private final FetchSubPhase[] fetchSubPhases;    @Inject    '''HighlightPhase高亮显示'''    public FetchPhase(HighlightPhase highlightPhase, ScriptFieldsFetchSubPhase scriptFieldsPhase, PartialFieldsFetchSubPhase partialFieldsPhase,                      MatchedFiltersFetchSubPhase matchFiltersPhase, ExplainFetchSubPhase explainPhase, VersionFetchSubPhase versionPhase) {        this.fetchSubPhases = new FetchSubPhase[]{scriptFieldsPhase, partialFieldsPhase, matchFiltersPhase, explainPhase, highlightPhase, versionPhase};    }    @Override    public Map<String, ? 
extends SearchParseElement> parseElements() {        ImmutableMap.Builder<String, SearchParseElement> parseElements = ImmutableMap.builder();        parseElements.put("fields", new FieldsParseElement());        for (FetchSubPhase fetchSubPhase : fetchSubPhases) {            parseElements.putAll(fetchSubPhase.parseElements());        }        return parseElements.build();    }}'''(3)Elasticsearch code：在FetchPhase中注册highlight的解析实例HighlighterParseElement'''public class HighlightPhase extends AbstractComponent implements FetchSubPhase {    @Override    public Map<String, ? extends SearchParseElement> parseElements() {        return ImmutableMap.of("highlight", new HighlighterParseElement());    }} '''(4)Elasticsearch code：在FetchPhase中注册highlight的解析实例HighlighterParseElement'''/** * <pre> * highlight : { *  tags_schema : "styled", *  pre_tags : ["tag1", "tag2"], *  post_tags : ["tag1", "tag2"], *  order : "score", *  highlight_filter : true, *  fields : { *      field1 : {  }, *      field2 : { fragment_size : 100, number_of_fragments : 2 }, *      field3 : { number_of_fragments : 5, order : "simple", tags_schema : "styled" }, *      field4 : { number_of_fragments: 0, pre_tags : ["openingTagA", "openingTagB"], post_tags : ["closingTag"] } *  } * } * </pre> */public class HighlighterParseElement implements SearchParseElement {        '''默认高亮显示的HTML标签'''    private static final String[] DEFAULT_PRE_TAGS = new String[]{"<em>"};    private static final String[] DEFAULT_POST_TAGS = new String[]{"</em>"};    private static final String[] STYLED_PRE_TAG = {            "<em class=\"hlt1\">", "<em class=\"hlt2\">", "<em class=\"hlt3\">",            "<em class=\"hlt4\">", "<em class=\"hlt5\">", "<em class=\"hlt6\">",            "<em class=\"hlt7\">", "<em class=\"hlt8\">", "<em class=\"hlt9\">",            "<em class=\"hlt10\">"    };    private static final String[] STYLED_POST_TAGS = {"</em>"};    @Override    public void parse(XContentParser parser, SearchContext context) 
throws Exception {        XContentParser.Token token;        String topLevelFieldName = null;        List<SearchContextHighlight.Field> fields = newArrayList();        String[] globalPreTags = DEFAULT_PRE_TAGS;        String[] globalPostTags = DEFAULT_POST_TAGS;        ......        String globalHighlighterType = null;        String globalFragmenter = null;        Map<String, Object> globalOptions = null;        '''此处的parser是JsonXContentParser实例'''        while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {            if (token == XContentParser.Token.FIELD_NAME) {                topLevelFieldName = parser.currentName();            } else if (token == XContentParser.Token.START_ARRAY) {                if ("pre_tags".equals(topLevelFieldName) || "preTags".equals(topLevelFieldName)) {                    List<String> preTagsList = Lists.newArrayList();                    while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {                        preTagsList.add(parser.text());                    }                    globalPreTags = preTagsList.toArray(new String[preTagsList.size()]);                } else if ("post_tags".equals(topLevelFieldName) || "postTags".equals(topLevelFieldName)) {                    List<String> postTagsList = Lists.newArrayList();                    while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {                        postTagsList.add(parser.text());                    }                    globalPostTags = postTagsList.toArray(new String[postTagsList.size()]);                }            } else if (token.isValue()) {                if ("order".equals(topLevelFieldName)) {                    globalScoreOrdered = "score".equals(parser.text());                } else if ("tags_schema".equals(topLevelFieldName) || "tagsSchema".equals(topLevelFieldName)) {                    String schema = parser.text();                    if ("styled".equals(schema)) {                        
globalPreTags = STYLED_PRE_TAG;                        globalPostTags = STYLED_POST_TAGS;                    }          '''省略.....'''

0 0