Elasticsearch SynonymTokenFilterFactory 源码

来源:互联网 发布:java.util jar包下载 编辑:程序博客网 时间:2024/06/09 22:21
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettingsService;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;

import java.io.IOException;
import java.io.Reader;
import java.util.List;
import java.util.Map;

/**
 * Token filter factory that wraps Lucene's {@link SynonymFilter}.
 * <p>
 * The synonym rules are read once, in the constructor, from either the inline
 * {@code synonyms} setting or the file named by {@code synonyms_path}, and are
 * compiled into a {@link SynonymMap}. Further settings read here:
 * <ul>
 *   <li>{@code ignore_case} (default {@code false}) - lower-cases rule tokens while
 *       parsing and is passed on to the {@link SynonymFilter};</li>
 *   <li>{@code expand} (default {@code true}) - handed to the rule parser;</li>
 *   <li>{@code tokenizer} (default {@code "whitespace"}) - tokenizer used to analyze
 *       the rules;</li>
 *   <li>{@code format} - {@code "wordnet"} (case-insensitive) selects
 *       {@link WordnetSynonymParser}; anything else selects {@link SolrSynonymParser}.</li>
 * </ul>
 */
@AnalysisSettingsRequired
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {

    private final SynonymMap synonymMap;
    private final boolean ignoreCase;

    /**
     * Reads the synonym rules and builds the synonym map.
     *
     * @throws IllegalArgumentException if neither {@code synonyms} nor {@code synonyms_path}
     *         is configured, if the configured tokenizer cannot be found, or if the rules
     *         cannot be parsed
     */
    @Inject
    public SynonymTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env,
                                     IndicesAnalysisService indicesAnalysisService,
                                     Map<String, TokenizerFactoryFactory> tokenizerFactories,
                                     @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettingsService.getSettings(), name, settings);

        // Reader over the synonym rules.
        Reader rulesReader = null;
        if (settings.getAsArray("synonyms", null) != null) {
            // Rules come from the `synonyms` setting: join them into one line-separated document.
            List<String> rules = Analysis.getWordList(env, settings, "synonyms");
            StringBuilder sb = new StringBuilder();
            for (String line : rules) {
                sb.append(line).append(System.getProperty("line.separator"));
            }
            rulesReader = new FastStringReader(sb.toString());
        } else if (settings.get("synonyms_path") != null) {
            // Rules live in the file named by `synonyms_path`.
            rulesReader = Analysis.getReaderFromFile(env, settings, "synonyms_path");
        } else {
            throw new IllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
        }

        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
        boolean expand = settings.getAsBoolean("expand", true);

        final TokenizerFactory tokenizerFactory;
        try {
            tokenizerFactory = resolveTokenizerFactory(indexSettingsService, indicesAnalysisService, tokenizerFactories, settings);
        } catch (RuntimeException e) {
            // No parser has taken ownership of the reader yet, so it must be closed here
            // or a file-backed reader would leak.
            closeAfterFailure(rulesReader, e);
            throw e;
        }

        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create();
                TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
                return new TokenStreamComponents(tokenizer, stream);
            }
        };

        try {
            SynonymMap.Builder parser = null;
            if ("wordnet".equalsIgnoreCase(settings.get("format"))) {
                parser = new WordnetSynonymParser(true, expand, analyzer);
                // Parse the synonym rule stream.
                ((WordnetSynonymParser) parser).parse(rulesReader);
            } else {
                parser = new SolrSynonymParser(true, expand, analyzer);
                ((SolrSynonymParser) parser).parse(rulesReader);
            }
            synonymMap = parser.build();
        } catch (Exception e) {
            throw new IllegalArgumentException("failed to build synonyms", e);
        }
    }

    /**
     * Looks up the tokenizer named by the {@code tokenizer} setting (default
     * {@code "whitespace"}), first among the index-level factories and then among the
     * node-level ones, and instantiates it with the index settings overlaid by the
     * filter settings.
     *
     * @throws IllegalArgumentException if no tokenizer with that name is registered
     */
    private static TokenizerFactory resolveTokenizerFactory(IndexSettingsService indexSettingsService,
                                                            IndicesAnalysisService indicesAnalysisService,
                                                            Map<String, TokenizerFactoryFactory> tokenizerFactories,
                                                            Settings settings) {
        String tokenizerName = settings.get("tokenizer", "whitespace");
        TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(tokenizerName);
        if (tokenizerFactoryFactory == null) {
            tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName);
        }
        if (tokenizerFactoryFactory == null) {
            throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
        }
        return tokenizerFactoryFactory.create(tokenizerName,
                Settings.builder().put(indexSettingsService.getSettings()).put(settings).build());
    }

    /**
     * Closes {@code reader} while {@code failure} is propagating; a close error is
     * attached to {@code failure} as suppressed instead of replacing it.
     */
    private static void closeAfterFailure(Reader reader, RuntimeException failure) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException closeFailure) {
            failure.addSuppressed(closeFailure);
        }
    }

    /**
     * Wraps {@code tokenStream} in Lucene's {@link SynonymFilter}, or returns it
     * unchanged when the synonym map is empty.
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        // fst is null means no synonyms
        return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
    }
}

 

  

    /**
     * Opens a UTF-8 reader over the file whose path is stored under the given setting key.
     * The path is resolved against the environment's config directory.
     *
     * @param env           environment supplying the config directory
     * @param settings      settings holding the file path
     * @param settingPrefix full key of the setting that holds the path (it is read as-is;
     *                      no suffix is appended)
     * @return a reader over the file, or <code>null</code> if the setting is not set
     * @throws IllegalArgumentException
     *          If the Reader can not be instantiated; the underlying
     *          {@link IOException} is attached as the cause.
     */
    public static Reader getReaderFromFile(Environment env, Settings settings, String settingPrefix) {
        String filePath = settings.get(settingPrefix, null);
        if (filePath == null) {
            return null;
        }
        final Path path = env.configFile().resolve(filePath);
        try {
            return FileSystemUtils.newBufferedReader(path.toUri().toURL(), Charsets.UTF_8);
        } catch (IOException ioe) {
            // settingPrefix is already the complete setting key, so it is reported verbatim
            // (appending "_path" produced names like "synonyms_path_path").
            String message = String.format(Locale.ROOT, "IOException while reading %s: %s", settingPrefix, ioe.getMessage());
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new IllegalArgumentException(message, ioe);
        }
    }

   

package org.apache.lucene.analysis.synonym;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;
import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

/**
 * Parser for wordnet prolog format
 * <p>
 * See http://wordnet.princeton.edu/man/prologdb.5WN.html for a description of the format.
 * <p>
 * This is a {@link SynonymMap.Parser} subclass: consecutive lines that share the same
 * synset id are collected into one group, and each group is registered as synonyms.
 * @lucene.experimental
 */
// TODO: allow you to specify syntactic categories (e.g. just nouns, etc)
public class WordnetSynonymParser extends SynonymMap.Parser {
  /** Start (inclusive) of the synset id within a line. */
  private static final int SYNSET_ID_START = 2;
  /** End (exclusive) of the synset id within a line. */
  private static final int SYNSET_ID_END = 11;

  private final boolean expand;

  /**
   * @param dedup    passed through to {@link SynonymMap.Parser}
   * @param expand   if true every word of a synset is mapped to every word of that synset;
   *                 otherwise every word is mapped to the first word of the synset only
   * @param analyzer passed through to {@link SynonymMap.Parser}
   */
  public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
    super(dedup, analyzer);
    this.expand = expand;
  }

  /**
   * Reads the rules line by line and closes {@code in} when done, whether or not
   * parsing succeeded.
   *
   * @throws IOException    if reading or analyzing the input fails
   * @throws ParseException if a rule is invalid, including a line that is too short to
   *                        hold a synset id or that has no quoted word; the message
   *                        carries the line number
   */
  @Override
  public void parse(Reader in) throws IOException, ParseException {
    // Parse line by line; the LineNumberReader supplies the line number for error reports.
    LineNumberReader br = new LineNumberReader(in);
    try {
      String line = null;
      String lastSynSetID = "";
      CharsRef[] synset = new CharsRef[8];
      int synsetSize = 0;

      while ((line = br.readLine()) != null) {
        if (line.length() < SYNSET_ID_END) {
          // Without this check substring() would throw an unchecked
          // StringIndexOutOfBoundsException that bypasses the ParseException below.
          throw new IllegalArgumentException("line is too short to contain a synset id: [" + line + "]");
        }
        String synSetID = line.substring(SYNSET_ID_START, SYNSET_ID_END);

        // A new id means the previous group is complete: flush it and start over.
        if (!synSetID.equals(lastSynSetID)) {
          addInternal(synset, synsetSize);
          synsetSize = 0;
        }

        if (synset.length <= synsetSize+1) {
          synset = Arrays.copyOf(synset, synset.length * 2);
        }

        synset[synsetSize] = parseSynonym(line, new CharsRefBuilder());
        synsetSize++;
        lastSynSetID = synSetID;
      }

      // final synset in the file
      addInternal(synset, synsetSize);
    } catch (IllegalArgumentException e) {
      ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
      ex.initCause(e);
      throw ex;
    } finally {
      br.close();
    }
  }

  /**
   * Extracts the text between the first and the last single quote of {@code line},
   * turns doubled quotes into single ones and runs the result through {@code analyze}.
   *
   * @throws IllegalArgumentException if the line does not contain a pair of single quotes
   */
  private CharsRef parseSynonym(String line, CharsRefBuilder reuse) throws IOException {
    if (reuse == null) {
      reuse = new CharsRefBuilder();
    }

    int start = line.indexOf('\'')+1;
    int end = line.lastIndexOf('\'');
    if (end < start) {
      // Zero or one quote on the line: there is no quoted word to extract.
      throw new IllegalArgumentException("line does not contain a quoted word: [" + line + "]");
    }

    String text = line.substring(start, end).replace("''", "'");
    return analyze(text, reuse);
  }

  /** Registers the first {@code size} entries of {@code synset} as one synonym group. */
  private void addInternal(CharsRef[] synset, int size) {
    if (size <= 1) {
      return; // nothing to do
    }

    if (expand) {
      for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
          add(synset[i], synset[j], false);
        }
      }
    } else {
      for (int i = 0; i < size; i++) {
        add(synset[i], synset[0], false);
      }
    }
  }
}

 

0 0
原创粉丝点击