elasticsearch 1.1.0 mmseg 英文数字分词

来源:互联网 发布:白莲花奖 知乎 编辑:程序博客网 时间:2024/03/29 19:45

elasticsearch 1.1.0 对应的 mmseg 插件版本是 1.2.2,该版本没有解决英文字母与数字混合时的分词问题。

比如 user123,分词后仍然是 user123,字母和数字没有被切分开。

 

解决1:

将 mmseg 插件升级到 elasticsearch-analysis-mmseg-1.4.0。

https://github.com/medcl/elasticsearch-analysis-mmseg/commit/61b5e8199425c845a3060fe39f40e59868dd364b 

index:
  analysis:
    tokenizer:
      mmseg_maxword:
        type: mmseg
        seg_type: max_word
      mmseg_complex:
        type: mmseg
        seg_type: complex
    analyzer:
      mmseg_maxword:
        type: custom
        filter:
        - lowercase
        - cut_letter_digit
        tokenizer: mmseg_maxword
      mmseg:
        type: custom
        filter:
        - lowercase
        - cut_letter_digit
        tokenizer: mmseg_maxword
      mmseg_complex:
        type: custom
        filter:
        - lowercase
        tokenizer: mmseg_complex
#index.analysis.analyzer.default.type : "org.elasticsearch.index.analysis.MMsegAnalyzerProvider"
#index.analysis.analyzer.default.type : "ik"
index.analysis.analyzer.default.type : "mmseg"

 

解决2:

         修改 1.2.2 版本 jar 包中的 MMSegAnalyzer,重新编译后替换 jar 包中对应的 class 文件。

         

package com.chenlb.mmseg4j.analysis;

import java.io.File;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;

/**
 * Lucene analyzer backed by mmseg4j that uses max-word segmentation by default
 * (see {@link #newSeg()}).
 *
 * <p>The token stream produced by {@link MMSegTokenizer} is additionally passed
 * through {@link CutLetterDigitFilter}.
 *
 * @see SimpleAnalyzer
 * @see ComplexAnalyzer
 * @see MaxWordAnalyzer
 *
 * @author chenlb
 */
public class MMSegAnalyzer extends Analyzer {

    /** Dictionary handed to every segmenter created by {@link #newSeg()}. */
    protected Dictionary dic;

    /**
     * Creates an analyzer backed by the default dictionary instance.
     *
     * @see Dictionary#getInstance()
     */
    public MMSegAnalyzer() {
        this(Dictionary.getInstance());
    }

    /**
     * Creates an analyzer backed by the dictionary found at the given location.
     *
     * @param path dictionary path
     * @see Dictionary#getInstance(String)
     */
    public MMSegAnalyzer(String path) {
        this(Dictionary.getInstance(path));
    }

    /**
     * Creates an analyzer backed by the dictionary found in the given directory.
     *
     * @param path dictionary directory
     * @see Dictionary#getInstance(File)
     */
    public MMSegAnalyzer(File path) {
        this(Dictionary.getInstance(path));
    }

    /**
     * Creates an analyzer backed by an already constructed dictionary.
     *
     * @param dic dictionary to segment with
     */
    public MMSegAnalyzer(Dictionary dic) {
        super();
        this.dic = dic;
    }

    /**
     * Builds the segmenter used for a new tokenizer.
     *
     * @return a new {@link MaxWordSeg} bound to this analyzer's dictionary
     */
    protected Seg newSeg() {
        return new MaxWordSeg(dic);
    }

    /**
     * @return the dictionary this analyzer segments with
     */
    public Dictionary getDict() {
        return dic;
    }

    /**
     * Builds the analysis chain: an {@link MMSegTokenizer} over {@code reader},
     * wrapped in a {@link CutLetterDigitFilter}.
     *
     * <p>An earlier variant of this method returned the bare tokenizer without
     * the filter.
     * NOTE(review): the filter presumably splits mixed letter/digit tokens such
     * as "user123" - confirm against CutLetterDigitFilter.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    source of the text to tokenize
     * @return components whose source is the tokenizer and whose sink is the filter
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new MMSegTokenizer(newSeg(), reader);
        final CutLetterDigitFilter sink = new CutLetterDigitFilter(source);
        return new TokenStreamComponents(source, sink);
    }
}

 

       

0 0