Lucene 6.0下使用IK分词器

来源:互联网 发布:做数学题的软件 编辑:程序博客网 时间:2024/05/06 02:48

在Lucene 6.0中使用IK分词器,需要修改IKAnalyzer和IKTokenizer。
使用时先新建一个MyIKTokenizer类,一个MyIkAnalyzer类:

MyIKTokenizer.java

import java.io.IOException;import java.io.Reader;import org.apache.lucene.analysis.Tokenizer;import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;import org.apache.lucene.analysis.tokenattributes.TypeAttribute;import org.wltea.analyzer.core.IKSegmenter;import org.wltea.analyzer.core.Lexeme;public class MyIKTokenizer extends Tokenizer {    // IK分词器实现    private IKSegmenter _IKImplement;    // 词元文本属性    private final CharTermAttribute termAtt;    // 词元位移属性    private final OffsetAttribute offsetAtt;    // 词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)    private final TypeAttribute typeAtt;    // 记录最后一个词元的结束位置    private int endPosition;    public MyIKTokenizer(Reader in) {        this(in, false);    }    public MyIKTokenizer(Reader in, boolean useSmart) {        offsetAtt = addAttribute(OffsetAttribute.class);        termAtt = addAttribute(CharTermAttribute.class);        typeAtt = addAttribute(TypeAttribute.class);        _IKImplement = new IKSegmenter(input, useSmart);    }    @Override    public boolean incrementToken() throws IOException {        // 清除所有的词元属性        clearAttributes();        Lexeme nextLexeme = _IKImplement.next();        if (nextLexeme != null) {            // 将Lexeme转成Attributes            // 设置词元文本            termAtt.append(nextLexeme.getLexemeText());            // 设置词元长度            termAtt.setLength(nextLexeme.getLength());            // 设置词元位移            offsetAtt.setOffset(nextLexeme.getBeginPosition(),                    nextLexeme.getEndPosition());            // 记录分词的最后位置            endPosition = nextLexeme.getEndPosition();            // 记录词元分类            typeAtt.setType(nextLexeme.getLexemeTypeString());            // 返会true告知还有下个词元            return true;        }        // 返会false告知词元输出完毕        return false;    }    public void reset() throws IOException {        super.reset();        _IKImplement.reset(input);    }    @Override    public final 
void end() {        // set final offset        int finalOffset = correctOffset(this.endPosition);        offsetAtt.setOffset(finalOffset, finalOffset);    }}

MyIkAnalyzer.java

package cn.ucas.lucene.ik;

import org.apache.lucene.analysis.Analyzer;

/**
 * Lucene 6.x {@link Analyzer} whose token stream is produced by
 * {@link MyIKTokenizer}.
 */
public class MyIkAnalyzer extends Analyzer {

    /** Segmentation-mode flag handed to every tokenizer this analyzer creates. */
    private final boolean useSmart;

    /** Creates an analyzer with the smart-mode flag off (the tokenizer's default). */
    public MyIkAnalyzer() {
        this(false);
    }

    /**
     * Creates an analyzer.
     *
     * @param useSmart flag passed through to {@link MyIKTokenizer}
     */
    public MyIkAnalyzer(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * Builds the analysis chain for a field.
     *
     * @param fieldName name of the field being analyzed; it is NOT the text to
     *                  tokenize, so no reader is built from it here
     * @return components wrapping a fresh {@link MyIKTokenizer}
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // The tokenizer's Reader argument is unused: Lucene supplies the real
        // input later via setReader(), so nothing is opened or closed here.
        return new TokenStreamComponents(new MyIKTokenizer(null, useSmart));
    }
}

在Lucene中使用IK分词器:

// Instantiate the custom analyzer defined above, held through the Lucene Analyzer base type.
Analyzer myIkAnalyzer=new MyIkAnalyzer();

参考文档:
http://blog.inet198.cn/?upxiaofeng/article/details/51454648

3 0