Lucene4.10.3自定义分词

来源:互联网 发布:vc网络验证系列开发 编辑:程序博客网 时间:2024/04/28 15:49

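在 Lucene 4.10.3 中自定义分词只需三步:一、自定义 Analyzer(MyAnalyzer);二、自定义同义词 TokenFilter(MySynonymTokenFilter);三、(可选)自定义 Tokenizer(SynonymTokenIzer):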


一、

package analyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.wltea.analyzer.lucene.IKTokenizer;

import java.io.*;

/**
 * Custom analyzer that tokenizes with IK and then injects synonyms.
 *
 * <p>Originally created by wxshi on 15-2-11.
 */
public class MyAnalyzer extends Analyzer {

    /**
     * Builds the analysis chain:
     * <ol>
     *   <li>tokenize the character stream with IK,</li>
     *   <li>post-process the tokens with the synonym filter,</li>
     *   <li>return both ends of the chain to Lucene.</li>
     * </ol>
     *
     * @param s      field name (not used to vary the chain)
     * @param reader character source to tokenize
     * @return the tokenizer/filter pair
     */
    @Override
    protected TokenStreamComponents createComponents(String s, Reader reader) {
        // IK tokenizer in fine-grained (non-smart) mode.
        IKTokenizer source = new IKTokenizer(reader, false);
        // Custom synonym filter; further filters could be chained on top of it here.
        TokenStream filter = new MySynonymTokenFilter(source);
        // Two-argument form: recommended when the chain has filters. For a bare tokenizer
        // (e.g. SynonymTokenIzer) the one-argument TokenStreamComponents constructor suffices.
        return new TokenStreamComponents(source, filter);
    }

    /**
     * Demo entry point: tokenizes a GBK-encoded text file and prints every token with its offsets.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        Analyzer analyzer = new MyAnalyzer();
        Reader reader = null;
        TokenStream tokenStream = null;
        try {
            File file = new File("f:\\lucene\\indexFile\\文件1.txt");
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "GBK"));
            tokenStream = analyzer.tokenStream("content", reader);
            CharTermAttribute termTextAttr = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute termOffsetAttr = tokenStream.getAttribute(OffsetAttribute.class);

            // NOTE(review): tokenStream.reset() is intentionally NOT called here. The
            // MySynonymTokenFilter constructor shown in this article already resets the wrapped
            // tokenizer, and a Lucene 4.x Tokenizer is expected to reject a second reset() without
            // an intervening setReader(). If that constructor-time reset is ever removed, a
            // reset() call must be added right here.

            // Each successful incrementToken() advances the stream; the attribute instances are
            // reused and their contents are overwritten with the data of the next token.
            int i = 0;
            while (tokenStream.incrementToken()) {
                String str = termTextAttr.toString();
                int bos = termOffsetAttr.startOffset();
                int eos = termOffsetAttr.endOffset();
                System.out.printf("pos=%d, [%s:(%d->%d)]\n", i, str, bos, eos);
                ++i;
            }
            // Signal end-of-stream so the chain can perform its end-of-stream operations.
            tokenStream.end();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release resources on every path; the original leaked the stream and the file handle.
            closeQuietly(tokenStream);
            closeQuietly(reader);
            closeQuietly(analyzer);
        }
    }

    /** Closes {@code resource} if it is non-null, reporting (not propagating) any failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}




二、

package analyzer;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

/**
 * Synonym filter: after every token that has synonyms, emits each synonym as an extra token
 * with position increment 0 (i.e. stacked on the same position as the original term).
 *
 * <p>Originally created by wxshi on 15-2-13.
 */
public class MySynonymTokenFilter extends TokenFilter {

    /** Term to synonyms table; built once instead of on every token. */
    private static final Map<String, String[]> SYNONYMS = buildSynonyms();

    /** Term text of the current token. */
    private final CharTermAttribute termAttribute;
    /** Position increment of the current token. */
    private final PositionIncrementAttribute poiAttribute;
    /** Synonyms still to be emitted for the most recent matching token. */
    private final Stack<String> synonymStack = new Stack<String>();
    /** Attribute state captured from the most recent token that had synonyms. */
    private State current;

    /**
     * Wraps {@code stream} with synonym injection.
     *
     * @param stream upstream token stream
     */
    public MySynonymTokenFilter(TokenStream stream) {
        super(stream);
        // NOTE(review): resetting the upstream in a constructor goes against the usual
        // TokenStream workflow, where the consumer calls reset() before the first
        // incrementToken(). It is kept because the demo consumer in this article
        // (MyAnalyzer.main) never calls reset() itself; a consumer that does call reset()
        // would reset the tokenizer twice. Confirm which consumers exist before removing it.
        try {
            stream.reset();
        } catch (Exception ex) {
            // Still best-effort, but say what failed instead of printing an opaque marker.
            System.err.println("MySynonymTokenFilter: failed to reset upstream token stream: " + ex);
        }
        termAttribute = this.addAttribute(CharTermAttribute.class);
        poiAttribute = this.addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Emits the next token: a pending synonym if there is one, otherwise the next upstream token.
     * Declared final so the TokenStream finality assertion holds when assertions are enabled.
     *
     * @return {@code true} if a token was produced, {@code false} at end of stream
     * @throws IOException if the upstream stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!synonymStack.isEmpty()) {
            String synonym = synonymStack.pop();
            // Restore (not store) the attributes of the original token so the synonym
            // inherits everything except the term text and the position increment.
            restoreState(current);
            termAttribute.setEmpty().append(synonym);
            // Increment 0 places the synonym on the same position as the original term.
            poiAttribute.setPositionIncrement(0);
            return true;
        }
        if (!this.input.incrementToken()) {
            return false;
        }
        // If the term has synonyms, remember its attributes for the synonym tokens that follow.
        if (getSynonym(termAttribute.toString())) {
            current = captureState();
        }
        return true;
    }

    /**
     * Clears per-stream state so an instance never carries pending synonyms into a new stream.
     *
     * @throws IOException if resetting the upstream stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        synonymStack.clear();
        current = null;
    }

    /**
     * Queues the synonyms of {@code term}, if any.
     *
     * @param term term text to look up
     * @return {@code true} if at least one lookup entry existed for the term
     */
    private boolean getSynonym(String term) {
        String[] synonyms = SYNONYMS.get(term);
        if (synonyms == null) {
            return false;
        }
        for (String synonym : synonyms) {
            synonymStack.push(synonym);
        }
        return true;
    }

    /** Builds the hard-coded demo synonym table. */
    private static Map<String, String[]> buildSynonyms() {
        Map<String, String[]> table = new HashMap<String, String[]>();
        // Each synonym is its own array element; the original packed them into a single
        // comma-joined string, which was emitted as one bogus term.
        table.put("中华", new String[]{"天朝", "天国"});
        table.put("文学", new String[]{"文化", "文艺"});
        table.put("散文", new String[]{"文章"});
        return table;
    }
}




三、

package analyzer;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Stack;

/**
 * Tokenizer that emits one lower-cased token per input character and injects synonyms
 * while tokenizing.
 *
 * <p>Originally created by wxshi on 15-2-26.
 */
public class SynonymTokenIzer extends Tokenizer {

    /**
     * Term to synonyms table; built once instead of on every token.
     * NOTE(review): tokens are single characters, so the two-character keys can never match
     * with the current tokenization; only the single-character key is reachable.
     */
    private static final Map<String, String[]> SYNONYMS = buildSynonyms();

    /** Term text of the current token. */
    private CharTermAttribute termAttribute = null;
    /** Position increment of the current token. */
    private PositionIncrementAttribute poiAttribute = null;
    /** Character offsets of the current token. */
    private OffsetAttribute offsetAttr;
    /** Attribute state captured from the most recent token that had synonyms. */
    private State current;
    /** Synonyms still to be emitted for the most recent matching token. */
    private Stack<String> synonymStack = null;

    /** Scratch buffer holding the text of the token being built. */
    StringBuilder buffer = new StringBuilder(1024);
    /** Number of characters consumed from the input so far. */
    int position = 0;

    /**
     * Creates a tokenizer over {@code input}.
     *
     * @param input character source
     */
    public SynonymTokenIzer(Reader input) {
        super(input);
        // NOTE(review): resetting in a constructor goes against the usual TokenStream workflow,
        // where the consumer calls reset() before the first incrementToken(). It is kept because
        // the demo consumer in this article never calls reset(); confirm before removing.
        try {
            super.reset();
        } catch (Exception ex) {
            // Previously swallowed silently; still best-effort, but now visible.
            System.err.println("SynonymTokenIzer: failed to reset tokenizer: " + ex);
        }
        offsetAttr = addAttribute(OffsetAttribute.class);
        termAttribute = this.addAttribute(CharTermAttribute.class);
        poiAttribute = this.addAttribute(PositionIncrementAttribute.class);
        synonymStack = new Stack<String>();
    }

    /**
     * Emits the next token: a pending synonym if there is one, otherwise the next input character.
     * Declared final so the TokenStream finality assertion holds when assertions are enabled.
     *
     * @return {@code true} if a token was produced, {@code false} at end of input
     * @throws IOException if reading the input fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        // Drain pending synonyms BEFORE touching the input. The original read a character
        // first, so that character was lost whenever a synonym was emitted, and synonyms
        // pending at end of input were never emitted at all.
        if (!synonymStack.isEmpty()) {
            String synonym = synonymStack.pop();
            restoreState(current);
            termAttribute.setEmpty().append(synonym);
            // Increment 0 places the synonym on the same position as the original term.
            poiAttribute.setPositionIncrement(0);
            return true;
        }

        int a = input.read();
        if (a == -1) {
            return false;
        }

        // Start every new token from a clean slate; without this the position increment
        // stays at 0 for the token that follows a synonym.
        clearAttributes();

        int start = position;
        position++;
        buffer.setLength(0);
        buffer.append((char) a);
        // Case-insensitive terms; Locale.ROOT keeps the result independent of the JVM locale.
        termAttribute.append(buffer.toString().toLowerCase(Locale.ROOT));
        // The token covers exactly the character just read (the start offset was always 0 before).
        offsetAttr.setOffset(correctOffset(start), correctOffset(position));

        // If the term has synonyms, remember its attributes for the synonym tokens that follow.
        if (getSynonym(termAttribute.toString())) {
            current = captureState();
        }
        return true;
    }

    /**
     * Clears per-stream state before a new input is consumed.
     *
     * @throws IOException if the superclass reset fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        position = 0;
        buffer.setLength(0);
        synonymStack.clear();
        current = null;
    }

    /**
     * Sets the final offset to the number of characters consumed.
     *
     * @throws IOException if the superclass end fails
     */
    @Override
    public void end() throws IOException {
        super.end();
        int finalOffset = correctOffset(position);
        offsetAttr.setOffset(finalOffset, finalOffset);
    }

    /**
     * Queues the synonyms of {@code term}, if any.
     *
     * @param term term text to look up
     * @return {@code true} if a lookup entry existed for the term
     */
    private boolean getSynonym(String term) {
        String[] synonyms = SYNONYMS.get(term);
        if (synonyms == null) {
            return false;
        }
        for (String synonym : synonyms) {
            synonymStack.push(synonym);
        }
        return true;
    }

    /** Builds the hard-coded demo synonym table. */
    private static Map<String, String[]> buildSynonyms() {
        Map<String, String[]> table = new HashMap<String, String[]>();
        // Each synonym is its own array element; the original packed them into a single
        // comma-joined string, which was emitted as one bogus term.
        table.put("中华", new String[]{"天朝", "天国"});
        table.put("文学", new String[]{"文化", "文艺"});
        table.put("散", new String[]{"文章"});
        return table;
    }
}






0 0
原创粉丝点击