Lucene custom analysis: combining IK segmentation with bigram segmentation
Index hit quality was poor: neither IK segmentation nor bigram segmentation alone gave particularly good results. So a new custom analyzer was designed: it first segments the sentence with IK, then applies bigram segmentation to every token of three or more characters.
[Figure: sample output of the custom analyzer]
The implementation idea: first build an IK tokenizer; a first-layer filter then intercepts the tokens produced by IK and feeds every token of length 3 or more into a CJK tokenizer; the resulting tokens are passed to a second-layer filter that removes duplicates, since the previous layer produces many repeated tokens. The code is below.
package com.huang.analyzer;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.wltea.analyzer.lucene.IKTokenizer;

public class MyAnalyzer extends Analyzer {

    private static final String text = "今天是个好日子";

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader in) {
        // First layer: IK tokenizer (useSmart = false, i.e. finest-grained segmentation)
        Tokenizer tokenizer = new IKTokenizer(in, false);
        // Second layer: re-segment tokens of 3+ characters into CJK bigrams
        TokenStream normsFilter = new NormsFilter(tokenizer);
        // Third layer: drop the duplicate tokens produced by the previous layer
        TokenStream distinctFilter = new DistinctFilter(normsFilter);
        return new TokenStreamComponents(tokenizer, distinctFilter);
    }

    public static void main(String[] args) {
        MyAnalyzer myAnalyzer = new MyAnalyzer();
        StringReader reader = new StringReader(text);
        // Run the analyzer and print each token with its offsets
        try {
            TokenStream ts = myAnalyzer.tokenStream("", reader);
            ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                CharTermAttribute cta = ts.getAttribute(CharTermAttribute.class);
                OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
                System.out.println(cta.toString() + "(" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + ")");
            }
            ts.end();
            ts.close();
            reader.close();
            System.out.println();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
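As a usage sketch (not from the original post), the analyzer can be plugged into indexing through IndexWriterConfig. The snippet below assumes a Lucene 4.x setup matching the Reader-based createComponents signature above; the Version.LUCENE_47 constant, the in-memory RAMDirectory, the "content" field name, and the demo class name are illustrative assumptions:

package com.huang.analyzer;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class MyAnalyzerIndexDemo {
    public static void main(String[] args) throws IOException {
        // Index a single document with the custom analyzer (in-memory directory)
        Directory dir = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, new MyAnalyzer());
        IndexWriter writer = new IndexWriter(dir, config);

        Document doc = new Document();
        // "content" is a hypothetical field name used only for illustration
        doc.add(new TextField("content", "今天是个好日子", Field.Store.YES));
        writer.addDocument(doc);
        writer.close();
        dir.close();
    }
}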
package com.huang.analyzer;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * First-layer filter: passes every IK token through unchanged, and additionally
 * re-segments tokens of 3 or more characters into CJK bigrams.
 */
public class NormsFilter extends TokenFilter {

    /** Whether the current input token has already been emitted */
    private boolean hasCurOut;
    /** Buffered copy of the current input token */
    private char[] curTermBuffer;
    /** Length of the current input token */
    private int curTermLength;

    /** Term text attribute */
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    /** Position increment attribute */
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    /** Token type attribute */
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    /** Offset attribute */
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    /** Bigrams produced by the CJK tokenizer for the current token */
    private Collection<String> terms;
    /** Iterator over the bigram result set */
    private Iterator<String> termIte;
    /** Offset of the current token within the original sentence */
    private int firstStartOffset;
    /** True while the bigrams of the current token are being emitted */
    private boolean norms;

    protected NormsFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        while (true) {
            if (this.curTermBuffer == null) {
                // Pull the next token from the underlying stream;
                // return false when the input is exhausted.
                if (!this.input.incrementToken()) {
                    return false;
                }
                this.curTermBuffer = this.termAtt.buffer().clone();
                this.curTermLength = this.termAtt.length();
            }

            // Emit the original input token first
            if (!hasCurOut && this.termIte == null) {
                // Mark it as emitted so the next pass does not emit it again
                this.hasCurOut = true;
                this.termAtt.copyBuffer(this.curTermBuffer, 0, this.curTermLength);
                this.posIncrAtt.setPositionIncrement(this.posIncrAtt.getPositionIncrement());
                this.typeAtt.setType("CN_WORD");
                if (curTermLength <= 2) {
                    // Short tokens need no bigram pass: clear the buffer so the
                    // next call fetches a fresh input token.
                    this.curTermBuffer = null;
                    this.hasCurOut = false;
                }
                return true;
            }

            String text = this.termAtt.toString();

            // norms == false means the bigram pass for this token has not started yet
            if (!norms) {
                if (text.length() >= 3) {
                    // Offset of this token relative to the original sentence
                    this.firstStartOffset = offsetAtt.startOffset();
                    // Enter the bigram pass
                    norms = true;
                    // Collect the bigrams produced by the CJK tokenizer
                    Collection<String> coll = new ArrayList<String>();
                    StringReader reader = new StringReader(text);
                    Tokenizer cjk = new CJKTokenizer(reader);
                    cjk.reset();
                    while (cjk.incrementToken()) {
                        coll.add(cjk.getAttribute(CharTermAttribute.class).toString());
                    }
                    cjk.end();
                    cjk.close();
                    this.terms = coll;
                    this.termIte = this.terms.iterator();
                } else {
                    this.termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
                    this.posIncrAtt.setPositionIncrement(this.posIncrAtt.getPositionIncrement());
                    this.typeAtt.setType("CN_WORD");
                    this.curTermBuffer = null;
                    return true;
                }
            }

            if (norms) {
                // Emit the buffered bigrams, one per call
                while (this.termIte.hasNext()) {
                    String bigram = this.termIte.next();
                    this.termAtt.copyBuffer(bigram.toCharArray(), 0, bigram.length());
                    this.posIncrAtt.setPositionIncrement(this.posIncrAtt.getPositionIncrement());
                    this.typeAtt.setType("CN_WORD");
                    // Each successive bigram starts one character further into the token
                    this.offsetAtt.setOffset(this.firstStartOffset, this.firstStartOffset + bigram.length());
                    this.firstStartOffset++;
                    if (!this.termIte.hasNext()) {
                        // All bigrams emitted: clear state so the next call fetches a new token
                        this.curTermBuffer = null;
                        this.termIte = null;
                        this.hasCurOut = false;
                        this.firstStartOffset = 0;
                        norms = false;
                    }
                    return true;
                }
            }
        }
    }

    @Override
    public void reset() throws IOException {
        // Clear all per-stream state so the filter can be reused across documents
        super.reset();
        this.curTermBuffer = null;
        this.hasCurOut = false;
        this.terms = null;
        this.termIte = null;
        this.firstStartOffset = 0;
        this.norms = false;
    }
}
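A design note on NormsFilter: the bigrams come from CJKTokenizer, which emits overlapping two-character grams (for example, "好日子" yields "好日" and "日子"), and their offsets are approximated by starting at the parent token's startOffset and advancing one character per bigram. The reset() override clears all per-stream state so the same filter instance behaves correctly when the analyzer is reused across documents.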
package com.huang.analyzer;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * Second-layer filter: drops tokens that have already been emitted, since the
 * bigram pass in NormsFilter produces many duplicates.
 */
public class DistinctFilter extends TokenFilter {

    /** Tokens seen so far (a HashSet keeps the contains() check cheap) */
    private final Set<String> terms;

    /** Buffered copy of the current input token */
    private char[] curTermBuffer;
    /** Length of the current input token */
    private int curTermLength;

    /** Term text attribute */
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    /** Position increment attribute */
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    /** Token type attribute */
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    protected DistinctFilter(TokenStream input) {
        super(input);
        this.terms = new HashSet<String>();
    }

    @Override
    public boolean incrementToken() throws IOException {
        while (true) {
            if (this.curTermBuffer == null) {
                // Pull the next token; return false when the input is exhausted
                if (!this.input.incrementToken()) {
                    return false;
                }
                this.curTermBuffer = this.termAtt.buffer().clone();
                this.curTermLength = this.termAtt.length();
            }

            String text = this.termAtt.toString();
            if (terms.contains(text)) {
                // Duplicate token: skip it and fetch the next one
                this.curTermBuffer = null;
                continue;
            } else {
                terms.add(text);
            }

            this.termAtt.copyBuffer(this.curTermBuffer, 0, this.curTermLength);
            this.posIncrAtt.setPositionIncrement(this.posIncrAtt.getPositionIncrement());
            this.typeAtt.setType("CN_WORD");
            this.curTermBuffer = null;
            return true;
        }
    }

    @Override
    public void reset() throws IOException {
        // Clear the seen-token set so each new stream is deduplicated independently
        super.reset();
        this.curTermBuffer = null;
        terms.clear();
    }
}
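To see the deduplication in isolation, here is a minimal sketch (not from the original post) that wraps a plain WhitespaceTokenizer with DistinctFilter. It must live in the com.huang.analyzer package because the filter's constructor is protected; the Version.LUCENE_47 constant, the sample input, and the demo class name are illustrative assumptions:

package com.huang.analyzer;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class DistinctFilterDemo {
    public static void main(String[] args) throws IOException {
        // Tokenize a string containing duplicates, then deduplicate with DistinctFilter
        Tokenizer ws = new WhitespaceTokenizer(Version.LUCENE_47, new StringReader("好日子 今天 好日子 日子"));
        TokenStream ts = new DistinctFilter(ws);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());   // prints 好日子, 今天, 日子 (each once)
        }
        ts.end();
        ts.close();
    }
}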