Lucene4.10.3自定义分词
来源:互联网 发布:vc网络验证系列开发 编辑:程序博客网 时间:2024/04/28 15:49
Lucene4.10.3自定义分词只需三步:
一、自定义分析器(MyAnalyzer):用 IK 分词,再交给同义词过滤器处理
package analyzer;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;import org.apache.lucene.document.Document;import org.wltea.analyzer.lucene.IKTokenizer;import java.io.*;/** * 自定义分词器 * Created with IntelliJ IDEA. * User: wxshi * Date: 15-2-11 * Time: 下午3:56 * To change this template use File | Settings | File Templates. */public class MyAnalyzer extends Analyzer{ /** * 自定义分词过程: * 1.对流进行分词 * 2.对分词好的进行过滤处理 * 3.返回结果 * */ protected TokenStreamComponents createComponents(String s, Reader reader) { IKTokenizer source = new IKTokenizer(reader,false); //使用IK进行分词 ,细粒度非智能 TokenStream filter = new MySynonymTokenFilter(source); //使用自己的同义词过滤器进行过滤 //filter = new BarFilter(filter); //此处可以继续过滤 return new TokenStreamComponents(source, filter); //返回结果 ,第一种构造 ,复杂时推荐使用 // return new TokenStreamComponents(new SynonymTokenIzer(reader)); //返回结果,第二种构造,简单时推荐使用 } public static void main(String args[]){ try{ File file = new File("f:\\lucene\\indexFile\\文件1.txt"); Document doc = new Document(); FileInputStream fis = new FileInputStream(file); Reader reader = new BufferedReader(new InputStreamReader(fis,"GBK")); Analyzer analyzer = new MyAnalyzer(); TokenStream tokenStream = analyzer.tokenStream("content",reader); //ik分词流,不采用智能切分 CharTermAttribute termTextAttr = tokenStream.getAttribute(CharTermAttribute.class); OffsetAttribute termOffsetAttr = tokenStream.getAttribute(OffsetAttribute.class); //PositionIncrementAttribute termPosAttr = stream.getAttribute(PositionIncrementAttribute.class); /** *当incrementToken返回true时,其中Token的属性信息会将内部状态修改为下个词汇单元。 *lucene内建Attribute接口都是可读写的,TokenStream 在遍历Token流时, *会调用Attribute接口的set方法,修改属性信息。 **/ int i = 0; // 遍历 while (tokenStream.incrementToken()) { // token流指针往后移 String str = termTextAttr.toString(); int bos = termOffsetAttr.startOffset(); int eos = termOffsetAttr.endOffset(); 
System.out.printf("pos=%d, [%s:(%d->%d)]\n", i, str, bos, eos); ++i; } }catch (Exception e){ e.printStackTrace(); } }}
二、同义词过滤器(MySynonymTokenFilter):在原词的同一位置插入同义词
package analyzer;import org.apache.lucene.analysis.TokenFilter;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;import java.io.IOException;import java.util.HashMap;import java.util.Map;import java.util.Stack;/** * 同义词过滤器 * Created with IntelliJ IDEA. * User: wxshi * Date: 15-2-13 * Time: 上午10:14 * To change this template use File | Settings | File Templates. */public class MySynonymTokenFilter extends TokenFilter { private CharTermAttribute termAttribute = null; //词内容 private PositionIncrementAttribute poiAttribute = null; //位置属性 private State current; //状态 private Stack<String> synonymStack = null; //同义词栈 public MySynonymTokenFilter(TokenStream stream){ super(stream); try { stream.reset(); } catch (Exception ex) { System.out.println("@@@@@@@@@@@@@@@@@@"); } termAttribute = this.addAttribute(CharTermAttribute.class); //获取词信息 poiAttribute = this.addAttribute(PositionIncrementAttribute.class); //获取位置属性 synonymStack = new Stack<String>(); } //该方法实现过滤过程 @Override public boolean incrementToken() throws IOException { if(synonymStack.size()>0){ String synonym = synonymStack.pop(); restoreState(current); //存储状态 termAttribute.setEmpty(); //此位置清空 termAttribute.append(synonym); //此位置插入同义词 poiAttribute.setPositionIncrement(0); //位置设置为0 ,表示同义词 return true; } if(!this.input.incrementToken()){ return false; } //如果改词中有同义词,捕获当前状态 if(getSynonym(termAttribute.toString())){ current = captureState(); } return true; //To change body of implemented methods use File | Settings | File Templates. 
} //获取同义词 private boolean getSynonym(String term){ Map<String,String[]> synonymMap = new HashMap<String, String[]>(); synonymMap.put("中华",new String[]{"天朝,天国"}); synonymMap.put("文学",new String[]{"文化,文艺"}); synonymMap.put("散文",new String[]{"文章"}); String synonyms[] = synonymMap.get(term); if(synonyms!=null){ for(String synonym:synonyms) { synonymStack.push(synonym); } return true; } return false; }}
三、在分词阶段直接加入同义词的分词器(SynonymTokenIzer):逐字符切分,可代替"分词器 + 过滤器"的组合
package analyzer;import org.apache.lucene.analysis.Tokenizer;import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;import java.io.IOException;import java.io.Reader;import java.util.HashMap;import java.util.Map;import java.util.Stack;/** * 自定义分词的时候将同义词加入 * Created with IntelliJ IDEA. * User: wxshi * Date: 15-2-26 * Time: 上午9:30 * To change this template use File | Settings | File Templates. */public class SynonymTokenIzer extends Tokenizer { private CharTermAttribute termAttribute = null; //词内容 private PositionIncrementAttribute poiAttribute = null; //位置属性 private OffsetAttribute offsetAttr; // 词元位移属性 private State current; //状态 private Stack<String> synonymStack = null; //同义词栈 // 是否结束 StringBuilder buffer = new StringBuilder(1024); int position = 0; // 构造 public SynonymTokenIzer(Reader input) { super(input); try { super.reset(); } catch (Exception ex) { } offsetAttr = addAttribute(OffsetAttribute.class); termAttribute = this.addAttribute(CharTermAttribute.class); //获取词信息 poiAttribute = this.addAttribute(PositionIncrementAttribute.class); //获取位置属性 synonymStack = new Stack<String>(); } @Override public boolean incrementToken() throws IOException { if (buffer.length() > 0) buffer.delete(0, buffer.length()); int a=-1; if(-1 != (a = input.read())){ position++; // 设置词元位移 buffer.append((char) a); termAttribute.setEmpty().append(buffer.toString().toLowerCase()); // 忽略大小写 offsetAttr.setOffset(0, position); if(synonymStack.size()>0){ String synonym = synonymStack.pop(); restoreState(current); //存储状态 termAttribute.setEmpty(); //此位置清空 termAttribute.append(synonym); //此位置插入同义词 poiAttribute.setPositionIncrement(0); //位置设置为0 ,表示同义词 return true; } //如果改词中有同义词,捕获当前状态 if(getSynonym(termAttribute.toString())){ current = captureState(); } }else{ return false; } return true; } //获取同义词 private boolean getSynonym(String term){ 
Map<String,String[]> synonymMap = new HashMap<String, String[]>(); synonymMap.put("中华",new String[]{"天朝,天国"}); synonymMap.put("文学",new String[]{"文化,文艺"}); synonymMap.put("散",new String[]{"文章"}); String synonyms[] = synonymMap.get(term); if(synonyms!=null){ for(String synonym:synonyms) { synonymStack.push(synonym); } return true; } return false; }}
0 0
- Lucene4.10.3自定义分词
- lucene4.7 分词器 自定义分词器
- lucene4.7 分词器(三) 之自定义分词器
- Lucene4.10.3自定义过滤器
- Lucene4.10.3索引,使用iK分词
- lucene4.3 自定义排序
- lucene4.5.0----自定义filter
- Lucene4.3开发之中文分词器
- lucene4.7 分词器(三)
- lucene4.7 分词器(三)
- Lucene4.4.0几种分词方法
- lucene4.10.3入门
- lucene4.10.3入门教程
- lucene4.10.3入门教程
- lucene4
- lucene4.0结合IK Analyzer分词器的简单示例
- ICTCLAS分词器与Lucene4.9的结合
- ICTCLAS分词器与Lucene4.9的结合
- [置顶] mongodb Replica Sets +Sharding高可用集群搭建
- 致“想创业”和“正在创业”的人们~
- 黑马程序员---【C语言】03函数
- 【POJ 2970】The lazy programmer(优先队列+贪心)
- Linux查看CPU信息[//proc/loadavg]
- Lucene4.10.3自定义分词
- Linux下安装rar
- Java 枚举7种常见用法
- MAC下MyEclipse连接到MySql数据库
- Android平台调用WebService详解
- 初窥 quick-cocos2d-x
- Linux 下切换JDK的脚本
- 浅析深究什么是SOA?
- Java-Collections Framework学习与总结-HashMap