lucene 自定义分词器小程序

来源:互联网 发布:license破解软件 编辑:程序博客网 时间:2024/05/21 17:12
代码如下:自定义分词器类(MyStopAnalyzer)、测试类(TestAnalyzer)和工具类(AnalyzerUtils)
package LuceneUtil;

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

/**
 * Custom stop-word analyzer: splits text at non-letter characters, lower-cases
 * every token and then drops any token contained in the stop set.
 *
 * <p>The class is {@code final} because Lucene 3.x asserts in the
 * {@code Analyzer} constructor that analyzer implementations (or at least
 * their {@code tokenStream} methods) are final; a non-final subclass fails
 * that assertion when the JVM runs with {@code -ea}.
 */
public final class MyStopAnalyzer extends Analyzer {

    /** Lucene compatibility version used for every component of the chain. */
    private static final Version MATCH_VERSION = Version.LUCENE_35;

    /** Words removed from the token stream; never modified after construction. */
    private final Set<?> stops;

    /**
     * Creates an analyzer that filters the given words in addition to
     * Lucene's default English stop words.
     *
     * @param sws extra stop words, matched case-insensitively; {@code null}
     *            is treated as "no extra words"
     */
    public MyStopAnalyzer(String[] sws) {
        String[] extraWords = (sws == null) ? new String[0] : sws;
        // makeStopSet copies the array into a fresh, mutable set, so the
        // shared (unmodifiable) English default set is never touched.
        Set<Object> merged = StopFilter.makeStopSet(MATCH_VERSION, extraWords, true);
        // Add the built-in English stop words to the custom ones.
        merged.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        stops = merged;
    }

    /** Creates an analyzer that filters only Lucene's default English stop words. */
    public MyStopAnalyzer() {
        stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }

    /**
     * Builds the analysis chain LetterTokenizer -> LowerCaseFilter -> StopFilter.
     *
     * @param fieldName name of the field being analyzed; not used by this analyzer
     * @param reader    source of the text to tokenize
     * @return a token stream with the stop words removed
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream letters = new LetterTokenizer(MATCH_VERSION, reader);
        TokenStream lowerCased = new LowerCaseFilter(MATCH_VERSION, letters);
        return new StopFilter(MATCH_VERSION, lowerCased, stops);
    }
}

 
package LuceneTest;import java.io.BufferedWriter;import java.io.FileWriter;import java.io.IOException;import java.util.ArrayList;import java.util.Iterator;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.StopAnalyzer;import org.apache.lucene.util.Version;import org.wltea.analyzer.lucene.IKAnalyzer;import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;import LuceneUtil.AnalyzerUtils;import LuceneUtil.MyStopAnalyzer;public class TestAnalyzer {static ArrayList<String> list=null;public static void main (String [] args) throws IOException{//addNewWord( "烟台大学 ");//test();test01();}public static void test(){Analyzer a1=new MMSegAnalyzer();String txt="我是一名大学生,我来自菏泽,我现在烟台大学。";AnalyzerUtils.displayToken(txt,a1); }public static void test01(){//使用自定义的过滤分词器//这个语句 可以吧 “you“,”meet”,和“毛泽东”  给和谐掉Analyzer a2=new MyStopAnalyzer(new String [] {"you","meet","毛泽东"});//系统自带的StopAnalyzerAnalyzer a3=new StopAnalyzer(Version.LUCENE_35);String txt=" i say :how are You,nice to meet you. 毛泽东";AnalyzerUtils.displayToken(txt,a2); AnalyzerUtils.displayToken(txt,a3); }
package LuceneUtil;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
// NOTE(review): this import is unused in this class, and the org.omg.CORBA
// packages were removed from the JDK in Java 11 - it can likely be deleted.
import org.omg.CORBA.portable.Streamable;

/** Helper for inspecting what an analyzer produces. */
public class AnalyzerUtils {

    /**
     * Tokenizes {@code str} with the given analyzer and prints every token to
     * standard output as {@code [token]}, all on one line, followed by a
     * line break.
     *
     * <p>Follows the TokenStream consumer workflow: reset, incrementToken
     * until exhausted, end, close. An {@link IOException} raised while
     * reading is reported via its stack trace and not rethrown.
     *
     * @param str text to analyze
     * @param a   analyzer used to build the token stream
     */
    public static void displayToken(String str, Analyzer a) {
        try {
            TokenStream ts = a.tokenStream("cotents", new StringReader(str));
            try {
                // The attribute instance is shared with the stream and is
                // updated in place on every incrementToken() call.
                CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    System.out.print("[" + cta + "]");
                }
                ts.end();
                System.out.println();
            } finally {
                // Always release the stream (and its underlying reader).
                ts.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

/*
 * Disabled helper that belongs to TestAnalyzer, kept for reference only.
 * It writes the given word as one line into the MMSeg custom dictionary
 * file and prints a confirmation message.
 * NOTE(review): FileWriter is opened without the append flag, so enabling
 * this as-is would overwrite the dictionary file instead of extending it.
 *
 * public static void addNewWord(String newWord) throws IOException {
 *     BufferedWriter bw = new BufferedWriter(new FileWriter("G:\\mmseg\\data\\words-my.dic"));
 *     ArrayList<String> list = new ArrayList<String>();
 *     list.add(newWord);
 *     Iterator<String> iterator = list.iterator();
 *     while (iterator.hasNext()) {
 *         bw.write(iterator.next());
 *         bw.flush();
 *         bw.newLine();
 *     }
 *     bw.close();
 *     System.out.println("添加成功");
 * }
 */
}

测试结果如下:

可见,我想屏蔽(“和谐”)掉的那几个词已经被过滤掉了。

第一行是使用自定义分词器 MyStopAnalyzer 过滤后的结果;

第二行是使用系统自带的 StopAnalyzer 的结果,这几个词没有被屏蔽。
[i][say][how][nice]
[i][say][how][you][nice][meet][you][毛泽东]

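不足:目前还不能单独屏蔽一句话中的某个汉语词语。LetterTokenizer 只在非字母字符处切分,而汉字被视为字母、词与词之间又没有空格,所以连续的一段汉字会被当作一个完整的词元;只有当这一整段汉字恰好与停用词完全相同时(如上例中单独出现的“毛泽东”)才会被过滤。英文单词由空格和标点分隔,因此可以逐词屏蔽。若要按词屏蔽中文,需要改用中文分词器(如 MMSeg、IK)来切分。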