Lucene 中的四种分词器

来源：互联网　发布：离散数学视频知乎　编辑：程序博客网　时间：2024/06/06 19:48
import java.io.IOException; 
import java.io.StringReader; 
 
import javax.swing.text.AttributeSet.CharacterAttribute; 
 
import org.apache.lucene.analysis.Analyzer; 
import org.apache.lucene.analysis.SimpleAnalyzer; 
import org.apache.lucene.analysis.StopAnalyzer; 
import org.apache.lucene.analysis.TokenStream; 
import org.apache.lucene.analysis.WhitespaceAnalyzer; 
import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 
import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 
import org.apache.lucene.util.Version; 
import org.junit.Test; 
 
 
public class AnalyzerUtil { 
    public static void displayToken(String txt,Analyzer a){ 
        TokenStream ts = a.tokenStream("content", new StringReader(txt)); 
        PositionIncrementAttribute pia = ts.addAttribute(PositionIncrementAttribute.class); 
        OffsetAttribute oa = ts.addAttribute(OffsetAttribute.class); 
        CharTermAttribute ca = ts.addAttribute(CharTermAttribute.class); 
        TypeAttribute ta = ts.addAttribute(TypeAttribute.class); 
        try { 
            while(ts.incrementToken()){ 
                System.out.println(ca.toString()+"  positionincrement:"+pia.getPositionIncrement()+"  "+"offset:"+oa.startOffset()+"-"+oa.endOffset()+"   type:"+ta.type()); 
            } 
        } catch (IOException e) { 
            // TODO Auto-generated catch block 
            e.printStackTrace(); 
        } 
        System.out.println("-----------"); 
    } 
    @Test 
    public void test(){ 
        String txt = "how are you,thank you!"; 
        //构建四种分词器 
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35); 
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_35); 
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35); 
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35); 
        AnalyzerUtil.displayToken(txt, a1); 
        AnalyzerUtil.displayToken(txt, a2); 
        AnalyzerUtil.displayToken(txt, a3); 
        AnalyzerUtil.displayToken(txt, a4); 
    } 
}
本文出自 “Kenan_ITBlog” 博客，请务必保留此出处：http://soukenan.blog.51cto.com/5130995/1122415