lucene-JE中文分词

来源:互联网 发布:僵尸病毒 知乎 编辑:程序博客网 时间:2024/04/29 05:26

1、比较好的JAVA写的JE分词http://jesoft.cn:9080/je-analysis-1.5.3.jar,官网http://www.jesoft.cn/

 

2、提供相关的API

//采用正向最大匹配的中文分词算法,相当于分词粒度等于0
MMAnalyzer analyzer = new MMAnalyzer();

//参数为分词粒度:当字数等于或超过该参数,且能成词,该词就被切分出来
MMAnalyzer analyzer = new MMAnalyzer(int wordLength);

//字符串切分,常用于客户的关键字输入
analyzer.segment(String text, String separator);


词典维护API(静态方法):

//增加一个新词典,采用每行一个词的读取方式(注意:多线程状态下此时的分词效果将不可预料)
MMAnalyzer.addDictionary(Reader reader);

//增加一个新词
MMAnalyzer.addWord(String newWord);

//删除词库中的全部词语(注意:非常危险的操作,在没有加载新的词库前所有的分词都将失效)
MMAnalyzer.clear();

//词库中是否包含该词
MMAnalyzer.contains(String word);

//从词库中移除该词
MMAnalyzer.removeWord(String word);

//当前词库中包含的词语总数
MMAnalyzer.size();

3、使用说明

字符串切分

package demo.analysis;

import java.io.IOException;

import jeasy.analysis.MMAnalyzer;

/**
 * Demo: segment a Chinese sentence with the JE analyzer and print the
 * tokens joined by " | ".
 */
public class Segment {

    public static void main(String[] args) {
        String text = "据路透社报道,印度尼西亚社会事务部一官员星期二(29日)表示,"
            + "日惹市附近当地时间27日晨5时53分发生的里氏6.2级地震已经造成至少5427人死亡,"
            + "20000余人受伤,近20万人无家可归。";

        // Forward maximum-match segmentation with the default granularity.
        MMAnalyzer analyzer = new MMAnalyzer();
        try {
            // segment(text, separator) returns the tokens joined by the separator.
            System.out.println(analyzer.segment(text, " | "));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

生成效果:

据 | 路透社 | 报道 | 印度尼西亚 | 社会 | 事务 | 部 | 官员 | 星期二 | 29日 | 表示 | 日惹 | 市 |
附近 | 当地时间 | 27日 | 晨 | 5时 | 53分 | 发生 | 里氏 | 6.2级 | 地震 | 已经 | 造成 | 至少 |
5427人 | 死亡 | 20000 | 余人 | 受伤 | 近 | 20万人 | 无家可归 |
  Lucene搜索  package demo.analysis;    import jeasy.analysis.MMAnalyzer;    import org.apache.lucene.analysis.Analyzer;  import org.apache.lucene.document.Document;  import org.apache.lucene.document.Field;  import org.apache.lucene.index.IndexWriter;  import org.apache.lucene.queryParser.QueryParser;  import org.apache.lucene.search.Hits;  import org.apache.lucene.search.IndexSearcher;  import org.apache.lucene.search.Query;  import org.apache.lucene.store.Directory;  import org.apache.lucene.store.RAMDirectory;    public class Segment              public static void main(String[] args)                String fieldName = "text";          String text = "据路透社报道,印度尼西亚社会事务部一官员星期二(29日)表示,"              + "日惹市附近当地时间27日晨5时53分发生的里氏6.2级地震已经造成至少5427人死亡,"              + "20000余人受伤,近20万人无家可归。"; //检索内容            //采用正向最大匹配的中文分词算法          Analyzer analyzer = new MMAnalyzer();            Directory directory = new RAMDirectory();          //Directory directory = FSDirectory.getDirectory("/tmp/testindex", true);            try                        IndexWriter iwriter = new IndexWriter(directory, analyzer, true);              iwriter.setMaxFieldLength(25000);              Document doc = new Document();              doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.TOKENIZED));              iwriter.addDocument(doc);              iwriter.close();                            IndexSearcher isearcher = new IndexSearcher(directory);              QueryParser parser = new QueryParser(fieldName, analyzer);              Query query = parser.parse("印度尼西亚 6.2级地震");//检索词              Hits hits = isearcher.search(query);              System.out.println("命中:" + hits.length());                for (int i = 0; i < hits.length(); i++)                                Document hitDoc = hits.doc(i);                  System.out.println("内容:" + hitDoc.get(fieldName));                             isearcher.close();              directory.close();          }           catch (Exception e)       
                 e.printStackTrace();          }                  生成效果:   命中:1 内容:据路透社报道,印度尼西亚社会事务部一官员星期二(29日)表示,日惹市附近当地时间27日晨5时53分发生的
里氏6.2级地震已经造成至少5427人死亡,20000余人受伤,近20万人无家可归。

搜索词加亮

package demo.analysis;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

/**
 * Demo: index one Chinese document with term vectors, search it, and use
 * Lucene's Highlighter to print the best fragments around the query terms.
 */
public class Segment {

    public static void main(String[] args) {
        String fieldName = "text";
        // Content to index and search.
        String text = "据路透社报道,印度尼西亚社会事务部一官员星期二(29日)表示,"
            + "日惹市附近当地时间27日晨5时53分发生的里氏6.2级地震已经造成至少5427人死亡,"
            + "20000余人受伤,近20万人无家可归。";

        // Forward maximum-match Chinese segmentation.
        Analyzer analyzer = new MMAnalyzer();

        Directory directory = new RAMDirectory();
        // Directory directory = FSDirectory.getDirectory("/tmp/testindex", true);

        try {
            IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
            iwriter.setMaxFieldLength(25000);
            Document doc = new Document();
            // Store positions and offsets so the highlighter can rebuild a
            // TokenStream from the term vector instead of re-analyzing.
            doc.add(new Field(fieldName, text, Field.Store.YES,
                    Field.Index.TOKENIZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            iwriter.addDocument(doc);
            iwriter.close();

            IndexSearcher isearcher = new IndexSearcher(directory);
            QueryParser parser = new QueryParser(fieldName, analyzer);
            // Query terms.
            Query query = parser.parse("印度尼西亚 6.2级地震");
            Hits hits = isearcher.search(query);
            System.out.println("命中:" + hits.length());

            Highlighter highlighter = new Highlighter(new QueryScorer(query));
            for (int i = 0; i < hits.length(); i++) {
                text = hits.doc(i).get(fieldName);
                TermPositionVector tpv = (TermPositionVector) IndexReader.open(
                    directory).getTermFreqVector(hits.id(i), fieldName);
                TokenStream tokenStream = TokenSources.getTokenStream(tpv);
                // Up to 3 best fragments, joined with "...".
                String result = highlighter.getBestFragments(tokenStream, text, 3, "...");
                System.out.println("内容:" + result);
            }
            isearcher.close();
            directory.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

原创粉丝点击