基于LUCENE的java词频统计

来源:互联网 发布:怎么在mac上卸载app 编辑:程序博客网 时间:2024/05/16 01:37
package demo.analysis;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/**
 * Word-frequency counter for Chinese text, built on Lucene's (2.x-era)
 * {@link TokenStream} API and the MMAnalyzer forward-maximum-match segmenter.
 *
 * <p>Reads a UTF-8 text file, segments it into words, and prints each
 * distinct word with its occurrence count.
 */
public class Segment {

    /**
     * Entry point: reads {@code ./1.txt} and prints per-word frequencies.
     *
     * @param args unused
     * @throws IOException declared for compatibility; read errors are in fact
     *         handled (printed) inside the helper methods
     */
    public static void main(String args[]) throws IOException {
        Segment s = new Segment();
        String text = s.ReadFileByBufferdeReader("./1.txt");
        s.getWordByReader(text);
    }

    /**
     * Reads the whole file as UTF-8 text, preserving line breaks as {@code \n}.
     *
     * @param readFileName path of the file to read
     * @return the file contents, or whatever was read before an error
     *         (empty string if the file could not be opened)
     */
    public String ReadFileByBufferdeReader(String readFileName) {
        // StringBuilder avoids the O(n^2) cost of String += in a loop.
        StringBuilder temp = new StringBuilder();
        File f = new File(readFileName);
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), "utf-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                temp.append(line).append("\n");
            }
        } catch (IOException e) {
            // FileNotFoundException and UnsupportedEncodingException are both
            // IOException subtypes; one catch covers all three original cases
            // with the same best-effort handling as before.
            e.printStackTrace();
        } finally {
            // Always release the file handle — the original leaked it.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // best-effort close; nothing useful to do on failure
                }
            }
        }
        return temp.toString();
    }

    /**
     * Segments {@code text} with the forward-maximum-match algorithm, counts
     * each distinct token, and prints the word:count pairs plus timing info.
     *
     * @param text the text to segment and count
     */
    public void getWordByReader(String text) {
        // Forward maximum-match Chinese segmenter.
        MMAnalyzer analyzer = new MMAnalyzer();
        // User-dictionary entry (a place name) so it is kept as one token.
        analyzer.addWord("任家坪");
        Map<String, Integer> map = new HashMap<String, Integer>();
        try {
            System.out.println("Length = " + text.length());
            Reader r = new StringReader(text);
            TokenStream ts = analyzer.tokenStream(null, r);
            System.out.println("开始分词...\n");
            long begin = System.currentTimeMillis();
            // Pre-2.9 Lucene token iteration: next() returns null at EOF.
            for (Token t = ts.next(); t != null; t = ts.next()) {
                String str = t.termText();
                Integer count = map.get(str);
                // Integer.valueOf reuses the cached boxes instead of
                // allocating via the deprecated new Integer(...) constructor.
                map.put(str, count == null ? Integer.valueOf(1)
                                           : Integer.valueOf(count.intValue() + 1));
            }
            // Stop the clock before printing: the original measured console
            // output time as part of the segmentation time.
            long end = System.currentTimeMillis();
            for (Iterator<Map.Entry<String, Integer>> iter = map.entrySet().iterator();
                    iter.hasNext();) {
                Map.Entry<String, Integer> entry = iter.next();
                System.out.println(entry.getKey() + ":" + entry.getValue());
            }
            System.out.println("分词数量: " + map.size() + " 耗时 : " + (end - begin) + "ms");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the segmentation of {@code text}, one token per line, using
     * MMAnalyzer's convenience {@code segment} API (no counting).
     *
     * @param text the text to segment
     */
    public void getWordBySegment(String text) {
        MMAnalyzer analyzer = new MMAnalyzer();
        try {
            System.out.println(analyzer.segment(text, "\n"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

0 0