java读取中文分词工具(三)

来源:互联网 发布:mysql blob 中文乱码 编辑:程序博客网 时间:2024/06/04 23:14




import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;

/**
 * Reads a pre-segmented text file and hands out sliding windows of consecutive words.
 *
 * <p>File format: already-segmented text in which words are separated by whitespace bytes
 * (space, tab, CR, LF). Reading stops at end of file.
 *
 * <p>Suited to files that contain one enormous line: words are buffered a batch at a time
 * (at most {@code senSize} words) instead of a whole line at a time.
 *
 * <p>Words are split on single ASCII whitespace bytes and decoded as UTF-8. This is safe
 * because no byte of a multi-byte UTF-8 sequence is below 0x80.
 *
 * <p>Not thread-safe. Call {@link #close()} when done to release the underlying file.
 */
public class WordReader implements Closeable {

    /** Size in bytes of the chunk read from the file at once. */
    private static final int READ_BUFFER_SIZE = 8192;

    /** Charset used to decode each word. */
    private static final String ENCODING = "utf-8";

    RandomAccessFile raf = null;

    /** Word buffer; windows are served from index {@code senPos} onwards. */
    ArrayList<String> sentence = null;

    /** Number of words the buffer is topped up to on each refill. */
    int senSize = 1000;

    /** Index in {@code sentence} of the first word of the next window. */
    int senPos = 0;

    // Chunked read state, so the file is not hit with one read call per byte.
    private final byte[] readBuffer = new byte[READ_BUFFER_SIZE];
    private int readPos = 0;
    private int readLimit = 0;
    private boolean endOfFile = false;

    /** Scratch space that accumulates the bytes of the word currently being read. */
    private final ByteArrayOutputStream wordBytes = new ByteArrayOutputStream();

    /**
     * Opens the given file for reading.
     *
     * @param fileName path of the segmented text file
     * @throws IOException if the file cannot be opened
     */
    public WordReader(String fileName) throws IOException {
        File file = new File(fileName);
        raf = new RandomAccessFile(file, "r");
        sentence = new ArrayList<String>();
    }

    /**
     * Returns the next window of {@code count} consecutive words and advances the window
     * start by one word. Windows are continuous across internal buffer refills, and the
     * words at the end of the file are included even when the file does not end with
     * whitespace.
     *
     * @param count number of words per window; must be at least 1
     * @return an array of exactly {@code count} words, or {@code null} when fewer than
     *     {@code count} words remain
     * @throws IllegalArgumentException if {@code count} is less than 1
     * @throws IOException if reading the file fails
     */
    public String[] getNextWords(int count) throws IOException {
        if (count < 1) {
            throw new IllegalArgumentException("count must be at least 1: " + count);
        }
        // Written as a subtraction so a very large count cannot overflow the comparison.
        if (sentence.size() - senPos < count) {
            refill(Math.max(senSize, count));
            if (sentence.size() - senPos < count) {
                return null; // not enough words left in the file for a full window
            }
        }
        String[] words = new String[count];
        for (int i = 0; i < count; i++) {
            words[i] = sentence.get(senPos + i);
        }
        senPos++;
        return words;
    }

    /**
     * Tops the word buffer up to {@code senSize} words.
     *
     * @return {@code true} if at least one new word was read from the file
     * @throws IOException if reading the file fails
     */
    private boolean readSentence() throws IOException {
        return refill(senSize);
    }

    /**
     * Drops the words that precede {@code senPos}, then reads words from the file until the
     * buffer holds {@code target} words or the file is exhausted. Unconsumed words are kept
     * so that windows spanning a refill are not lost.
     */
    private boolean refill(int target) throws IOException {
        if (senPos > 0) {
            sentence.subList(0, Math.min(senPos, sentence.size())).clear();
            senPos = 0;
        }
        boolean added = false;
        while (sentence.size() < target) {
            String word = readWord();
            if (word == null) {
                break;
            }
            sentence.add(word);
            added = true;
        }
        return added;
    }

    /**
     * Reads the next whitespace-delimited word, skipping any leading separators.
     *
     * @return the decoded word, or {@code null} if only separators (or nothing) remain
     */
    private String readWord() throws IOException {
        int b = nextByte();
        while (b != -1 && isSeparator(b)) {
            b = nextByte();
        }
        if (b == -1) {
            return null;
        }
        wordBytes.reset();
        do {
            wordBytes.write(b);
            b = nextByte();
        } while (b != -1 && !isSeparator(b));
        // The terminating separator (if any) has been consumed; the next call skips the rest.
        return wordBytes.toString(ENCODING);
    }

    /** Returns the next byte of the file as 0..255, or -1 at end of file. */
    private int nextByte() throws IOException {
        if (readPos >= readLimit) {
            if (endOfFile) {
                return -1;
            }
            int n = raf.read(readBuffer, 0, readBuffer.length);
            readPos = 0;
            if (n <= 0) {
                readLimit = 0;
                endOfFile = true;
                return -1;
            }
            readLimit = n;
        }
        return readBuffer[readPos++] & 0xFF;
    }

    /** Returns whether the byte is one of the word separators: space, LF, CR or tab. */
    private static boolean isSeparator(int b) {
        return b == ' ' || b == '\n' || b == '\r' || b == '\t';
    }

    /**
     * Closes the underlying file.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException {
        raf.close();
    }

    /**
     * Smoke test: opens a segmented text file and loads the first batch of words.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String path = args.length > 0 ? args[0] : "/home/linger/sources/resultbig.txt";
        WordReader wr = new WordReader(path);
        try {
            wr.readSentence();
            // To walk the whole file, call wr.getNextWords(5) in a loop until it returns null.
        } finally {
            wr.close();
        }
    }
}

本文作者:linger

本文链接:http://blog.csdn.net/lingerlanlan/article/details/38337483

1 0
原创粉丝点击