一种中文文本的快速分词方法(二)

来源:互联网 发布:淘宝宝贝怎么发布 编辑:程序博客网 时间:2024/04/30 07:59
package org.zhukovasky.chineseSeg;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Reader;

import org.zhukovasky.HashBinaryClass.HashBinaryContainer;
import org.zhukovasky.HashBinaryClass.Maps;
import org.zhukovasky.fileutil.WordCount;
import org.zhukovasky.fileutil.WordDictUtil;
import org.zhukovasky.invertedindex.MapWords;

/**
 * Word segmentation utility for Chinese text (input files are expected to be
 * UTF-8 encoded and already pre-processed). Segmented words are accumulated
 * into a {@link MapWords} inverted index and serialized to disk.
 *
 * <p>The matching strategy (as in the original implementation) is a
 * forward-maximum-matching pass driven by a two-level dictionary: the first
 * character selects a {@link HashBinaryContainer}, the second character selects
 * a candidate array, and only the longest candidate tail is tested before
 * falling back to a two-character word or a single character.
 *
 * @author zhukovasky
 * @version 1.0
 * @since 2013.12
 * @email zhukovasky@163.com
 */
public class chineseSeg {

    /**
     * Maximum lookahead window (in chars) used by {@link #FileSeg}.
     * Driven by the longest entry in the dictionary.
     */
    public final static int MAXLENGTH = 10;

    /**
     * Segments the (single) first line of one pre-processed text file and
     * writes the resulting inverted index to {@code invertedIndex}.
     *
     * <p>NOTE(review): like the original code, only the FIRST line of the
     * input file is read — confirm that pre-processing collapses each document
     * onto one line.
     *
     * @param afterprocess  pre-processed input text file (UTF-8 assumed)
     * @param invertedIndex destination file for the serialized {@link MapWords}
     * @param dict          dictionary file loaded via {@link WordCount#getDict}
     */
    public static void FileSeg(File afterprocess, File invertedIndex, File dict) {
        MapWords mapwords = new MapWords();
        Maps map = WordCount.getDict(dict);
        // try-with-resources: the original closed readers in a finally block
        // that NPE'd when the FileReader constructor itself threw.
        try (BufferedReader bf = new BufferedReader(new FileReader(afterprocess))) {
            String line = bf.readLine();
            // Guard against empty files: readLine() returns null at EOF.
            if (line != null && !line.isEmpty()) {
                segmentLine(line, afterprocess.getName(), map, mapwords, MAXLENGTH);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        writeIndex(mapwords, invertedIndex);
    }

    /**
     * Segments the first line of each file in {@code afterprocess} and writes
     * one combined inverted index to {@code invertedIndex}.
     *
     * <p>Fixes over the original: the cursor is reset per file (it previously
     * carried over, so files after the first were mostly skipped), and streams
     * are closed safely even when opening them fails.
     *
     * @param afterprocess  array of pre-processed input text files
     * @param invertedIndex destination file for the serialized {@link MapWords}
     * @param dict          dictionary file loaded via {@link WordCount#getDict}
     */
    public static void FileArraysSeg(File[] afterprocess, File invertedIndex, File dict) {
        MapWords mapwords = new MapWords();
        Maps map = WordCount.getDict(dict);
        // Original used a method-local window of 9 here ("depends on the
        // longest dictionary entry") — preserved for behavioral compatibility.
        final int maxLength = 9;
        for (int k = 0; k < afterprocess.length; k++) {
            try (BufferedReader bf = new BufferedReader(new FileReader(afterprocess[k]))) {
                String line = bf.readLine();
                if (line != null && !line.isEmpty()) {
                    segmentLine(line, afterprocess[k].getName(), map, mapwords, maxLength);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        writeIndex(mapwords, invertedIndex);
    }

    /**
     * Runs one forward-maximum-matching pass over {@code line}, adding every
     * segmented word (with its character offset) to {@code mapwords}.
     *
     * <p>This replaces four near-identical copy-pasted loops. It also fixes
     * two defects of the original: the short-line branch ignored the cursor
     * and re-read characters 0..1 on every iteration, and the mismatch
     * fallback recorded the position AFTER advancing the cursor.
     *
     * @param line      text to segment (non-null, non-empty)
     * @param fileName  source file name recorded in the index
     * @param map       first-character dictionary lookup
     * @param mapwords  inverted-index accumulator (mutated)
     * @param maxLength maximum lookahead window in characters
     */
    private static void segmentLine(String line, String fileName, Maps map,
                                    MapWords mapwords, int maxLength) {
        final int n = line.length();
        int i = 0;
        while (i < n) {
            // Lookahead window: at most maxLength chars starting at the cursor.
            String window = line.substring(i, Math.min(n, i + maxLength));
            String first = window.substring(0, 1);
            if (window.length() < 2) {
                // Last character of the line — emit it as a single-char word.
                mapwords.addNewNodeElement(first, fileName, i);
                i++;
                continue;
            }
            String second = window.substring(1, 2);
            if (map.isCwordExist(first)
                    && map.getHBC(first).isSecondWordExist(second)) {
                HashBinaryContainer hbc = map.getHBC(first);
                String[] candidates = hbc.getMatchArray(second);
                // Presumably sorted so index 0 holds the longest candidate
                // tail — TODO confirm against WordDictUtil.getStringLengthArray.
                String[] byLength = WordDictUtil.getStringLengthArray(candidates);
                int tailLen = byLength[0].length();
                if (tailLen == 1) {
                    // Longest dictionary tail is one char: emit the bigram.
                    mapwords.addNewNodeElement(first + second, fileName, i);
                    i += 2;
                } else if (window.length() >= tailLen + 1
                        && WordDictUtil.isWordMatched(
                               window.substring(1, tailLen + 1), byLength)) {
                    // Longest tail fits in the window and matches: emit it.
                    String tail = window.substring(1, tailLen + 1);
                    mapwords.addNewNodeElement(first + tail, fileName, i);
                    i += 1 + tail.length();
                } else {
                    // No long match (or not enough text left): fall back to
                    // the bigram, recorded at the word's START offset.
                    mapwords.addNewNodeElement(first + second, fileName, i);
                    i += 2;
                }
            } else {
                // First (or second) character unknown to the dictionary:
                // emit a single-character word and advance by one.
                mapwords.addNewNodeElement(first, fileName, i);
                i++;
            }
        }
    }

    /**
     * Serializes the accumulated inverted index to {@code invertedIndex}.
     * Closing the ObjectOutputStream also flushes and closes the underlying
     * FileOutputStream.
     *
     * @param mapwords      index to serialize
     * @param invertedIndex destination file
     */
    private static void writeIndex(MapWords mapwords, File invertedIndex) {
        try (OutputStream output = new FileOutputStream(invertedIndex);
             ObjectOutputStream oos = new ObjectOutputStream(output)) {
            oos.writeObject(mapwords);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

0 0
原创粉丝点击