一种中文文本的快速分词方法(二)
来源:互联网 发布:淘宝宝贝怎么发布 编辑:程序博客网 时间:2024/04/30 07:59
package org.zhukovasky.chineseSeg;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

import org.zhukovasky.HashBinaryClass.HashBinaryContainer;
import org.zhukovasky.HashBinaryClass.Maps;
import org.zhukovasky.fileutil.WordCount;
import org.zhukovasky.fileutil.WordDictUtil;
import org.zhukovasky.invertedindex.MapWords;

/**
 * Chinese text segmentation tool (forward maximum matching against a
 * two-level hash dictionary). Input text is expected to be UTF-8 encoded;
 * the segmented tokens are written to an inverted index via Java
 * serialization of a {@link MapWords} object.
 *
 * @author zhukovasky
 * @version 1.0
 * @since 2013.12
 * @email zhukovasky@163.com
 */
public class chineseSeg {

    /** Maximum match window used by {@link #FileSeg(File, File, File)}. */
    public final static int MAXLENGTH = 10;

    /**
     * Maximum match window used by {@link #FileArraysSeg(File[], File, File)}.
     * NOTE(review): the original code used 9 here ("取决于词典中最大长度词条" —
     * determined by the longest dictionary entry) while FileSeg uses 10;
     * both values are preserved as-is — confirm which one matches the dictionary.
     */
    private static final int ARRAY_MAXLENGTH = 9;

    /**
     * Segments one preprocessed text file and writes the resulting inverted
     * index to {@code invertedIndex}.
     *
     * @param afterprocess  preprocessed (single-line) UTF-8 text file
     * @param invertedIndex destination file for the serialized {@link MapWords}
     * @param dict          dictionary file loaded through {@link WordCount#getDict(File)}
     */
    public static void FileSeg(File afterprocess, File invertedIndex, File dict) {
        MapWords mapwords = new MapWords();
        Maps map = WordCount.getDict(dict);
        segmentFile(afterprocess, map, mapwords, MAXLENGTH);
        writeIndex(mapwords, invertedIndex);
    }

    /**
     * Segments several preprocessed text files into one shared inverted index.
     * Unlike the original implementation, the segmentation cursor is reset for
     * each file (the original reused a single {@code i} across files, so every
     * file after the first was segmented from a stale offset).
     *
     * @param afterprocess  preprocessed (single-line) UTF-8 text files
     * @param invertedIndex destination file for the serialized {@link MapWords}
     * @param dict          dictionary file loaded through {@link WordCount#getDict(File)}
     */
    public static void FileArraysSeg(File[] afterprocess, File invertedIndex, File dict) {
        MapWords mapwords = new MapWords();
        Maps map = WordCount.getDict(dict);
        for (File source : afterprocess) {
            segmentFile(source, map, mapwords, ARRAY_MAXLENGTH);
        }
        writeIndex(mapwords, invertedIndex);
    }

    /**
     * Reads the first line of {@code source} and feeds it to the segmenter.
     * Uses try-with-resources (the original closed readers in a finally block
     * that NPE'd when FileReader construction had already thrown) and an
     * explicit UTF-8 decoder (the original FileReader used the platform
     * charset despite the class documenting UTF-8 input).
     */
    private static void segmentFile(File source, Maps map, MapWords mapwords, int maxLength) {
        try (Reader r = new InputStreamReader(new FileInputStream(source), StandardCharsets.UTF_8);
             BufferedReader bf = new BufferedReader(r)) {
            String line = bf.readLine();
            if (line == null || line.isEmpty()) {
                return; // empty file: original code NPE'd on readLine() == null
            }
            segmentLine(line, source.getName(), map, mapwords, maxLength);
        } catch (IOException e) {
            // Preserve the original best-effort behavior: log and continue.
            e.printStackTrace();
        }
    }

    /**
     * Forward-maximum-matching segmentation of one line.
     *
     * <p>This unifies the four copy-pasted variants of the original switch
     * (short-line case, tail-of-line case, and mid-line case): all of them
     * examine a window of at most {@code maxLength} characters starting at the
     * cursor. It also fixes the original short-line branch, which always read
     * characters 0 and 1 of the line (never advancing the window with the
     * cursor), and records mismatch tokens at their actual start offset
     * (the original bumped {@code i} before recording).
     *
     * <p>Assumes {@code map.isCwordExist}/{@code getHBC} index dictionary words
     * by first character and {@code WordDictUtil.getStringLengthArray} puts a
     * longest candidate at index 0 — TODO confirm against those classes.
     */
    private static void segmentLine(String line, String fileName, Maps map,
                                    MapWords mapwords, int maxLength) {
        int i = 0;
        while (i < line.length()) {
            int end = Math.min(i + maxLength, line.length());
            String window = line.substring(i, end);
            if (window.length() < 2) {
                // Single trailing character: emit it as its own token.
                // (Original threw StringIndexOutOfBoundsException here.)
                mapwords.addNewNodeElement(window, fileName, i);
                i++;
                continue;
            }
            String first = window.substring(0, 1);
            String second = window.substring(1, 2);
            if (map.isCwordExist(first) && map.getHBC(first).isSecondWordExist(second)) {
                HashBinaryContainer hbc = map.getHBC(first);
                String[] candidates = hbc.getMatchArray(second);
                String[] byLength = WordDictUtil.getStringLengthArray(candidates);
                int longestTail = byLength[0].length();
                if (longestTail == 1 || window.length() < longestTail + 1) {
                    // Best candidate is a two-character word (or the longest
                    // candidate does not fit in the remaining text).
                    mapwords.addNewNodeElement(first + second, fileName, i);
                    i += 2;
                } else {
                    String tail = window.substring(1, longestTail + 1);
                    if (WordDictUtil.isWordMatched(tail, byLength)) {
                        String segword = first + tail;
                        mapwords.addNewNodeElement(segword, fileName, i);
                        i += segword.length();
                    } else {
                        // Long match failed: fall back to the two-character word.
                        mapwords.addNewNodeElement(first + second, fileName, i);
                        i += 2;
                    }
                }
            } else {
                // No dictionary entry starts with this character pair:
                // emit the single character.
                mapwords.addNewNodeElement(first, fileName, i);
                i++;
            }
        }
    }

    /**
     * Serializes the accumulated index to {@code invertedIndex}.
     * try-with-resources replaces the original finally block, which NPE'd on
     * {@code oos.close()} when opening the output file had already failed.
     */
    private static void writeIndex(MapWords mapwords, File invertedIndex) {
        try (OutputStream output = new FileOutputStream(invertedIndex);
             ObjectOutputStream oos = new ObjectOutputStream(output)) {
            oos.writeObject(mapwords);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
0 0
- 一种中文文本的快速分词方法(二)
- 一种中文文本的快速分词方法(三)
- 一种中文文本的快速分词方法(一)(未完待续)
- KTDictSeg 一种简单快速准确的中文分词方法
- 数据库分词查询的优缺点以及英文和中文各自的分词方法(二)
- 中文文本处理之jieba分词笔记(二)
- 字本位的中文文本分词
- 快速分词方法的问题
- 一种没有语料字典的分词方法
- 一种没有语料字典的分词方法
- [搜索]一种分词方法的实现
- 文本分词方法
- (2)中文分词——基于词典的方法
- 数据库分词查询的优缺点以及英文和中文各自的分词方法(一)
- 一种快速文件传输的方法
- 快速排序的一种方法
- Python点滴02_Python3打开中文文本时报错的一种处理方法
- 文本分析之中文分词
- [二分查找]Babelfish uva 10282
- php 简单生成 excel文件
- HDOJ 2196 Computer
- Android多线程方式处理图片下载及显示
- mac下安装wordpress以及配置相关环境
- 一种中文文本的快速分词方法(二)
- codeforces Strings of Power
- ZOJ 3201 Tree of Tree
- Struts2模型驱动
- Computer Science Study Summary --part 3
- A+B Problem(V)
- dp 01串
- Android 发送HTTP GET POST 请求以及通过 MultipartEntityBuilder 上传文件
- 一种中文文本的快速分词方法(三)