二阶和三阶隐马尔柯夫过程(HMM)进行中文分词的效果对比

来源:互联网 发布:淘宝店铺广告短信 编辑:程序博客网 时间:2024/06/03 06:59


第一部分 引言

        关于隐马尔柯夫模型的详细内容在此就不详细介绍了,介绍HMM模型的文章很多,请读者自行去学习。二阶隐马尔柯夫模型解决问题有两个假设:其一是当前的状态仅与它前面相邻的状态有关;其二是状态转换和从某个状态发射某个观察符号的概率与时间t无关(即不动性假设)。HMM是在这两个假设的前提下解决各种各样的问题的。

       对于第二个假设,我们不去讨论它。现在来看第一个假设,二阶马尔柯夫过程假设当前状态仅与前面相邻的一个状态有关,那么对于分词来说,有些词可能会满足这样的情况,但也有可能会有些词并不这么简单的处理,他们可能和前面的数个状态有关。基于以上考虑,本文就二阶马尔柯夫过程和三阶马尔柯夫过程对中文分词做了一个实验,验证二者哪个更适合于中文的分词。


第二部分 试验结果

        语料来源:1998年1月《人民日报》

        预处理:首先将原始语料去掉段首的编号,然后将每个句子作为一行,编码转换为utf-8格式。最后按照9:1的比例将语料分为训练语料train.txt和测试语料test.txt。

        分别用二阶马尔柯夫过程和三阶马尔柯夫过程进行分词。试验结果如下图所示:


        从上图中可以看出,

        (1)三阶HMM的分词准确率和召回率均优于二阶HMM。

        (2)三阶HMM的切分词的总数要少于二阶HMM,但是切分正确的数量要大于二阶HMM。换句话说,二阶HMM倾向于将句子切的更细,而三阶HMM会比较保守,将句子切分出来的词少一些。

        (3)另外,对于交集型的歧义,三阶HMM明显要少于二阶HMM。说明马尔柯夫链增长之后,对于减少交集型歧义是很有作用的。


第三部分 源代码

        以下为本次实验过程的源代码,好多代码和之前讲述HMM分词的帖子里面的代码是一样的,只是一少部分作了修改,为了本文内容的完整性,我还是把代码全都在粘一遍把。注意之前的代码处理的是gb2312格式的文本,本文的代码文本本的格式要转换为utf-8哦,要不然,你把gb2312读进去出现了很多乱码,别说我没告诉你哦····

        (1)文件名:util.h。下面好几个文件都要用到该文件,如将测试文件中的/去掉

#ifndef UTIL_H#define UTIL_H#include <string>using namespace std;/* * 函数功能:将字符串中的所有特定子串置换为新的字符串 * 函数输入:str     需要进行操作的字符串 *         old_str 旧的字符串 *         new_str 新的字符串 * 函数输出:置换完毕的字符串 */string& replace_all(string &str, string old_str, string new_str){while(1){string::size_type pos(0);if((pos = str.find(old_str)) != string::npos){str.replace(pos, old_str.length(), new_str);}else{break;}}return str;}#endif

        (2)文件名:prehmm.cpp。对文件进行预处理工作,函数的功能请参见代码中的注释。

#include <iostream>#include <fstream>#include <sstream>#include <string>#include <cstdlib>#include <map>#include "util.h"using namespace std;/* * 函数功能:将训练语料和测试语料中出现的汉字进行编码,将他们的对应关系存入文件 *         格式为:汉字-编码,编码从0开始 * 函数输入:infile_1 训练语料文件名 *         infile_2 测试语料文件名 *         outfile  指定的输出文件名 * 函数输出:名为outfile的文件 */void makeDB(string infile_1, string infile_2, string outfile){//读取输入文件ifstream fin_1(infile_1.c_str());ifstream fin_2(infile_2.c_str());if(!(fin_1 && fin_2)){cerr << "makeDB : Open input file fail !" << endl;exit(-1);}//打开输出文件ofstream fout(outfile.c_str());if(!fout){cerr << "makeDB : Open output file fail !" << endl;exit(-1);}map<string, int> map_cchar;int id = -1;string line = "";string cchar = "";//读取输入文件内容while(getline(fin_1, line)){line = replace_all(line, "/", "");if(line.size() >= 3){//逐字读取for(int i = 0; i < line.size() - 2; i += 3){cchar = line.substr(i, 3);if(map_cchar.find(cchar) == map_cchar.end()){++id;map_cchar[cchar] = id;}}}}while(getline(fin_2, line)){line = replace_all(line, "/", "");if(line.size() >= 3){//逐字读取for(int i = 0; i < line.size() - 2; i += 3){cchar = line.substr(i, 3);if(map_cchar.find(cchar) == map_cchar.end()){++id;map_cchar[cchar] = id;}}}}//输出到文件map<string, int>::iterator iter;for(iter = map_cchar.begin(); iter != map_cchar.end(); ++iter){//cout << iter -> first << " " << iter -> second << endl;fout << iter -> first << " " << iter -> second << endl;}fin_1.close();fin_2.close();fout.close();}/* * 函数功能:将训练语料每个汉字后面加入对应的BMES状态 * 函数输入:infile  训练语料文件名 *         outfile 指定的输出文件名 * 函数输出:名为outfile的文件 */void makeBMES(string infile, string outfile){ifstream fin(infile.c_str());ofstream fout(outfile.c_str());if(!(fin && fout)){cerr << "makeBMES : Open file failed !" << endl;exit(-1);}string word_in = "";string word_out = "";string line_in = "";string line_out = "";while(getline(fin, line_in)){if(line_in.size() >= 3){line_out.clear();line_in = replace_all(line_in, "/", " ");istringstream strstm(line_in);while(strstm >> word_in){word_out.clear();if(word_in.size()%3 != 0){cout << "单词不符合要求:" << word_in << endl;continue;}int num = word_in.size()/3;//单词中包含多少个汉字if(num == 0){continue;}if(num == 1){word_out = word_in;word_out += "/S";}else{//复制单词中的第一个字word_out.insert(word_out.size(), word_in, 0, 3);word_out += "/B";//逐个复制单词中间的字for(int i = 1; i < num - 1; i++){word_out.insert(word_out.size(), word_in, 3*i, 3);word_out += "/M";}//复制单词中最后的汉字word_out.insert(word_out.size(), word_in, 3*num - 3, 3);word_out += "/E";}line_out += word_out;}fout << line_out << endl;}}}/* * 主函数 */int main(int argc, char *argv[]){if(argc < 5){cout << "Usage: " << argv[0] << " train_file test_file db_file bmes_file" << endl;exit(-1);}//构造DB文件,输入训练语料、测试语料、输出文件名makeDB(argv[1], argv[2], argv[3]);//构造BMES文件,输入训练语料、输出文件名makeBMES(argv[1], argv[4]);}

        (3)文件名:db.h。将汉字和编码的映射文件内存,构造为map,供其他程序使用

#ifndef DB_H#define DB_H#include <iostream>#include <fstream>#include <map>#include <vector>#include <cstdlib>#include "util.h"using namespace std;/* * 转换类,获取编号 */class DB{private:map<string, int> cchar_map;//汉字-编码映射map<int, string> index_map;//编码-汉字映射public:DB();DB(string file);string getCchar(int id);//根据编码获得汉字int getObservIndex(string cchar);//根据汉字获得编码int getStateIndex(char state);//根据状态获得状态编号vector<int> makeObservs(string line);//将输入的句子构造为发射符号序列};//无参构造函数DB::DB(){}//有参构造函数DB::DB(string file){ifstream fin(file.c_str());if(!fin){cout << "Open input file fail ! Can't init Trans !" << endl;exit(-1);}string line = "";string word = "";string cchar = "";int id = 0;while(getline(fin, line)){istringstream strstm(line);strstm >> word;cchar = word;strstm >> word;id = atoi(word.c_str());//加入mapcchar_map[cchar] = id;index_map[id] = cchar;}cout << "cchar_map大小: " << cchar_map.size() << endl;cout << "index_map大小: " << index_map.size() << endl;}//将状态转换为数字编号int DB::getStateIndex(char state){switch(state){case 'B' :return 0;break;case 'M' :return 1;break;case 'E' :return 2;break;case 'S' :return 3;break;default :return -1;break;}}//将汉字转换为数字编号int DB::getObservIndex(string cchar){map<string, int>::iterator iter = cchar_map.find(cchar);if(iter != cchar_map.end()){return iter -> second;}else{return -1;}}//将数字编号转换为汉字string DB::getCchar(int id){map<int, string>::iterator iter = index_map.find(id);if(iter != index_map.end()){return iter -> second;}else{return NULL;}}//将输入的句子构造为发射符号序列vector<int> DB::makeObservs(string line){    vector<int> vec_observ; //输出符号的集合    string cchar = "";      //存放每个汉字    string word = "";       //存放一个单词    int num = 0;            //单词的字数    int index = -1;         //单词对应的编号    line = replace_all(line, "/", " ");    cout << line << endl;    istringstream strstm(line);    while(strstm >> word){        if(word.size()%3 != 0){            cout << "单词不符合要求:" << word << endl;            continue;        }        num = word.size()/3;        if(num == 0){            continue;        }else{            for(int i = 0; i < num; i++){                cchar = word.substr(3*i, 3);                index = getObservIndex(cchar);                vec_observ.push_back(index);//cout << "cchar = " << cchar << "   index = " << index << endl;            }        }    }    return vec_observ;}#endif

        (4)文件名:matrix.cpp。用最大似然估计的方法建立HMM的模型参数。

#include <iostream>#include <fstream>#include <sstream>#include <string>#include <iomanip>#include <cmath>#include <list>#include "db.h"using namespace std;const int N = 4;//隐藏状态的数目const int M = 4677;//汉字的个数const double VALUE = 1.0;//平滑算法增加的值//定义字典对象DB db("db.txt");/* * 模型训练,将频数转换为频率(加1平滑) */void turingAdd(const int count[], double prob[], int len){double sum = 0.0;for(int i = 0; i < len; ++i){sum += count[i];}sum = sum + VALUE * len;for(int i = 0; i < len; ++i){prob[i] = -log((count[i] + VALUE) / sum);//取对数}}/* * 模型训练,将发射频数转换为频率(古德-图灵平滑) */void turingGood(const int count[], double prob[], int len){map<int, list<int> > freq_map;//key为词频,value为该词频对应的汉字列表map<int, list<int> >::iterator iter;//迭代器int sum = 0;//词频总和//初始化freq_mapfor(int i = 0; i < len; i++){int freq = count[i];//词频sum += freq;iter = freq_map.find(freq);if(iter != freq_map.end()){//该词频已经存在,把当前词加入相应的listfreq_map[freq].push_back(i);}else{//该词频不存在,建立对应的汉字listlist<int> lst;lst.push_back(i);freq_map[freq] = lst;}}//若sum=0,则结果初始化为0.0即可if(sum == 0){for(int i = 0; i < len; i++){prob[i] = 0.0;}return;}//数据平滑处理iter = freq_map.begin();while(iter != freq_map.end()){double pr;//频率int freq = iter -> first;int freqsize = iter -> second.size();if(++iter != freq_map.end()){int freq_2 = iter -> first;if(freq_2 = freq + 1){int freqsize_2 = iter -> second.size();pr = ((1.0 + freq) * freqsize_2) / (sum * freqsize);}else{pr = 1.0 * freq / sum;}}else{pr = 1.0 * freq / sum;}//计算结果list<int> lst = (--iter) -> second;list<int>::iterator iter_in = lst.begin();while(iter_in != lst.end()){int index = *iter_in;prob[index] = pr;++iter_in;}//准备下次迭代++iter;}//概率归一化double total = 0.0;for(int i = 0; i < len; i++){total += prob[i];}for(int i = 0; i < len; i++){prob[i] = -log((double)prob[i] / total);//取对数}}/* * 主函数,生成HMM模型的参数 * 状态转移概率矩阵、初始状态概率矩阵、符号发射概率矩阵 */int main(int argc, char *argv[]){if(argc < 2){cout << "Usage: " << argv[0] << " bmes_file !" << endl;exit(-1);}ifstream fin(argv[1]);if(!fin){cerr << "Open input file " << argv[1] << "filed !" << endl;exit(-1);}int Pi[N] = {0};//初始状态出现次数int A1[N][N] = {0};//二阶状态转移次数int A2[N][N][N] = {0};//三阶状态转移次数int B1[N][M] = {0};//二阶符号发射次数int B2[N][N][M] = {0};//三阶符号发射次数//抽取文件中的状态和观察值string line = "";//存放每一行的内容int line_num = 0;//句子编号int count = 0;while(getline(fin, line)){line_num++;char state;//状态string cchar = "";//一个汉字int i, j, k, m;string::size_type pos = 0;//当前处理位置if((pos = line.find("/", pos + 1)) != string::npos){//抽取句子的第一个状态state = line.at(pos + 1);i = db.getStateIndex(state);Pi[i]++;//抽取句子的第一个观察值cchar = line.substr(pos - 3, 3);m = db.getObservIndex(cchar);B1[i][m]++;if((pos = line.find("/", pos + 1)) != string::npos){//抽取句子的第二个状态state = line.at(pos + 1);j = db.getStateIndex(state);A1[i][j]++;//抽取句子的第二个观察值cchar = line.substr(pos - 3, 3);m = db.getObservIndex(cchar);B1[j][m]++;B2[i][j][m]++;while((pos = line.find("/", pos + 1)) != string::npos){//抽取句子的其他状态state = line.at(pos + 1);k = db.getStateIndex(state);A1[j][k]++;A2[i][j][k]++;//抽取句子的其他观察值cchar = line.substr(pos - 3, 3);m = db.getObservIndex(cchar);B1[k][m]++;B2[j][k][m]++;//准备下次迭代i = j;j = k;}}}}fin.close();//打开输出流ofstream fout_1("Pi.mat");//初始概率矩阵ofstream fout_2("A1.mat");//二阶状态转移矩阵ofstream fout_3("A2.mat");//三阶状态转移矩阵ofstream fout_4("B1.mat");//二阶发射概率矩阵ofstream fout_5("B2.mat");//三阶发射概率矩阵if(!(fout_1 && fout_2 && fout_3 && fout_4 && fout_5)){cerr << "Create Matrix file failed !" << endl;exit(-1);}fout_1 << setprecision(8);fout_2 << setprecision(8);fout_3 << setprecision(8);fout_4 << setprecision(8);fout_5 << setprecision(8);//初始状态矩阵写入文件double arr_pi[N] = {0.0};//turingGood(Pi, arr_pi, N);turingAdd(Pi, arr_pi, N);for(int i = 0; i < N; i++){fout_1 << arr_pi[i] << "\t";}fout_1 << endl;//二阶状态转移矩阵写入文件double arr_a_1[N] = {0.0};for(int i = 0; i < N; i++){//turingGood(A1[i], arr_a_1, N);turingAdd(A1[i], arr_a_1, N);for(int j = 0; j < N; j++){fout_2 << arr_a_1[j] << "\t";}fout_2 << endl;}//三阶状态转移矩阵写入文件double arr_a_2[N] = {0.0};for(int i = 0; i < N; i++){for(int j = 0; j < N; j++){//turingGood(A2[i][j], arr_a_2, N);turingAdd(A2[i][j], arr_a_2, N);for(int k = 0; k < N; k++){fout_3 << arr_a_2[k] << "\t";}fout_3 << endl;}}//二阶发射概率矩阵写入文件double arr_b_1[M] = {0.0};for(int i = 0; i < N; i++){//turingGood(B1[i], arr_b_1, M);turingAdd(B1[i], arr_b_1, M);for(int j = 0; j < M; j++){fout_4 << arr_b_1[j] << "\t";}fout_4 << endl;}//三阶发射概率矩阵写入文件double arr_b_2[M] = {0.0};for(int i = 0; i < N; i++){for(int j = 0; j < N; j++){//turingGood(B2[i][j], arr_b_2, M);turingAdd(B2[i][j], arr_b_2, M);for(int k = 0; k < M; k++){fout_5 << arr_b_2[k] << "\t";}fout_5 << endl;}}fout_1.close();fout_2.close();fout_3.close();fout_4.close();fout_5.close();return 0;}

        (5)文件名:hmm.h。将存储在文件中的HMM的模型参数读取到内存中,构造为一个HMM对象,供其他程序使用。

#ifndef HMM_H#define HMM_H#include <fstream>#include <sstream>#include <string>#include <cstdlib>const int N = 4;const int M = 4677;using namespace std;//定义HMM模型class HMM{public:int n;//状态数目int m;//可能的观察符号数目double Pi[N];//初始状态概率double A1[N][N];//状态转移概率矩阵double A2[N][N][N];//状态转移概率矩阵double B1[N][M];//符号发射概率矩阵double B2[N][N][M];//符号发射概率矩阵HMM();HMM(string f_pi, string f_a1, string f_a2, string f_b1, string f_b2);};//无参构造函数HMM::HMM(){}//有参构造函数HMM::HMM(string f_pi, string f_a1, string f_a2, string f_b1, string f_b2){ifstream fin_1(f_pi.c_str());ifstream fin_2(f_a1.c_str());ifstream fin_3(f_a2.c_str());ifstream fin_4(f_b1.c_str());ifstream fin_5(f_b2.c_str());if(!(fin_1 && fin_2 && fin_3 && fin_4 && fin_5)){exit(-1);}n = N;m = M;string line = "";string word = "";//读取Pigetline(fin_1, line);istringstream strstm_1(line);for(int i = 0; i < N; i++){strstm_1 >> word;Pi[i] = atof(word.c_str());}//读取A1for(int i = 0; i < N; i++){getline(fin_2, line);istringstream strstm_2(line);for(int j = 0; j < N; j++){strstm_2 >> word;A1[i][j] = atof(word.c_str());}}//读取A2for(int i = 0; i < N; i++){for(int j = 0; j < N; j++){getline(fin_3, line);istringstream strstm_3(line);for(int k = 0; k < N; k++){strstm_3 >> word;A2[i][j][k] = atof(word.c_str());}}}//读取B1for(int i = 0; i < N; i++){getline(fin_4, line);istringstream strstm_4(line);for(int j = 0; j < M; j++){strstm_4 >> word;B1[i][j] = atof(word.c_str());}}//读取B2for(int i = 0; i < N; i++){for(int j = 0; j < N; j++){getline(fin_5, line);istringstream strstm_5(line);for(int k = 0; k < M; k++){strstm_5 >> word;B2[i][j][k] = atof(word.c_str());}}}fin_1.close();fin_2.close();fin_3.close();fin_4.close();fin_5.close();}#endif

        (6)文件名:viterbi.cpp。维特比算法,用于分词。这是最核心的部分哦···

#include <iostream>#include <fstream>#include <sstream>#include <string>#include <stack>#include "hmm.h"#include "db.h"using namespace std;HMM hmm("Pi.mat", "A1.mat", "A2.mat", "B1.mat", "B2.mat");//初始化HMM模型DB db("db.txt");//初始化字典/* * Viterbi算法进行分词,二阶马尔柯夫过程 */string viterbiTwo(string str_in){//计算输入句子中的汉字个数int row = str_in.size() / 3;string str_out = "";//如果输入字符串为空,则直接返回空if(row == 0){return str_out;}//如果只有一个字的话,则直接输出即可if(row < 2){str_out = str_in + "/";return str_out;}//分配矩阵空间double **delta = new double *[row];int **path = new int *[row];for(int i = 0; i < row; i++){delta[i] = new double[N]();path[i] = new int[N]();}//中间变量string cchar = "";//存放汉字int min_path = -1;double val = 0.0;double min_val = 0.0;//初始化矩阵,给delta和path矩阵的第一行赋初值cchar = str_in.substr(0, 3);int cchar_num = db.getObservIndex(cchar);for(int i = 0; i < N; i++){delta[0][i] = hmm.Pi[i] + hmm.B1[i][cchar_num];//对数path[0][i] = -1;}//给delta和path的后续行赋值(对数)for(int t = 1; t < row; t++){cchar = str_in.substr(3*t, 3);cchar_num = db.getObservIndex(cchar);for(int j = 0; j < N; j++){min_val = 100000.0;min_path = -1;for(int i = 0; i < N; i++){val = delta[t-1][i] + hmm.A1[i][j];if(val < min_val){min_val = val;min_path = i;}}delta[t][j] = min_val + hmm.B1[j][cchar_num];path[t][j] = min_path;}}//找delta矩阵最后一行的最大值min_val = 100000.0;min_path = -1;for(int i = 0; i < N; i++){if(delta[row-1][i] < min_val){min_val = delta[row-1][i];min_path = i;}}//从min_path出发,回溯得到最可能的路径stack<int> path_st;path_st.push(min_path);for(int i = row - 1; i > 0; i--){min_path = path[i][min_path];path_st.push(min_path);}//释放二维数组for(int i = 0; i < row; i++){delete []delta[i];delete []path[i];}delete []delta;delete []path;//根据标记好的状态序列分词int pos = 0;int index = -1;while(!path_st.empty()){index = path_st.top();path_st.pop();str_out.insert(str_out.size(), str_in, pos, 3);if(index == 2 || index == 3){//状态为E或Sstr_out.append("/");}pos += 3;}}/* * Viterbi算法进行分词:三阶马尔柯夫过程 */string viterbiThree(string str_in){//计算输入句子中的汉字个数int row = str_in.size() / 3;string str_out = "";//如果输入字符串为空,则直接返回空if(row == 0){return str_out;}//如果只有一个字的话,则直接输出即可if(row < 2){str_out = str_in + "/";return str_out;}//分配矩阵空间double ***delta = new double **[row];int ***path = new int **[row];for(int i = 0; i < row; i++){delta[i] = new double *[N];path[i] = new int *[N];for(int j = 0; j < N; j++){delta[i][j] = new double[N];path[i][j] = new int[N];for(int k = 0; k < N; k++){delta[i][j][k] = 0.0;path[i][j][k] = 0;}}}//初始化矩阵,给delta和path矩阵的第1个面赋初值//初始状态需要两个面,第0面不赋值,只给第1个面赋值string cchar_1 = str_in.substr(0, 3);//第1个字string cchar_2 = str_in.substr(3, 3);//第2个字int num_1 = db.getObservIndex(cchar_1);//第1个字的编号int num_2 = db.getObservIndex(cchar_2);//第2个字的编号for(int i = 0; i < N; i++){for(int j = 0; j < N; j++){delta[1][i][j] = hmm.Pi[i] + hmm.B1[i][num_1] + hmm.A1[i][j] + hmm.B2[i][j][num_2];//对数path[1][i][j] = -1;}}//中间变量string cchar_3 = "";//存放汉字int min_path = -1;double val = 0.0;double min_val = 0.0;//给delta和path的后续面赋值(对数)//第0、1面为初始面,后续面从2开始,到row-1为止for(int t = 2; t < row; t++){cchar_3 = str_in.substr(3*t, 3);int num_3 = db.getObservIndex(cchar_3);for(int j = 0; j < N; j++){for(int k = 0; k < N; k++){min_val = 100000.0;min_path = -1;for(int i = 0; i < N; i++){val = delta[t-1][i][j] + hmm.A2[i][j][k];if(val < min_val){min_val = val;min_path = i;}}delta[t][j][k] = min_val + hmm.B2[j][k][num_3];path[t][j][k] = min_path;}}}//找delta矩阵最后一个面的最大值,最后一个面为row-1min_val = 100000.0;int min_path_i = -1;int min_path_j = -1;for(int i = 0; i < N; i++){for(int j = 0; j < N; j++){if(delta[row-1][i][j] < min_val){min_val = delta[row-1][i][j];min_path_i = i;min_path_j = j;}}}//从min_path_i和min_path_j出发,回溯得到最可能的路径//回溯从row-1开始,到2为止stack<int> path_st;path_st.push(min_path_j);path_st.push(min_path_i);for(int t = row - 1; t > 1; t--){int min_path_k = path[t][min_path_i][min_path_j];path_st.push(min_path_k);min_path_j = min_path_i;min_path_i = min_path_k;}//释放三维数组for(int i = 0; i < row; i++){for(int j = 0; j < N; j++){delete []delta[i][j];delete []path[i][j];}delete []delta[i];delete []path[i];}delete []delta;delete []path;//根据标记好的状态序列分词int pos = 0;int index = -1;while(!path_st.empty()){index = path_st.top();path_st.pop();str_out.insert(str_out.size(), str_in, pos, 3);if(index == 2 || index == 3){//状态为E或Sstr_out.append("/");}pos += 3;}}

        (7)文件名:main.cpp。主函数,调用维特比算法进行分词工作,并对分词结果进行比对,统计后输出结果。

#include <cstdlib>#include <vector>#include <iomanip>#include <map>#include <algorithm>#include <sys/time.h>#include <sys/stat.h>#include "util.h"#include "viterbi.cpp"const long MaxCount = 50000;//需要切分的最大句子数量,若该值大于文件中//实际的句子数量,以实际句子数量为准。//获取当前时间(ms)long getCurrentTime(){struct timeval tv;gettimeofday(&tv, NULL);return tv.tv_sec*1000 + tv.tv_usec/1000;}//获取文件大小unsigned long getFileSize(string file_path){unsigned long filesize = -1;struct stat statbuff;if(stat(file_path.c_str(), &statbuff) < 0){return filesize;}else{filesize = statbuff.st_size;}return filesize;}/* * 函数功能:计算切分标记的位置 * 函数输入:1.strline_in未进行切分的汉字字符串           2.strline_right进行切分后的汉字字符串 * 函数输出:vecetor,其中存放了strline_in中哪些位置放置了分词标记 *         注意:vector中不包含最后标记的位置,但是包含位置0。 */vector<int> getPos(string strline_right, string strline_in){int pos_1 = 0;int pos_2 = -1;int pos_3 = 0;string word = "";vector<int> vec;int length = strline_right.length();while(pos_2 < length){//前面的分词标记pos_1 = pos_2;//后面的分词标记pos_2 = strline_right.find('/', pos_1 + 1);if(pos_2 > pos_1){//将两个分词标记之间的单词取出word  = strline_right.substr(pos_1 + 1, pos_2 - pos_1 - 1);//根据单词去输入序列中查出出现的位置pos_3 = strline_in.find(word, pos_3);//将位置存入数组vec.push_back(pos_3);pos_3 = pos_3 + word.size();}else{break;}}return vec;}/* * 获取标准切分和程序切分的结果 */string getString(string word, int pos, vector<int> vec_right){char ss[1000];int i = 0;int k = 0;if(vec_right.size() == 0){return word;}while(vec_right[i] < pos){i++;}for(int j = 0; j < word.size(); j++){if(j == vec_right[i] - pos){if(j != 0){ss[k] = '/';++k;}++i;}ss[k] = word[j];++k;}ss[k] = '\0';string word_str = ss;return word_str;}/* * 函数功能:获取单个句子切分的结果统计 * 函数输入:1.vec_right 正确的分词标记位置集合 *           2.vec_out   函数切分得到的分词标记位置集合 * 函数输出:返回一个veceor,含有4个元素,分别为: *          切分正确、组合型歧义、未登录词、交集型歧义的数量 * */vector<int> getCount_2(string strline, vector<int> vec_right, vector<int> vec_out, vector<string> &vec_err){vector<int> vec(4, 0);//存放计算结果//建立mapmap<int, int> map_result;for(int i = 0; i < vec_right.size(); i++){map_result[vec_right[i]] += 1;}for(int i = 0; i < vec_out.size(); i++){map_result[vec_out[i]] += 2;}//统计map中的信息//若value=1,只在vec_right中//若value=2,只在vec_out中//若value=3,在vec_right和vec_out中都有map<int, int>::iterator p_pre, p_cur;int count_value_1 = 0;int count_value_2 = 0;int count_value_3 = 0;p_pre = map_result.begin();p_cur = map_result.begin();while(p_cur != map_result.end()){while(p_cur != map_result.end() && p_cur -> second == 3){p_pre = p_cur;++count_value_3;//切分正确的数目++p_cur;//迭代器后移}while(p_cur != map_result.end() && p_cur -> second != 3){if(p_cur -> second == 1){++count_value_1;}else if(p_cur -> second == 2){++count_value_2;}++p_cur;}//确定切分错误的字符串if(p_cur == map_result.end() && p_cur == (++p_pre)){continue;}int pos_1 = p_pre -> first;int pos_2 = p_cur -> first; string word = strline.substr(pos_1, pos_2 - pos_1);//切分错误的单词string word_right = getString(word, pos_1, vec_right);//正确的切分方式string word_out = getString(word, pos_1, vec_out);//得到的切分方式 string str_err = "";//不同的错误类型if(count_value_1 > 0 && count_value_2 == 0){str_err = "  组合型歧义: " + word + "    正确切分: " + word_right + "    错误切分: " + word_out;vec_err.push_back(str_err);cout << str_err << endl;vec[1] += count_value_1;}else if(count_value_1 == 0 && count_value_2 > 0){str_err = "  未登录词语: " + word + "    正确切分: " + word_right + "    错误切分: " + word_out;vec_err.push_back(str_err);cout << str_err << endl;vec[2] += count_value_2;}else if(count_value_1 > 0 && count_value_2 > 0){str_err = "  交集型歧义: " + word + "    正确切分: " + word_right + "    错误切分: " + word_out;vec_err.push_back(str_err);cout << str_err << endl;vec[3] += count_value_2;}//计数器复位count_value_1 = 0;count_value_2 = 0;}vec[0] += count_value_3;return vec;}/* * 主函数:进行分词并统计分词结果 * */int main(int argc, char *argv[]){if(argc < 3){cout << "Usage: " << argv[0] << " test_file result_file" << endl;exit(-1);}long time_1 = getCurrentTime();string strline_right;//输入语料:用作标准分词结果string strline_in;//去掉分词标记的语料(用作分词的输入)string strline_out_1;//隐马尔科夫模型(二阶)分词完毕的语料string strline_out_2;//隐马尔科夫模型(三阶)分词完毕的语料ifstream fin(argv[1]);//打开输入文件if(!fin){cout << "Unable to open input file !" << argv[1] << endl;exit(-1);}ofstream fout(argv[2]);//确定输出文件if(!fout){cout << "Unable to open output file !" << endl;exit(-1);}long count = 0;//句子编号long count_1 = 0;//隐马尔科夫模型(二阶)切分完全正确的句子总数long count_2 = 0;//隐马尔科夫模型(三阶)切分完全正确的句子总数long count_right_all = 0;//准确的切分总数//二阶long count_out_1_all = 0;//隐马尔科夫模型切分总数long count_out_1_right_all = 0;//隐马尔科夫模型切分正确总数long count_out_1_fail_1_all = 0;//隐马尔科夫模型(组合型歧义)long count_out_1_fail_2_all = 0;//隐马尔科夫模型(未登录词语)long count_out_1_fail_3_all = 0;//隐马尔科夫模型(交集型歧义)//三阶long count_out_2_all = 0;//隐马尔科夫模型切分总数long count_out_2_right_all = 0;//隐马尔科夫模型切分正确总数long count_out_2_fail_1_all = 0;//隐马尔科夫模型(组合型歧义)long count_out_2_fail_2_all = 0;//隐马尔科夫模型(未登录词语)long count_out_2_fail_3_all = 0;//隐马尔科夫模型(交集型歧义)vector<string> vec_err_1;//隐马尔科夫模型(二阶)切分错误的词vector<string> vec_err_2;//隐马尔科夫模型(三阶)切分错误的词while(getline(fin, strline_right, '\n') && count < MaxCount){if(strline_right.length() > 1){//去掉分词标记strline_in = strline_right;strline_in = replace_all(strline_in, "/", "");//隐马尔科夫模型分词strline_out_1 = strline_right;istringstream strstm(strline_in);string sentence;string result_1;string result_2;string line_out_1;string line_out_2;while(strstm >> sentence){//二阶切分result_1 = viterbiTwo(sentence);line_out_1 += result_1;//三阶切分result_2 = viterbiThree(sentence);line_out_2 += result_2;}strline_out_1 = line_out_1;strline_out_2 = line_out_2;//输出分词结果count++;cout << "----------------------------------------------" << endl;cout << "句子编号:" << count << endl;cout << endl;cout << "待分词的句子长度: " << strline_in.length() << "  句子:" << endl;cout << strline_in << endl;cout << endl;cout << "标准比对结果长度: " << strline_right.length() << "  句子:" << endl;cout << strline_right << endl;cout << endl;cout << "隐马尔科夫模型(二阶)分词长度: " << strline_out_1.length() << "  句子:" << endl;cout << strline_out_1 << endl;cout << endl;cout << "隐马尔科夫模型(三阶)分词长度: " << strline_out_2.length() << "  句子:" << endl;cout << strline_out_2 << endl;cout << endl;//输出分词结果的数字序列表示vector<int> vec_right = getPos(strline_right, strline_in);vector<int> vec_out_1 = getPos(strline_out_1, strline_in);vector<int> vec_out_2 = getPos(strline_out_2, strline_in);cout << "标准结果:" << endl;for(int i = 0; i < vec_right.size(); i++){cout << setw(4) << vec_right[i];}cout << endl;cout << "隐马尔科夫模型(二阶)分词结果:" << endl;for(int i = 0; i < vec_out_1.size(); i++){cout << setw(4) << vec_out_1[i];}cout << endl;cout << "隐马尔科夫模型(三阶)分词结果:" << endl;for(int i = 0; i < vec_out_2.size(); i++){cout << setw(4) << vec_out_2[i];}cout << endl;//输出匹配的错误列表cout << endl;if(vec_right == vec_out_1){cout << "隐马尔科夫模型(二阶)分词完全正确!" << endl;count_1++;}else{cout << "隐马尔科夫模型(二阶)分词错误列表:" << endl;}vector<int> vec_count_1 = getCount_2(strline_in, vec_right, vec_out_1, vec_err_1);cout << endl;if(vec_right == vec_out_2){cout << "隐马尔科夫模型(三阶)分词完全正确!" << endl;count_2++;}else{cout << "隐马尔科夫模型(三阶)分词错误列表:" << endl;}vector<int> vec_count_2 = getCount_2(strline_in, vec_right, vec_out_2, vec_err_2);//准确的切分数量int count_right = vec_right.size();//切分得到的数量int count_out_1 = vec_out_1.size();int count_out_2 = vec_out_2.size();//切分正确的数量int count_out_1_right = vec_count_1[0];cout << "切分得到:" << count_out_1 << endl;cout << "切分正确:" << count_out_1_right << endl;cout << "隐马尔科夫模型(二阶):" << endl;cout << "  组合型歧义:" << vec_count_1[1] << endl;cout << "  未登录词语:" << vec_count_1[2] << endl;cout << "  交集型歧义:" << vec_count_1[3] << endl;int count_out_2_right = vec_count_2[0];cout << "切分得到:" << count_out_2 << endl;cout << "切分正确:" << count_out_2_right << endl;cout << "隐马尔科夫模型(三阶):" << endl;cout << "  组合型歧义:" << vec_count_2[1] << endl;cout << "  未登录词语:" << vec_count_2[2] << endl;cout << "  交集型歧义:" << vec_count_2[3] << endl;count_right_all += count_right;count_out_1_all += count_out_1;count_out_1_right_all += count_out_1_right;count_out_1_fail_1_all += vec_count_1[1];count_out_1_fail_2_all += vec_count_1[2];count_out_1_fail_3_all += vec_count_1[3];count_out_2_all += count_out_2;count_out_2_right_all += count_out_2_right;count_out_2_fail_1_all += vec_count_2[1];count_out_2_fail_2_all += vec_count_2[2];count_out_2_fail_3_all += vec_count_2[3];}}long time_2 = getCurrentTime();unsigned long file_size = getFileSize("test.txt");//打印错误的切分内容cout << endl;cout << "---------------------------------" << endl;cout << "错误样例(已排序):" << endl;//对错误切分内容进行排序并掉重复的sort(vec_err_1.begin(), vec_err_1.end());sort(vec_err_2.begin(), vec_err_2.end());vector<string>::iterator end_unique_1 = unique(vec_err_1.begin(), vec_err_1.end());vector<string>::iterator end_unique_2 = unique(vec_err_2.begin(), vec_err_2.end());int num_1 = end_unique_1 - vec_err_1.begin();int num_2 = end_unique_2 - vec_err_2.begin();cout << "----------------------------------" << endl;cout << "隐马尔科夫模型(二阶)切分错误数量:" << num_1 << endl;for(int i = 0; i < num_1; i++){cout << vec_err_1[i] << endl;}cout << endl;cout << "----------------------------------" << endl;cout << "隐马尔科夫模型(三阶)切分错误数量:" << num_2 << endl;for(int i = 0; i < num_2; i++){cout << vec_err_2[i] << endl;}cout << endl;//计算准确率和召回率double kk_1 = (double)count_out_1_right_all / count_out_1_all;//隐马尔科夫模型(二阶)准确率double kk_2 = (double)count_out_1_right_all / count_right_all;//隐马尔科夫模型(二阶)召回率double kk_3 = (double)count_out_2_right_all / count_out_2_all;//隐马尔科夫模型(三阶)准确率double kk_4 = (double)count_out_2_right_all / count_right_all;//隐马尔科夫模型(三阶)召回率//集中输出结果cout << endl;cout << "---------------------------------" << endl;cout << "分词消耗时间:" << time_2 - time_1 << "ms" << endl;cout << "测试文件大小:" << file_size/1024 << " KB" << endl;cout << "分词速度为:  " << (double)file_size*1000/((time_2 - time_1)*1024) << " KB/s" << endl;cout << endl;cout << "句子总数:" << count << endl;cout << "隐马尔科夫模型(二阶)切分完全正确的句子数目: " << count_1 << "\t ( " << (double)count_1*100/count << " % )" << endl;cout << "隐马尔科夫模型(三阶)切分完全正确的句子数目: " << count_2 << "\t ( " << (double)count_2*100/count << " % )" << endl;cout << endl;cout << "准确的切分总数:" << count_right_all << endl;//准确的切分总数cout << "隐马尔科夫模型(二阶)切分总数:" << count_out_1_all << endl;//隐马尔科夫模型切分总数cout << "隐马尔科夫模型(三阶)切分总数:" << count_out_2_all << endl;//隐马尔科夫模型切分总数cout << "隐马尔科夫模型(二阶)切分正确总数:" << count_out_1_right_all << endl;//隐马尔科夫模型切分正确总数cout << "隐马尔科夫模型(三阶)切分正确总数:" << count_out_2_right_all << endl;//隐马尔科夫模型切分正确总数cout << endl;cout << "隐马尔科夫模型(二阶):" << endl;long count_out_1_fail_all = count_out_1_fail_1_all + count_out_1_fail_2_all + count_out_1_fail_3_all;cout << "  组合型歧义:" << count_out_1_fail_1_all << "\t ( " << (double)count_out_1_fail_1_all*100/count_out_1_fail_all << " % )" << endl;cout << "  未登录词语:" << count_out_1_fail_2_all << "\t ( " << (double)count_out_1_fail_2_all*100/count_out_1_fail_all << " % )" << endl;cout << "  交集型歧义:" << count_out_1_fail_3_all << "\t ( " << (double)count_out_1_fail_3_all*100/count_out_1_fail_all << " % )" << endl;cout << endl;cout << "隐马尔科夫模型(三阶):" << endl;long count_out_2_fail_all = count_out_2_fail_1_all + count_out_2_fail_2_all + count_out_2_fail_3_all;cout << "  组合型歧义:" << count_out_2_fail_1_all << "\t ( " << (double)count_out_2_fail_1_all*100/count_out_2_fail_all << " % )" << endl;cout << "  未登录词语:" << count_out_2_fail_2_all << "\t ( " << (double)count_out_2_fail_2_all*100/count_out_2_fail_all << " % )" << endl;cout << "  交集型歧义:" << count_out_2_fail_3_all << "\t ( " << (double)count_out_2_fail_3_all*100/count_out_2_fail_all << " % )" << endl;cout << endl;cout << "统计结果:" << endl;cout << "隐马尔科夫模型(二阶)    准确率:" << kk_1*100 << "%  \t召回率:" << kk_2*100 << "%" << endl;cout << "隐马尔科夫模型(三阶)    准确率:" << kk_3*100 << "%  \t召回率:" << kk_4*100 << "%" << endl;return 0;}


4 0