第一部分 引言



第二部分 试验结果








第三部分 源代码



#ifndef UTIL_H
#define UTIL_H

#include <string>

// NOTE: kept because every file that includes this header relies on the
// unqualified std names it brings in.
using namespace std;

/*
 * Replaces every occurrence of a substring with another string, in place.
 *
 * Parameters:
 *   str      the string to modify
 *   old_str  the substring to look for
 *   new_str  the text written in place of each occurrence
 *
 * Returns: a reference to str (the same object that was passed in).
 *
 * The scan resumes right after each inserted replacement, so the function
 * terminates even when new_str contains old_str. An empty old_str matches
 * nothing and leaves str untouched.
 *
 * Declared inline because the definition lives in a header that is included
 * from several translation units.
 */
inline string& replace_all(string &str, string old_str, string new_str)
{
    if (old_str.empty()) {
        return str;
    }

    string::size_type pos = 0;
    while ((pos = str.find(old_str, pos)) != string::npos) {
        str.replace(pos, old_str.length(), new_str);
        // Skip the text just written so it is never scanned again.
        pos += new_str.length();
    }
    return str;
}

#endif


#include <iostream>#include <fstream>#include <sstream>#include <string>#include <cstdlib>#include <map>#include "util.h"using namespace std;/* * 函数功能:将训练语料和测试语料中出现的汉字进行编码,将他们的对应关系存入文件 *         格式为:汉字-编码,编码从0开始 * 函数输入:infile_1 训练语料文件名 *         infile_2 测试语料文件名 *         outfile  指定的输出文件名 * 函数输出:名为outfile的文件 */void makeDB(string infile_1, string infile_2, string outfile){//读取输入文件ifstream fin_1(infile_1.c_str());ifstream fin_2(infile_2.c_str());if(!(fin_1 && fin_2)){cerr << "makeDB : Open input file fail !" << endl;exit(-1);}//打开输出文件ofstream fout(outfile.c_str());if(!fout){cerr << "makeDB : Open output file fail !" << endl;exit(-1);}map<string, int> map_cchar;int id = -1;string line = "";string cchar = "";//读取输入文件内容while(getline(fin_1, line)){line = replace_all(line, "/", "");if(line.size() >= 3){//逐字读取for(int i = 0; i < line.size() - 2; i += 3){cchar = line.substr(i, 3);if(map_cchar.find(cchar) == map_cchar.end()){++id;map_cchar[cchar] = id;}}}}while(getline(fin_2, line)){line = replace_all(line, "/", "");if(line.size() >= 3){//逐字读取for(int i = 0; i < line.size() - 2; i += 3){cchar = line.substr(i, 3);if(map_cchar.find(cchar) == map_cchar.end()){++id;map_cchar[cchar] = id;}}}}//输出到文件map<string, int>::iterator iter;for(iter = map_cchar.begin(); iter != map_cchar.end(); ++iter){//cout << iter -> first << " " << iter -> second << endl;fout << iter -> first << " " << iter -> second << endl;}fin_1.close();fin_2.close();fout.close();}/* * 函数功能:将训练语料每个汉字后面加入对应的BMES状态 * 函数输入:infile  训练语料文件名 *         outfile 指定的输出文件名 * 函数输出:名为outfile的文件 */void makeBMES(string infile, string outfile){ifstream fin(infile.c_str());ofstream fout(outfile.c_str());if(!(fin && fout)){cerr << "makeBMES : Open file failed !" 
<< endl;exit(-1);}string word_in = "";string word_out = "";string line_in = "";string line_out = "";while(getline(fin, line_in)){if(line_in.size() >= 3){line_out.clear();line_in = replace_all(line_in, "/", " ");istringstream strstm(line_in);while(strstm >> word_in){word_out.clear();if(word_in.size()%3 != 0){cout << "单词不符合要求:" << word_in << endl;continue;}int num = word_in.size()/3;//单词中包含多少个汉字if(num == 0){continue;}if(num == 1){word_out = word_in;word_out += "/S";}else{//复制单词中的第一个字word_out.insert(word_out.size(), word_in, 0, 3);word_out += "/B";//逐个复制单词中间的字for(int i = 1; i < num - 1; i++){word_out.insert(word_out.size(), word_in, 3*i, 3);word_out += "/M";}//复制单词中最后的汉字word_out.insert(word_out.size(), word_in, 3*num - 3, 3);word_out += "/E";}line_out += word_out;}fout << line_out << endl;}}}/* * 主函数 */int main(int argc, char *argv[]){if(argc < 5){cout << "Usage: " << argv[0] << " train_file test_file db_file bmes_file" << endl;exit(-1);}//构造DB文件,输入训练语料、测试语料、输出文件名makeDB(argv[1], argv[2], argv[3]);//构造BMES文件,输入训练语料、输出文件名makeBMES(argv[1], argv[4]);}


#ifndef DB_H#define DB_H#include <iostream>#include <fstream>#include <map>#include <vector>#include <cstdlib>#include "util.h"using namespace std;/* * 转换类,获取编号 */class DB{private:map<string, int> cchar_map;//汉字-编码映射map<int, string> index_map;//编码-汉字映射public:DB();DB(string file);string getCchar(int id);//根据编码获得汉字int getObservIndex(string cchar);//根据汉字获得编码int getStateIndex(char state);//根据状态获得状态编号vector<int> makeObservs(string line);//将输入的句子构造为发射符号序列};//无参构造函数DB::DB(){}//有参构造函数DB::DB(string file){ifstream fin(file.c_str());if(!fin){cout << "Open input file fail ! Can't init Trans !" << endl;exit(-1);}string line = "";string word = "";string cchar = "";int id = 0;while(getline(fin, line)){istringstream strstm(line);strstm >> word;cchar = word;strstm >> word;id = atoi(word.c_str());//加入mapcchar_map[cchar] = id;index_map[id] = cchar;}cout << "cchar_map大小: " << cchar_map.size() << endl;cout << "index_map大小: " << index_map.size() << endl;}//将状态转换为数字编号int DB::getStateIndex(char state){switch(state){case 'B' :return 0;break;case 'M' :return 1;break;case 'E' :return 2;break;case 'S' :return 3;break;default :return -1;break;}}//将汉字转换为数字编号int DB::getObservIndex(string cchar){map<string, int>::iterator iter = cchar_map.find(cchar);if(iter != cchar_map.end()){return iter -> second;}else{return -1;}}//将数字编号转换为汉字string DB::getCchar(int id){map<int, string>::iterator iter = index_map.find(id);if(iter != index_map.end()){return iter -> second;}else{return NULL;}}//将输入的句子构造为发射符号序列vector<int> DB::makeObservs(string line){    vector<int> vec_observ; //输出符号的集合    string cchar = "";      //存放每个汉字    string word = "";       //存放一个单词    int num = 0;            //单词的字数    int index = -1;         //单词对应的编号    line = replace_all(line, "/", " ");    cout << line << endl;    istringstream strstm(line);    while(strstm >> word){        if(word.size()%3 != 0){            cout << "单词不符合要求:" << word << endl;            continue;        }        num = word.size()/3;        if(num == 0){            
continue;        }else{            for(int i = 0; i < num; i++){                cchar = word.substr(3*i, 3);                index = getObservIndex(cchar);                vec_observ.push_back(index);//cout << "cchar = " << cchar << "   index = " << index << endl;            }        }    }    return vec_observ;}#endif


#include <iostream>#include <fstream>#include <sstream>#include <string>#include <iomanip>#include <cmath>#include <list>#include "db.h"using namespace std;const int N = 4;//隐藏状态的数目const int M = 4677;//汉字的个数const double VALUE = 1.0;//平滑算法增加的值//定义字典对象DB db("db.txt");/* * 模型训练,将频数转换为频率(加1平滑) */void turingAdd(const int count[], double prob[], int len){double sum = 0.0;for(int i = 0; i < len; ++i){sum += count[i];}sum = sum + VALUE * len;for(int i = 0; i < len; ++i){prob[i] = -log((count[i] + VALUE) / sum);//取对数}}/* * 模型训练,将发射频数转换为频率(古德-图灵平滑) */void turingGood(const int count[], double prob[], int len){map<int, list<int> > freq_map;//key为词频,value为该词频对应的汉字列表map<int, list<int> >::iterator iter;//迭代器int sum = 0;//词频总和//初始化freq_mapfor(int i = 0; i < len; i++){int freq = count[i];//词频sum += freq;iter = freq_map.find(freq);if(iter != freq_map.end()){//该词频已经存在,把当前词加入相应的listfreq_map[freq].push_back(i);}else{//该词频不存在,建立对应的汉字listlist<int> lst;lst.push_back(i);freq_map[freq] = lst;}}//若sum=0,则结果初始化为0.0即可if(sum == 0){for(int i = 0; i < len; i++){prob[i] = 0.0;}return;}//数据平滑处理iter = freq_map.begin();while(iter != freq_map.end()){double pr;//频率int freq = iter -> first;int freqsize = iter -> second.size();if(++iter != freq_map.end()){int freq_2 = iter -> first;if(freq_2 = freq + 1){int freqsize_2 = iter -> second.size();pr = ((1.0 + freq) * freqsize_2) / (sum * freqsize);}else{pr = 1.0 * freq / sum;}}else{pr = 1.0 * freq / sum;}//计算结果list<int> lst = (--iter) -> second;list<int>::iterator iter_in = lst.begin();while(iter_in != lst.end()){int index = *iter_in;prob[index] = pr;++iter_in;}//准备下次迭代++iter;}//概率归一化double total = 0.0;for(int i = 0; i < len; i++){total += prob[i];}for(int i = 0; i < len; i++){prob[i] = -log((double)prob[i] / total);//取对数}}/* * 主函数,生成HMM模型的参数 * 状态转移概率矩阵、初始状态概率矩阵、符号发射概率矩阵 */int main(int argc, char *argv[]){if(argc < 2){cout << "Usage: " << argv[0] << " bmes_file !" 
<< endl;exit(-1);}ifstream fin(argv[1]);if(!fin){cerr << "Open input file " << argv[1] << "filed !" << endl;exit(-1);}int Pi[N] = {0};//初始状态出现次数int A1[N][N] = {0};//二阶状态转移次数int A2[N][N][N] = {0};//三阶状态转移次数int B1[N][M] = {0};//二阶符号发射次数int B2[N][N][M] = {0};//三阶符号发射次数//抽取文件中的状态和观察值string line = "";//存放每一行的内容int line_num = 0;//句子编号int count = 0;while(getline(fin, line)){line_num++;char state;//状态string cchar = "";//一个汉字int i, j, k, m;string::size_type pos = 0;//当前处理位置if((pos = line.find("/", pos + 1)) != string::npos){//抽取句子的第一个状态state = line.at(pos + 1);i = db.getStateIndex(state);Pi[i]++;//抽取句子的第一个观察值cchar = line.substr(pos - 3, 3);m = db.getObservIndex(cchar);B1[i][m]++;if((pos = line.find("/", pos + 1)) != string::npos){//抽取句子的第二个状态state = line.at(pos + 1);j = db.getStateIndex(state);A1[i][j]++;//抽取句子的第二个观察值cchar = line.substr(pos - 3, 3);m = db.getObservIndex(cchar);B1[j][m]++;B2[i][j][m]++;while((pos = line.find("/", pos + 1)) != string::npos){//抽取句子的其他状态state = line.at(pos + 1);k = db.getStateIndex(state);A1[j][k]++;A2[i][j][k]++;//抽取句子的其他观察值cchar = line.substr(pos - 3, 3);m = db.getObservIndex(cchar);B1[k][m]++;B2[j][k][m]++;//准备下次迭代i = j;j = k;}}}}fin.close();//打开输出流ofstream fout_1("Pi.mat");//初始概率矩阵ofstream fout_2("A1.mat");//二阶状态转移矩阵ofstream fout_3("A2.mat");//三阶状态转移矩阵ofstream fout_4("B1.mat");//二阶发射概率矩阵ofstream fout_5("B2.mat");//三阶发射概率矩阵if(!(fout_1 && fout_2 && fout_3 && fout_4 && fout_5)){cerr << "Create Matrix file failed !" 
<< endl;exit(-1);}fout_1 << setprecision(8);fout_2 << setprecision(8);fout_3 << setprecision(8);fout_4 << setprecision(8);fout_5 << setprecision(8);//初始状态矩阵写入文件double arr_pi[N] = {0.0};//turingGood(Pi, arr_pi, N);turingAdd(Pi, arr_pi, N);for(int i = 0; i < N; i++){fout_1 << arr_pi[i] << "\t";}fout_1 << endl;//二阶状态转移矩阵写入文件double arr_a_1[N] = {0.0};for(int i = 0; i < N; i++){//turingGood(A1[i], arr_a_1, N);turingAdd(A1[i], arr_a_1, N);for(int j = 0; j < N; j++){fout_2 << arr_a_1[j] << "\t";}fout_2 << endl;}//三阶状态转移矩阵写入文件double arr_a_2[N] = {0.0};for(int i = 0; i < N; i++){for(int j = 0; j < N; j++){//turingGood(A2[i][j], arr_a_2, N);turingAdd(A2[i][j], arr_a_2, N);for(int k = 0; k < N; k++){fout_3 << arr_a_2[k] << "\t";}fout_3 << endl;}}//二阶发射概率矩阵写入文件double arr_b_1[M] = {0.0};for(int i = 0; i < N; i++){//turingGood(B1[i], arr_b_1, M);turingAdd(B1[i], arr_b_1, M);for(int j = 0; j < M; j++){fout_4 << arr_b_1[j] << "\t";}fout_4 << endl;}//三阶发射概率矩阵写入文件double arr_b_2[M] = {0.0};for(int i = 0; i < N; i++){for(int j = 0; j < N; j++){//turingGood(B2[i][j], arr_b_2, M);turingAdd(B2[i][j], arr_b_2, M);for(int k = 0; k < M; k++){fout_5 << arr_b_2[k] << "\t";}fout_5 << endl;}}fout_1.close();fout_2.close();fout_3.close();fout_4.close();fout_5.close();return 0;}


#ifndef HMM_H
#define HMM_H

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <cstdlib>

const int N = 4;    // number of hidden states
const int M = 4677; // number of distinct observation symbols

using namespace std;

/*
 * Parameters of the hidden Markov model. The arrays hold the values found in
 * the matrix files as they are.
 * NOTE(review): those files are presumably the negative-log matrices written
 * by the training tool - confirm.
 */
class HMM{
public:
    int n;               // number of states
    int m;               // number of observation symbols
    double Pi[N];        // initial state values
    double A1[N][N];     // state -> state transition values
    double A2[N][N][N];  // (state, state) -> state transition values
    double B1[N][M];     // state -> symbol emission values
    double B2[N][N][M];  // (state, state) -> symbol emission values

    HMM();
    HMM(string f_pi, string f_a1, string f_a2, string f_b1, string f_b2);
};

// Everything below is inline because this header is included from more than
// one source file.

/*
 * Reads one text line from `in` and stores its first `len`
 * whitespace-separated values in `row`. Exits with status -1 when the line or
 * a value is missing, because the model would otherwise be filled with stale
 * data. `name` is only used in the error message.
 */
inline void hmmReadRow(istream &in, double *row, int len, const string &name)
{
    string line;
    if (!getline(in, line)) {
        cerr << "HMM : unexpected end of file " << name << endl;
        exit(-1);
    }

    istringstream strstm(line);
    string word;
    for (int i = 0; i < len; i++) {
        if (!(strstm >> word)) {
            cerr << "HMM : too few values in a row of " << name << endl;
            exit(-1);
        }
        // atof is kept on purpose: it also accepts "inf".
        row[i] = atof(word.c_str());
    }
}

/* Creates a model with n = N, m = M and every matrix entry set to zero. */
inline HMM::HMM() : n(N), m(M), Pi(), A1(), A2(), B1(), B2()
{
}

/*
 * Loads the model from five matrix files.
 *
 * Parameters:
 *   f_pi  1 row of N values
 *   f_a1  N rows of N values
 *   f_a2  N*N rows of N values, row (i, j) at position i*N + j
 *   f_b1  N rows of M values
 *   f_b2  N*N rows of M values, row (i, j) at position i*N + j
 *
 * Exits with status -1 when a file cannot be opened or is incomplete.
 */
inline HMM::HMM(string f_pi, string f_a1, string f_a2, string f_b1, string f_b2)
    : n(N), m(M)
{
    ifstream fin_1(f_pi.c_str());
    ifstream fin_2(f_a1.c_str());
    ifstream fin_3(f_a2.c_str());
    ifstream fin_4(f_b1.c_str());
    ifstream fin_5(f_b2.c_str());
    if (!(fin_1 && fin_2 && fin_3 && fin_4 && fin_5)) {
        cerr << "HMM : Open matrix file failed !" << endl;
        exit(-1);
    }

    hmmReadRow(fin_1, Pi, N, f_pi);

    for (int i = 0; i < N; i++) {
        hmmReadRow(fin_2, A1[i], N, f_a1);
    }

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            hmmReadRow(fin_3, A2[i][j], N, f_a2);
        }
    }

    for (int i = 0; i < N; i++) {
        hmmReadRow(fin_4, B1[i], M, f_b1);
    }

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            hmmReadRow(fin_5, B2[i][j], M, f_b2);
        }
    }

    fin_1.close();
    fin_2.close();
    fin_3.close();
    fin_4.close();
    fin_5.close();
}

#endif


#include <iostream>#include <fstream>#include <sstream>#include <string>#include <stack>#include "hmm.h"#include "db.h"using namespace std;HMM hmm("Pi.mat", "A1.mat", "A2.mat", "B1.mat", "B2.mat");//初始化HMM模型DB db("db.txt");//初始化字典/* * Viterbi算法进行分词,二阶马尔柯夫过程 */string viterbiTwo(string str_in){//计算输入句子中的汉字个数int row = str_in.size() / 3;string str_out = "";//如果输入字符串为空,则直接返回空if(row == 0){return str_out;}//如果只有一个字的话,则直接输出即可if(row < 2){str_out = str_in + "/";return str_out;}//分配矩阵空间double **delta = new double *[row];int **path = new int *[row];for(int i = 0; i < row; i++){delta[i] = new double[N]();path[i] = new int[N]();}//中间变量string cchar = "";//存放汉字int min_path = -1;double val = 0.0;double min_val = 0.0;//初始化矩阵,给delta和path矩阵的第一行赋初值cchar = str_in.substr(0, 3);int cchar_num = db.getObservIndex(cchar);for(int i = 0; i < N; i++){delta[0][i] = hmm.Pi[i] + hmm.B1[i][cchar_num];//对数path[0][i] = -1;}//给delta和path的后续行赋值(对数)for(int t = 1; t < row; t++){cchar = str_in.substr(3*t, 3);cchar_num = db.getObservIndex(cchar);for(int j = 0; j < N; j++){min_val = 100000.0;min_path = -1;for(int i = 0; i < N; i++){val = delta[t-1][i] + hmm.A1[i][j];if(val < min_val){min_val = val;min_path = i;}}delta[t][j] = min_val + hmm.B1[j][cchar_num];path[t][j] = min_path;}}//找delta矩阵最后一行的最大值min_val = 100000.0;min_path = -1;for(int i = 0; i < N; i++){if(delta[row-1][i] < min_val){min_val = delta[row-1][i];min_path = i;}}//从min_path出发,回溯得到最可能的路径stack<int> path_st;path_st.push(min_path);for(int i = row - 1; i > 0; i--){min_path = path[i][min_path];path_st.push(min_path);}//释放二维数组for(int i = 0; i < row; i++){delete []delta[i];delete []path[i];}delete []delta;delete []path;//根据标记好的状态序列分词int pos = 0;int index = -1;while(!path_st.empty()){index = path_st.top();path_st.pop();str_out.insert(str_out.size(), str_in, pos, 3);if(index == 2 || index == 3){//状态为E或Sstr_out.append("/");}pos += 3;}}/* * Viterbi算法进行分词:三阶马尔柯夫过程 */string viterbiThree(string str_in){//计算输入句子中的汉字个数int row = str_in.size() / 3;string str_out 
= "";//如果输入字符串为空,则直接返回空if(row == 0){return str_out;}//如果只有一个字的话,则直接输出即可if(row < 2){str_out = str_in + "/";return str_out;}//分配矩阵空间double ***delta = new double **[row];int ***path = new int **[row];for(int i = 0; i < row; i++){delta[i] = new double *[N];path[i] = new int *[N];for(int j = 0; j < N; j++){delta[i][j] = new double[N];path[i][j] = new int[N];for(int k = 0; k < N; k++){delta[i][j][k] = 0.0;path[i][j][k] = 0;}}}//初始化矩阵,给delta和path矩阵的第1个面赋初值//初始状态需要两个面,第0面不赋值,只给第1个面赋值string cchar_1 = str_in.substr(0, 3);//第1个字string cchar_2 = str_in.substr(3, 3);//第2个字int num_1 = db.getObservIndex(cchar_1);//第1个字的编号int num_2 = db.getObservIndex(cchar_2);//第2个字的编号for(int i = 0; i < N; i++){for(int j = 0; j < N; j++){delta[1][i][j] = hmm.Pi[i] + hmm.B1[i][num_1] + hmm.A1[i][j] + hmm.B2[i][j][num_2];//对数path[1][i][j] = -1;}}//中间变量string cchar_3 = "";//存放汉字int min_path = -1;double val = 0.0;double min_val = 0.0;//给delta和path的后续面赋值(对数)//第0、1面为初始面,后续面从2开始,到row-1为止for(int t = 2; t < row; t++){cchar_3 = str_in.substr(3*t, 3);int num_3 = db.getObservIndex(cchar_3);for(int j = 0; j < N; j++){for(int k = 0; k < N; k++){min_val = 100000.0;min_path = -1;for(int i = 0; i < N; i++){val = delta[t-1][i][j] + hmm.A2[i][j][k];if(val < min_val){min_val = val;min_path = i;}}delta[t][j][k] = min_val + hmm.B2[j][k][num_3];path[t][j][k] = min_path;}}}//找delta矩阵最后一个面的最大值,最后一个面为row-1min_val = 100000.0;int min_path_i = -1;int min_path_j = -1;for(int i = 0; i < N; i++){for(int j = 0; j < N; j++){if(delta[row-1][i][j] < min_val){min_val = delta[row-1][i][j];min_path_i = i;min_path_j = j;}}}//从min_path_i和min_path_j出发,回溯得到最可能的路径//回溯从row-1开始,到2为止stack<int> path_st;path_st.push(min_path_j);path_st.push(min_path_i);for(int t = row - 1; t > 1; t--){int min_path_k = path[t][min_path_i][min_path_j];path_st.push(min_path_k);min_path_j = min_path_i;min_path_i = min_path_k;}//释放三维数组for(int i = 0; i < row; i++){for(int j = 0; j < N; j++){delete []delta[i][j];delete []path[i][j];}delete []delta[i];delete 
[]path[i];}delete []delta;delete []path;//根据标记好的状态序列分词int pos = 0;int index = -1;while(!path_st.empty()){index = path_st.top();path_st.pop();str_out.insert(str_out.size(), str_in, pos, 3);if(index == 2 || index == 3){//状态为E或Sstr_out.append("/");}pos += 3;}}


#include <cstdlib>#include <vector>#include <iomanip>#include <map>#include <algorithm>#include <sys/time.h>#include <sys/stat.h>#include "util.h"#include "viterbi.cpp"const long MaxCount = 50000;//需要切分的最大句子数量,若该值大于文件中//实际的句子数量,以实际句子数量为准。//获取当前时间(ms)long getCurrentTime(){struct timeval tv;gettimeofday(&tv, NULL);return tv.tv_sec*1000 + tv.tv_usec/1000;}//获取文件大小unsigned long getFileSize(string file_path){unsigned long filesize = -1;struct stat statbuff;if(stat(file_path.c_str(), &statbuff) < 0){return filesize;}else{filesize = statbuff.st_size;}return filesize;}/* * 函数功能:计算切分标记的位置 * 函数输入:1.strline_in未进行切分的汉字字符串           2.strline_right进行切分后的汉字字符串 * 函数输出:vecetor,其中存放了strline_in中哪些位置放置了分词标记 *         注意:vector中不包含最后标记的位置,但是包含位置0。 */vector<int> getPos(string strline_right, string strline_in){int pos_1 = 0;int pos_2 = -1;int pos_3 = 0;string word = "";vector<int> vec;int length = strline_right.length();while(pos_2 < length){//前面的分词标记pos_1 = pos_2;//后面的分词标记pos_2 = strline_right.find('/', pos_1 + 1);if(pos_2 > pos_1){//将两个分词标记之间的单词取出word  = strline_right.substr(pos_1 + 1, pos_2 - pos_1 - 1);//根据单词去输入序列中查出出现的位置pos_3 = strline_in.find(word, pos_3);//将位置存入数组vec.push_back(pos_3);pos_3 = pos_3 + word.size();}else{break;}}return vec;}/* * 获取标准切分和程序切分的结果 */string getString(string word, int pos, vector<int> vec_right){char ss[1000];int i = 0;int k = 0;if(vec_right.size() == 0){return word;}while(vec_right[i] < pos){i++;}for(int j = 0; j < word.size(); j++){if(j == vec_right[i] - pos){if(j != 0){ss[k] = '/';++k;}++i;}ss[k] = word[j];++k;}ss[k] = '\0';string word_str = ss;return word_str;}/* * 函数功能:获取单个句子切分的结果统计 * 函数输入:1.vec_right 正确的分词标记位置集合 *           2.vec_out   函数切分得到的分词标记位置集合 * 函数输出:返回一个veceor,含有4个元素,分别为: *          切分正确、组合型歧义、未登录词、交集型歧义的数量 * */vector<int> getCount_2(string strline, vector<int> vec_right, vector<int> vec_out, vector<string> &vec_err){vector<int> vec(4, 0);//存放计算结果//建立mapmap<int, int> map_result;for(int i = 0; i < vec_right.size(); 
i++){map_result[vec_right[i]] += 1;}for(int i = 0; i < vec_out.size(); i++){map_result[vec_out[i]] += 2;}//统计map中的信息//若value=1,只在vec_right中//若value=2,只在vec_out中//若value=3,在vec_right和vec_out中都有map<int, int>::iterator p_pre, p_cur;int count_value_1 = 0;int count_value_2 = 0;int count_value_3 = 0;p_pre = map_result.begin();p_cur = map_result.begin();while(p_cur != map_result.end()){while(p_cur != map_result.end() && p_cur -> second == 3){p_pre = p_cur;++count_value_3;//切分正确的数目++p_cur;//迭代器后移}while(p_cur != map_result.end() && p_cur -> second != 3){if(p_cur -> second == 1){++count_value_1;}else if(p_cur -> second == 2){++count_value_2;}++p_cur;}//确定切分错误的字符串if(p_cur == map_result.end() && p_cur == (++p_pre)){continue;}int pos_1 = p_pre -> first;int pos_2 = p_cur -> first; string word = strline.substr(pos_1, pos_2 - pos_1);//切分错误的单词string word_right = getString(word, pos_1, vec_right);//正确的切分方式string word_out = getString(word, pos_1, vec_out);//得到的切分方式 string str_err = "";//不同的错误类型if(count_value_1 > 0 && count_value_2 == 0){str_err = "  组合型歧义: " + word + "    正确切分: " + word_right + "    错误切分: " + word_out;vec_err.push_back(str_err);cout << str_err << endl;vec[1] += count_value_1;}else if(count_value_1 == 0 && count_value_2 > 0){str_err = "  未登录词语: " + word + "    正确切分: " + word_right + "    错误切分: " + word_out;vec_err.push_back(str_err);cout << str_err << endl;vec[2] += count_value_2;}else if(count_value_1 > 0 && count_value_2 > 0){str_err = "  交集型歧义: " + word + "    正确切分: " + word_right + "    错误切分: " + word_out;vec_err.push_back(str_err);cout << str_err << endl;vec[3] += count_value_2;}//计数器复位count_value_1 = 0;count_value_2 = 0;}vec[0] += count_value_3;return vec;}/* * 主函数:进行分词并统计分词结果 * */int main(int argc, char *argv[]){if(argc < 3){cout << "Usage: " << argv[0] << " test_file result_file" << endl;exit(-1);}long time_1 = getCurrentTime();string strline_right;//输入语料:用作标准分词结果string strline_in;//去掉分词标记的语料(用作分词的输入)string strline_out_1;//隐马尔科夫模型(二阶)分词完毕的语料string 
strline_out_2;//隐马尔科夫模型(三阶)分词完毕的语料ifstream fin(argv[1]);//打开输入文件if(!fin){cout << "Unable to open input file !" << argv[1] << endl;exit(-1);}ofstream fout(argv[2]);//确定输出文件if(!fout){cout << "Unable to open output file !" << endl;exit(-1);}long count = 0;//句子编号long count_1 = 0;//隐马尔科夫模型(二阶)切分完全正确的句子总数long count_2 = 0;//隐马尔科夫模型(三阶)切分完全正确的句子总数long count_right_all = 0;//准确的切分总数//二阶long count_out_1_all = 0;//隐马尔科夫模型切分总数long count_out_1_right_all = 0;//隐马尔科夫模型切分正确总数long count_out_1_fail_1_all = 0;//隐马尔科夫模型(组合型歧义)long count_out_1_fail_2_all = 0;//隐马尔科夫模型(未登录词语)long count_out_1_fail_3_all = 0;//隐马尔科夫模型(交集型歧义)//三阶long count_out_2_all = 0;//隐马尔科夫模型切分总数long count_out_2_right_all = 0;//隐马尔科夫模型切分正确总数long count_out_2_fail_1_all = 0;//隐马尔科夫模型(组合型歧义)long count_out_2_fail_2_all = 0;//隐马尔科夫模型(未登录词语)long count_out_2_fail_3_all = 0;//隐马尔科夫模型(交集型歧义)vector<string> vec_err_1;//隐马尔科夫模型(二阶)切分错误的词vector<string> vec_err_2;//隐马尔科夫模型(三阶)切分错误的词while(getline(fin, strline_right, '\n') && count < MaxCount){if(strline_right.length() > 1){//去掉分词标记strline_in = strline_right;strline_in = replace_all(strline_in, "/", "");//隐马尔科夫模型分词strline_out_1 = strline_right;istringstream strstm(strline_in);string sentence;string result_1;string result_2;string line_out_1;string line_out_2;while(strstm >> sentence){//二阶切分result_1 = viterbiTwo(sentence);line_out_1 += result_1;//三阶切分result_2 = viterbiThree(sentence);line_out_2 += result_2;}strline_out_1 = line_out_1;strline_out_2 = line_out_2;//输出分词结果count++;cout << "----------------------------------------------" << endl;cout << "句子编号:" << count << endl;cout << endl;cout << "待分词的句子长度: " << strline_in.length() << "  句子:" << endl;cout << strline_in << endl;cout << endl;cout << "标准比对结果长度: " << strline_right.length() << "  句子:" << endl;cout << strline_right << endl;cout << endl;cout << "隐马尔科夫模型(二阶)分词长度: " << strline_out_1.length() << "  句子:" << endl;cout << strline_out_1 << endl;cout << endl;cout << "隐马尔科夫模型(三阶)分词长度: " << strline_out_2.length() << "  句子:" << endl;cout << 
strline_out_2 << endl;cout << endl;//输出分词结果的数字序列表示vector<int> vec_right = getPos(strline_right, strline_in);vector<int> vec_out_1 = getPos(strline_out_1, strline_in);vector<int> vec_out_2 = getPos(strline_out_2, strline_in);cout << "标准结果:" << endl;for(int i = 0; i < vec_right.size(); i++){cout << setw(4) << vec_right[i];}cout << endl;cout << "隐马尔科夫模型(二阶)分词结果:" << endl;for(int i = 0; i < vec_out_1.size(); i++){cout << setw(4) << vec_out_1[i];}cout << endl;cout << "隐马尔科夫模型(三阶)分词结果:" << endl;for(int i = 0; i < vec_out_2.size(); i++){cout << setw(4) << vec_out_2[i];}cout << endl;//输出匹配的错误列表cout << endl;if(vec_right == vec_out_1){cout << "隐马尔科夫模型(二阶)分词完全正确!" << endl;count_1++;}else{cout << "隐马尔科夫模型(二阶)分词错误列表:" << endl;}vector<int> vec_count_1 = getCount_2(strline_in, vec_right, vec_out_1, vec_err_1);cout << endl;if(vec_right == vec_out_2){cout << "隐马尔科夫模型(三阶)分词完全正确!" << endl;count_2++;}else{cout << "隐马尔科夫模型(三阶)分词错误列表:" << endl;}vector<int> vec_count_2 = getCount_2(strline_in, vec_right, vec_out_2, vec_err_2);//准确的切分数量int count_right = vec_right.size();//切分得到的数量int count_out_1 = vec_out_1.size();int count_out_2 = vec_out_2.size();//切分正确的数量int count_out_1_right = vec_count_1[0];cout << "切分得到:" << count_out_1 << endl;cout << "切分正确:" << count_out_1_right << endl;cout << "隐马尔科夫模型(二阶):" << endl;cout << "  组合型歧义:" << vec_count_1[1] << endl;cout << "  未登录词语:" << vec_count_1[2] << endl;cout << "  交集型歧义:" << vec_count_1[3] << endl;int count_out_2_right = vec_count_2[0];cout << "切分得到:" << count_out_2 << endl;cout << "切分正确:" << count_out_2_right << endl;cout << "隐马尔科夫模型(三阶):" << endl;cout << "  组合型歧义:" << vec_count_2[1] << endl;cout << "  未登录词语:" << vec_count_2[2] << endl;cout << "  交集型歧义:" << vec_count_2[3] << endl;count_right_all += count_right;count_out_1_all += count_out_1;count_out_1_right_all += count_out_1_right;count_out_1_fail_1_all += vec_count_1[1];count_out_1_fail_2_all += vec_count_1[2];count_out_1_fail_3_all += vec_count_1[3];count_out_2_all += 
count_out_2;count_out_2_right_all += count_out_2_right;count_out_2_fail_1_all += vec_count_2[1];count_out_2_fail_2_all += vec_count_2[2];count_out_2_fail_3_all += vec_count_2[3];}}long time_2 = getCurrentTime();unsigned long file_size = getFileSize("test.txt");//打印错误的切分内容cout << endl;cout << "---------------------------------" << endl;cout << "错误样例(已排序):" << endl;//对错误切分内容进行排序并掉重复的sort(vec_err_1.begin(), vec_err_1.end());sort(vec_err_2.begin(), vec_err_2.end());vector<string>::iterator end_unique_1 = unique(vec_err_1.begin(), vec_err_1.end());vector<string>::iterator end_unique_2 = unique(vec_err_2.begin(), vec_err_2.end());int num_1 = end_unique_1 - vec_err_1.begin();int num_2 = end_unique_2 - vec_err_2.begin();cout << "----------------------------------" << endl;cout << "隐马尔科夫模型(二阶)切分错误数量:" << num_1 << endl;for(int i = 0; i < num_1; i++){cout << vec_err_1[i] << endl;}cout << endl;cout << "----------------------------------" << endl;cout << "隐马尔科夫模型(三阶)切分错误数量:" << num_2 << endl;for(int i = 0; i < num_2; i++){cout << vec_err_2[i] << endl;}cout << endl;//计算准确率和召回率double kk_1 = (double)count_out_1_right_all / count_out_1_all;//隐马尔科夫模型(二阶)准确率double kk_2 = (double)count_out_1_right_all / count_right_all;//隐马尔科夫模型(二阶)召回率double kk_3 = (double)count_out_2_right_all / count_out_2_all;//隐马尔科夫模型(三阶)准确率double kk_4 = (double)count_out_2_right_all / count_right_all;//隐马尔科夫模型(三阶)召回率//集中输出结果cout << endl;cout << "---------------------------------" << endl;cout << "分词消耗时间:" << time_2 - time_1 << "ms" << endl;cout << "测试文件大小:" << file_size/1024 << " KB" << endl;cout << "分词速度为:  " << (double)file_size*1000/((time_2 - time_1)*1024) << " KB/s" << endl;cout << endl;cout << "句子总数:" << count << endl;cout << "隐马尔科夫模型(二阶)切分完全正确的句子数目: " << count_1 << "\t ( " << (double)count_1*100/count << " % )" << endl;cout << "隐马尔科夫模型(三阶)切分完全正确的句子数目: " << count_2 << "\t ( " << (double)count_2*100/count << " % )" << endl;cout << endl;cout << "准确的切分总数:" << count_right_all << endl;//准确的切分总数cout << 
"隐马尔科夫模型(二阶)切分总数:" << count_out_1_all << endl;//隐马尔科夫模型切分总数cout << "隐马尔科夫模型(三阶)切分总数:" << count_out_2_all << endl;//隐马尔科夫模型切分总数cout << "隐马尔科夫模型(二阶)切分正确总数:" << count_out_1_right_all << endl;//隐马尔科夫模型切分正确总数cout << "隐马尔科夫模型(三阶)切分正确总数:" << count_out_2_right_all << endl;//隐马尔科夫模型切分正确总数cout << endl;cout << "隐马尔科夫模型(二阶):" << endl;long count_out_1_fail_all = count_out_1_fail_1_all + count_out_1_fail_2_all + count_out_1_fail_3_all;cout << "  组合型歧义:" << count_out_1_fail_1_all << "\t ( " << (double)count_out_1_fail_1_all*100/count_out_1_fail_all << " % )" << endl;cout << "  未登录词语:" << count_out_1_fail_2_all << "\t ( " << (double)count_out_1_fail_2_all*100/count_out_1_fail_all << " % )" << endl;cout << "  交集型歧义:" << count_out_1_fail_3_all << "\t ( " << (double)count_out_1_fail_3_all*100/count_out_1_fail_all << " % )" << endl;cout << endl;cout << "隐马尔科夫模型(三阶):" << endl;long count_out_2_fail_all = count_out_2_fail_1_all + count_out_2_fail_2_all + count_out_2_fail_3_all;cout << "  组合型歧义:" << count_out_2_fail_1_all << "\t ( " << (double)count_out_2_fail_1_all*100/count_out_2_fail_all << " % )" << endl;cout << "  未登录词语:" << count_out_2_fail_2_all << "\t ( " << (double)count_out_2_fail_2_all*100/count_out_2_fail_all << " % )" << endl;cout << "  交集型歧义:" << count_out_2_fail_3_all << "\t ( " << (double)count_out_2_fail_3_all*100/count_out_2_fail_all << " % )" << endl;cout << endl;cout << "统计结果:" << endl;cout << "隐马尔科夫模型(二阶)    准确率:" << kk_1*100 << "%  \t召回率:" << kk_2*100 << "%" << endl;cout << "隐马尔科夫模型(三阶)    准确率:" << kk_3*100 << "%  \t召回率:" << kk_4*100 << "%" << endl;return 0;}

