一阶HMM词性标注

来源：互联网发布：mmd随心所欲动作数据编辑：程序博客网时间：2024/05/01 12:22

手头的语料库依然是msr_training.utf8和msr_test.utf8，它来自于自于SIGHAN Bakeoff 2005的 icwb2-data.rar

1.rmspace.cpp研究院的训练文档是已经分好词，但我们并不需要这个结果，我们要使用计算所有分词系统重新进行分词并进行词性标注，所以第一步要把训练文档中行内的空格去掉。

#include<iostream>#include<fstream>#include<sstream>#include<string>using namespace std;int main(int argc,char *argv[]){    if(argc<3){        cerr<<"Usage:"<<argv[0]<<" inputfile outputfile"<<endl;        return 1;    }    ifstream ifs(argv[1]);    ofstream ofs(argv[2]);    if(!(ifs && ofs)){        cerr<<"open file failed."<<endl;        return 1;    }    string line,word,line_out;    while(getline(ifs,line)){        line_out.clear();        istringstream strstm(line);        while(strstm>>word)            line_out+=word;        ofs<<line_out<<endl;    }    ifs.close();    ofs.close();    return 0;}

2.对第1步得到的输出文件还需要稍作修善，即把每行句首和句尾的双引号去掉。这个可以用vim来完成：1,$s/^“//g 1,$s/”$//g

3.wordseg.cpp对第2步得到的输出文件进行分词。g++ wordseg.cpp -o wordseg -I/home/orisun/master/ICTCLAS50_Linux_RHAS_32_C/API -lICTCLAS50运行命令时注意要把libICTCLAS50.so拷贝到当前目录下。

#include <string>#include <iostream>#define OS_LINUX#include "ICTCLAS50.h"using namespace std; int main(int argc, char *argv[]){    if (argc < 2) {      //命令行中需要给定要处理的文件名        cout << "Usage:command filename" << endl;        return 1;    }    string filename = argv[1];    string outfile = filename + ".ws";    string initPath = "/home/orisun/master/ICTCLAS50_Linux_RHAS_32_C/API";    if (!ICTCLAS_Init(initPath.c_str())) {        cout << "Init fails" << endl;        return -1;    }    ICTCLAS_FileProcess(filename.c_str(), outfile.c_str(), CODE_TYPE_UTF8,1);    ICTCLAS_Exit();    return 0;}

4.由于我们要做的是词性标注，所以先要对测试文档进行分词。仍然使用wordseg.cpp。

5.rmpos.cpp计算所的分词系统在分词的同时也做了词性标注（修改配置文件Configure.xml是不起作用的）,所以现在还得把测试文本中标注好的词性去掉。

#include<iostream>#include<fstream>#include<sstream>#include<string>using namespace std;int main(int argc,char *argv[]){    if(argc<3){        cerr<<"Usage:"<<argv[0]<<" inputfile outputfile"<<endl;        return 1;    }    ifstream ifs(argv[1]);    ofstream ofs(argv[2]);    if(!(ifs && ofs)){        cerr<<"open file failed."<<endl;        return 1;    }    string line,word,line_out,chinese;    while(getline(ifs,line)){        line_out.clear();        istringstream strstm(line);        while(strstm>>word){        string::size_type pos=word.find("/");        chinese=word.substr(0,pos);            line_out+=chinese+" ";        }        ofs<<line_out<<endl;    }    ifs.close();    ofs.close();    return 0;}

6.对训练文本（即第3步的输出）也实行rmpos.cpp。

7.createdict.cpp第5步和第6步生成了训练集和测试集中出现的所有词语和标点符号，现在要把它们都存入GDBM数据库。

#include<sys/stat.h>#include<gdbm.h>#include<iostream>#include<string>#include<fstream>#include<sstream>using namespace std;int main(int argc,char *argv[]){if(argc<2){cerr<<"Usage: "<<argv[0]<<" inputfile";return 1;}ifstream ifs(argv[1]);if(!ifs){cerr<<"open file failed."<<endl;return 1;}GDBM_FILE dbm_ptr;    dbm_ptr = gdbm_open("dict_db",0,GDBM_WRCREAT,S_IRUSR | S_IWUSR,NULL);   char v='w';datum key,value;value.dptr=&v;value.dsize=1;string line,word;while(getline(ifs,line)){    istringstream strstm(line);    while(strstm>>word){    char *chinese=const_cast<char*>(word.c_str());    key.dptr=chinese;    key.dsize=word.size();    //cout<<chinese<<"\t"<<word.size()<<endl;    gdbm_store(dbm_ptr,key,value,GDBM_REPLACE);    }    }        ifs.close();gdbm_close(dbm_ptr);return 0;}

8.indexword.cpp对数据库中所有的词语（包含标点）进行序号的标记。

#include<stdio.h>#include<string.h>#include<stdlib.h>#include<sys/stat.h>#include<gdbm.h>#include<ctype.h>#define DB_FILE_BLOCK "dict_db"int main(int argc,char* argv[]){GDBM_FILE dbm_ptr;    dbm_ptr = gdbm_open(DB_FILE_BLOCK,0,GDBM_WRCREAT,S_IRUSR | S_IWUSR,NULL);    datum key,data;    long index=0;//从0开始编号    char index_str[10]={0};    for(key=gdbm_firstkey(dbm_ptr);key.dptr;key=gdbm_nextkey(dbm_ptr,key)){    data=gdbm_fetch(dbm_ptr,key);    bzero(index_str,sizeof(index_str));    sprintf(index_str,"%ld",index++);    data.dptr=index_str;    data.dsize=sizeof(index_str);    gdbm_store(dbm_ptr,key,data,GDBM_REPLACE);    }    gdbm_close(dbm_ptr);    return 0;}

9.query.c和lookup.c（可选辅助）前者打印输出数据库中的所有数据，后者根据用户输出的key去GDBM中查询对应的value。

#include<stdio.h>#include<string.h>#include<stdlib.h>#include<sys/stat.h>#include<gdbm.h>#include<ctype.h>#define DB_FILE_BLOCK "dict_db"int main(int argc,char* argv[]){GDBM_FILE dbm_ptr;    dbm_ptr = gdbm_open(DB_FILE_BLOCK,0,GDBM_READER,S_IRUSR | S_IWUSR,NULL);    datum key,data;    for(key=gdbm_firstkey(dbm_ptr);key.dptr;key=gdbm_nextkey(dbm_ptr,key)){    data=gdbm_fetch(dbm_ptr,key);    printf("%s--%s\t",key.dptr,data.dptr);    }    printf("\n");    gdbm_close(dbm_ptr);    return 0;}

#include<sys/stat.h>#include<gdbm.h>#include<stdio.h>#include<string.h>#include<stdlib.h>int main(int argc,char *argv[]){char *word=(char*)malloc(50);GDBM_FILE dbm_ptr;dbm_ptr=gdbm_open("dict_db",0,GDBM_WRCREAT,S_IRUSR | S_IWUSR,NULL);datum key,value;while(1){printf("please input a word.\n");bzero(word,50);scanf("%s",word);if(strcmp(word,"exit")==0)break;key.dptr=word;key.dsize=strlen(word);value=gdbm_fetch(dbm_ptr,key);if(value.dsize==0){printf("%s not exist in dict.\n",word);}else{printf("%s--%s\n",key.dptr,value.dptr);}}gdbm_close(dbm_ptr);return 0;}

10.AMatrix.cpp统计训练文本（当然是第3步的输出）生成状态转移矩阵和初始状态概率矩阵，分别写入A.mat和PI.mat。

header.h头文件中主要包含ICTCLAS的词性标注集和Good-Turing平滑算法。

#ifndef _HEADER_H#define _HEADER_H#include<vector>#include<list>#include<map>using namespace std;const int POS_NUM=97;//计算所汉语词性标记集去掉标点符号共有POS_NUM个元素/*POS_NUM种词性，即POS_NUM种状态*/string posarr[POS_NUM]={"n","nr","nr1","nr2","nrj","nrf","ns","nsf","nt","nz","nl","ng","t","tg","s","f","v","vd","vn","vshi","vyou","vf","vx","vi","vl","vg","a","ad","an","ag","al","b","bl","z","r","rr","rz","rzt","rzs","rzv","ry","ryt","rys","ryv","Rg","m","mq","Mg","q","qv","qt","d","dl","dg","p","pba","pbei","c","cc","u","uzhe","ule","uguo","ude1","ude2","ude3","usuo","udeng","uyy","udh","uls","uzhi","ulian","e","y","o","h","k","x","xx","xu","w","wkz","wky","wyz","wyy","wj","ww","wd","wf","wn","wm","ws","wp","wb","wh","wt"};void goodturing(const int count[],double prob[],int len){map<int, list<int> > count_map;//map可以自动按key排好序int N=0;for(int i=0;i<len;++i){int c=count[i];N+=c;map<int, list<int> >::const_iterator itr;itr=count_map.find(c);if(itr==count_map.end()){list<int> l;l.push_back(i);count_map[c]=l;}else{count_map[c].push_back(i);}}map<int, list<int> >::const_iterator iter=count_map.begin();while(iter!=count_map.end()){double pr;int r=iter->first;int nr=iter->second.size();if(++iter!=count_map.end()){int r_new=iter->first;if(r_new=r+1){int nr_1=iter->second.size();pr=(1.0+r)*nr_1/(N*nr);}else{pr=1.0*r/N;}}else{pr=1.0*r/N;}list<int> l=(--iter)->second;list<int>::const_iterator itr1=l.begin();while(itr1!=l.end()){int index=*itr1;itr1++;prob[index]=pr;}++iter;}//概率归一化double sum=0;for(int i=0;i<len;++i)sum+=prob[i];for(int i=0;i<len;++i)prob[i]/=sum;}#endif

#include<iostream>#include<string>#include<fstream>#include<sstream>#include<vector>#include<algorithm>#include<iomanip>#include<iterator>#include<cassert>#include"header.h"int A[POS_NUM][POS_NUM];//记录状态间转移的次数int PI[POS_NUM];//记录各种状态出现的次数inline int indexof(string search){for(int i=0;i<POS_NUM;++i){if(search==posarr[i]){return i;}}return -1;}int main(int argc,char *argv[]){if(argc<2){cout<<"Usage:"<<argv[0]<<" pos_tagged_file"<<endl;return 1;}//打开输入文件ifstream ifs(argv[1]);if(!ifs){cerr<<"open file "<<argv[1]<<" failed."<<endl;return 1;}string line,word;while(getline(ifs,line)){istringstream strstm(line);string pre,post;//pre是前一个状态，post是后一个状态strstm>>word;string::size_type pos=word.find("/");post=word.substr(pos+1);int index1,index2;index2=indexof(post);if(index2<0){cout<<post<<" not exist"<<endl;return 1;}PI[index2]++;while(strstm>>word){pre=post;pos=word.find("/");post=word.substr(pos+1);//cout<<"pre="<<pre<<"\tpost="<<post<<endl;index1=indexof(pre);//if(index1<0){//cout<<pre<<" not exist"<<endl;//return 1;//}index2=indexof(post);//if(index2<0){//cout<<post<<" not exist"<<endl;//return 1;//}A[index1][index2]++;PI[index2]++;}}ifs.close();ofstream ofs1("A.mat");ofstream ofs2("PI.mat");if(!(ofs1 && ofs2)){cerr<<"create file failed."<<endl;return 1;}ofs1<<setprecision(8);ofs2<<setprecision(8);double arr_out[POS_NUM]={0.0};for(int i=0;i<POS_NUM;++i){goodturing(A[i],arr_out,POS_NUM);for(int j=0;j<POS_NUM;++j){ofs1<<arr_out[j]<<"\t";}ofs1<<endl;}goodturing(PI,arr_out,POS_NUM);for(int j=0;j<POS_NUM;++j){ofs2<<arr_out[j]<<"\t";}ofs2<<endl;ofs1.close();ofs2.close();return 0;}

11.BMatrix.cpp统计训练文本（当然是第3步的输出）生成发射矩阵，写入B.mat。

#include<iostream>#include<fstream>#include<sstream>#include<string>#include<iomanip>#include<cassert>#include<cstdlib>#include<gdbm.h>#include<sys/stat.h>#include"header.h"const int TERM_NUM=70000;int matrix[POS_NUM][TERM_NUM]={0.0};//混淆矩阵（或称发射矩阵）inline int indexof(string search){for(int i=0;i<POS_NUM;++i){if(search==posarr[i]){return i;}}return -1;}int main(int argc,char *argv[]){if(argc<2){cout<<"Usage: "<<argv[0]<<" pos_tagged_file"<<endl;return 1;}ifstream ifs(argv[1]);if(!ifs){cerr<<"open file "<<argv[1]<<" failed."<<endl;return 1;}GDBM_FILE dbm_ptr;dbm_ptr=gdbm_open("dict_db",0,GDBM_READER,S_IRUSR|S_IWUSR,NULL);datum key,value;string line,word,term,pos;string slash="/";while(getline(ifs,line)){istringstream strstm(line);while(strstm>>word){string::size_type loc=word.find(slash);assert(loc!=string::npos);term=word.substr(0,loc);//词语pos=word.substr(loc+1);//词性//cout<<term<<"\t"<<pos<<endl;int rowindex=indexof(pos);assert(rowindex>=0);key.dsize=term.size();key.dptr=const_cast<char*>(term.c_str());value=gdbm_fetch(dbm_ptr,key);int colindex=atoi(value.dptr);//cout<<rowindex<<"\t"<<colindex<<endl;matrix[rowindex][colindex]++;}}ifs.close();gdbm_close(dbm_ptr);//将发射矩阵写入文件ofstream ofs("B.mat");if(!ofs){cerr<<"create file B.mat failed."<<endl;return 1;}ofs<<setprecision(8);double arr_out[TERM_NUM]={0.0};for(int i=0;i<POS_NUM;++i){goodturing(matrix[i],arr_out,TERM_NUM);for(int j=0;j<TERM_NUM;++j){ofs<<arr_out[j]<<"\t";}ofs<<endl;}ofs.close();return 0;}

12.postag.cpp对测试文本（第5步的输出）进行词性标注。

#include<sys/stat.h>#include<ctype.h>#include<gdbm.h>#include<iostream>#include<sstream>#include<fstream>#include<string>#include<cstring>#include<cstdlib>#include<stack>#include<vector>#include"header.h"const string DB_FILE_BLOCK="dict_db";const int TERM_NUM=70000;const int TERM_MAXLEN=100;GDBM_FILE dbm_ptr;double PI[POS_NUM];//初始状态概率矩阵double A[POS_NUM][POS_NUM];//状态转移矩阵double B[POS_NUM][TERM_NUM];//发射矩阵/*从文件中读出HMM模型参数*/void initHMM(string f1,string f2,string f3){ifstream ifs1(f1.c_str());ifstream ifs2(f2.c_str());ifstream ifs3(f3.c_str());if(!(ifs1 && ifs2 && ifs3)){cerr<<"Open file failed!"<<endl;exit(1);}//读取PIstring line;if(getline(ifs1,line)){istringstream strstm(line);string word;for(int i=0;i<POS_NUM;++i){strstm>>word;PI[i]=atof(word.c_str());}}else{cerr<<"Read PI failed!"<<endl;exit(1);}//读取Afor(int i=0;i<POS_NUM;++i){getline(ifs2,line);istringstream strstm(line);string word;for(int j=0;j<POS_NUM;++j){strstm>>word;A[i][j]=atof(word.c_str());}}//读取Bfor(int i=0;i<POS_NUM;++i){getline(ifs3,line);istringstream strstm(line);string word;for(int j=0;j<TERM_NUM;++j){strstm>>word;B[i][j]=atof(word.c_str());}}ifs1.close();ifs2.close();ifs3.close();}/*Viterbi算法进行词性标注*/void viterbi(vector<string> terms,string &result){if(terms.size()==0)return;result.clear();int row=terms.size();//观察序列的长度double **Q=new double*[row];//初始化Q矩阵for(int i=0;i<row;++i)Q[i]=new double[POS_NUM]();int **Path=new int*[row];//初始化Path矩阵for(int i=0;i<row;++i)Path[i]=new int[POS_NUM]();//给Q和Path矩阵的第1行赋值datum key,data;char chinese[TERM_MAXLEN]={0};char *bp=const_cast<char*>(terms[0].c_str());strncpy(chinese,bp,terms[0].size());//读取句子中的第1个词key.dptr=chinese;key.dsize=terms[0].size();data=gdbm_fetch(dbm_ptr,key);//从数据库中获取汉字对应的index，该index对应发射矩阵的列int colindex=atoi(data.dptr);for(int i=0;i<POS_NUM;++i){Path[0][i]=-1;Q[0][i]=PI[i]*B[i][colindex];}//给Q和Path矩阵的后续行赋值for(int i=1;i<row;++i){bp=const_cast<char*>(terms[i].c_str());strncpy(chinese,bp,terms[i].size());//读取句子中的下一个汉字key.dptr=chinese;key.dsize=terms[i].size();data=gdbm_fetch(dbm_ptr,key);colindex=atoi(data.dptr);for(int j=0;j<POS_NUM;++j){double max=-1.0;int maxindex=-1;for(int k=0;k<POS_NUM;++k){double product=Q[i-1][k]*A[k][j];if(product>max){max=product;maxindex=k;}}Q[i][j]=max*B[j][colindex];Path[i][j]=maxindex;}}//找Q矩阵最后一行的最大值double max=-1.0;int maxindex=-1;for(int i=0;i<POS_NUM;++i){if(Q[row-1][i]>max){max=Q[row-1][i];maxindex=i;}}//从maxindex出发，根据Path矩阵找出最可能的状态序列stack<int> st;st.push(maxindex);for(int i=row-1;i>0;--i){maxindex=Path[i][maxindex];st.push(maxindex);}//释放二维数组for(int i=0;i<row;++i){delete []Q[i];delete []Path[i];}delete []Q;delete []Path;//根据标记好的状态序列分词int mark=-1;for(int i=0;i<terms.size();++i){mark=st.top();st.pop();result+=terms[i]+"/"+posarr[mark]+"\t";}}int main(int argc,char *argv[]){if(argc<3){cout<<"Usage: "<<argv[0]<<" inputfile outputfile"<<endl;return 1;}dbm_ptr = gdbm_open(DB_FILE_BLOCK.c_str(),0,GDBM_READER,S_IRUSR | S_IWUSR,NULL);initHMM("PI.mat","A.mat","B.mat");ifstream ifs(argv[1]);ofstream ofs(argv[2]);if(!(ifs&&ofs)){cerr<<"Open file failed!"<<endl;return 1;}string line;//循环读取每一行while(getline(ifs,line)){istringstream strstm(line);string term;vector<string> term_vec;string result;while(strstm>>term){term_vec.push_back(term);}viterbi(term_vec,result);ofs<<result<<endl;}ifs.close();ofs.close();gdbm_close(dbm_ptr);return 0;}

看一下效果吧，左边是ICTCLAS的pos-tagging结果，作为标准答案，右边是我用一阶HMM词性标注的结果。

使用简单的加1平滑：

可以看到词性标注准确度还很低，并且"mq"贡献了大部分的错误率。

使用Good-Turing平滑后的效果，大体上已经看不出有什么错误：