利用SRILM 对注音的句子进行补全(Viterbi)

来源:互联网 发布:制作软件的工具 编辑:程序博客网 时间:2024/04/28 20:01

给一个句子,部分汉字是用注音的声母代替的,还原成原始的句子

感悟:

调用接受 char* 参数的函数时,记得在字符数组的末尾加上终止符 '\0';

文件记得及时关闭;

作用域 may matter

用 map<string,string> 存 Big5 字符很不好用(big5 sucks),所以改用 map<int,string>,把每个 Big5 字符的两个字节转成一个 int 作为键

char* to int:把 Big5 字符的两个字节拼成一个 int(高字节左移 8 位,再加上低字节)


#include <iostream>#include <fstream>#include <string>#include <sstream>#include <map>#include "Ngram.h"#include "Prob.h"using namespace std;string deleteBlank(string s);map<int, string> readMap(map<int, string> zyMap);double gramProb(const char *w1, const char *w2);string bestStr(string s,map<int,string> zyMap);int big5ToInt(char *ch);string testPath="./testdata/";string resultPath="./result2/";string mapPath="ZhuYin-Big5.map";Vocab voc;Ngram lm( voc,2);int main(){map<int, string> zyMap;zyMap=readMap(zyMap);File lmFile("./bigram.lm", "r" );lm.read(lmFile);lmFile.close();int i=1;for(i=1;i<11;i++){stringstream ss;string s;ss<<i;ss>>s;string path1=testPath+s+".txt";string path2=resultPath+s+".txt";ifstream infile(path1.c_str());ofstream outfile(path2.c_str(),ios::app);string line;while(getline(infile,line)){//const  char *cline=line.c_str();string newline=deleteBlank(line);string bestString=bestStr(newline,zyMap);outfile<<bestString.c_str();}infile.close();outfile.close();}return 0;}int big5ToInt(char *ch){int a=ch[1]+(ch[0]<<8);return a;}//delete the blank in the stringstring deleteBlank(string s){string::iterator it;for(it=s.begin();it!=s.end();){if(*it==' ')it=s.erase(it);else++it;}return s;}//read the mapmap<int, string> readMap(map<int, string> zyMap){ifstream infile(mapPath.c_str());string line;int i=0;while(getline(infile,line)){char w[2];line=deleteBlank(line);w[0]=line.c_str()[0];w[1]=line.c_str()[1];int key=w[1]+(w[0]<<8);string value=line.substr(2,line.length()-2);zyMap[key]=value;}return zyMap;}//compute the most possible stringstring bestStr(string s,map<int,string> zyMap){int k=0;string bestString;//double candidateNum[s.length()/2];int trackMatrix[s.length()/2][10000];double vMatrix[s.length()/2][10000];memset(vMatrix,0.0,5000*s.length()*sizeof(double));memset(trackMatrix,0,5000*s.length()*sizeof(int));int maxId=0;double max=LogP_Zero;;double prob;char wo1[3];char wo2[3];wo1[2]='\0';wo2[2]='\0';char w1[2];char w2[2];int word1;int word2;int i=0;int 
j=0;double maxPro=LogP_Zero;string values1;string values2;int l2;int l1;int sl=s.length();for(k=0;k<=sl-4;k+=2){w1[0]=s.c_str()[k];w1[1]=s.c_str()[k+1];w2[0]=s.c_str()[k+2];w2[1]=s.c_str()[k+3];word1=w1[1]+(w1[0]<<8);word2=w2[1]+(w2[0]<<8);values1=zyMap[word1];values2=zyMap[word2];l2=values2.length();l1=values1.length();for(j=0;j<l2/2;j++){maxPro=LogP_Zero;wo2[0]=values2.c_str()[2*j];wo2[1]=values2.c_str()[2*j+1];for(i=0;i<l1/2;i++){wo1[0]=values1.c_str()[2*i];wo1[1]=values1.c_str()[2*i+1];prob=gramProb(wo1,wo2);if((prob+vMatrix[k/2][i])>maxPro){maxPro=prob+vMatrix[k/2][i];trackMatrix[k/2+1][j]=i;}}if(k==(sl-4)&&maxPro>max){maxId=j;max=maxPro;}vMatrix[k/2+1][j]=maxPro;}}//cout<<"maxId:"<<maxId<<endl;//back tracking string resultStr="";int word;char bestWord[3];bestWord[2]='\0';string values;for(k=sl/2-1;k>=0;k--){word=s.c_str()[2*k+1]+(s.c_str()[2*k]<<8);values=zyMap[word];bestWord[0]=values.c_str()[maxId*2];bestWord[1]=values.c_str()[maxId*2+1];resultStr=" "+string(bestWord)+resultStr ;if(k>0)maxId=trackMatrix[k][maxId];}resultStr="<s> "+resultStr+"  </s>\n";return resultStr;}double gramProb(const char *w1, const char *w2){VocabIndex wid1 = voc.getIndex(w1);VocabIndex wid2 = voc.getIndex(w2);if(wid1 == Vocab_None)  //OOVwid1 = voc.getIndex(Vocab_Unknown);if(wid2 == Vocab_None)  //OOVwid2 = voc.getIndex(Vocab_Unknown);VocabIndex context[] = { wid1, Vocab_None };return lm.wordProb( wid2, context);}


0 0
原创粉丝点击