N-gram

来源:互联网 发布:淘宝客pid查询 编辑:程序博客网 时间:2024/04/29 20:11

#include <iostream>
#include <string.h>
#include <string>
#include <map>
#include <fstream>
#include <set>
#include <vector>
#include <stdio.h>
#include <cstdlib>

using namespace std;

vector<string> words;            // corpus tokens in reading order, stop words removed
set<string> stopWords;           // stop words
map<string, float> dictMakeUni;  // unigram counts, used while building the dictionary
map<string, float> dictMakeBi;   // bigram counts, used while building the dictionary
map<string, float> dictMakeTri;  // trigram counts, used while building the dictionary
map<string, float> dictReadUni;  // unigram probabilities loaded from disk
map<string, float> dictReadBi;   // bigram probabilities loaded from disk
map<string, float> dictReadTri;  // trigram probabilities loaded from disk
vector<string> wordList;         // words of the segmentation currently being explored
const int maxLength = 20;        // longest word in bytes: 10 characters, 2 bytes per Chinese character
const int laplace = 1;           // smoothing value (not referenced anywhere in this file)
string stringBi;                 // best segmentation found by the bigram model
string stringTri;                // best segmentation found by the trigram model
double bestRate = 0;             // probability of the best segmentation found so far
float smallP;                    // small probability used for unseen n-grams; assigned in makeWords()
int allNum = 0;                  // total number of tokens in the corpus

// Counts every run of n consecutive entries of `words` into `dict` (keys are the
// plain concatenation of the n words) and returns the number of runs counted.
// The bound is written as i + n <= size so that a corpus with fewer than n
// tokens yields zero runs instead of underflowing an unsigned subtraction.
static float countNgrams(size_t n, map<string, float> &dict)
{
    dict.clear();
    float total = 0;
    for (size_t i = 0; i + n <= words.size(); i++)
    {
        string key;
        for (size_t k = 0; k < n; k++)
            key += words[i + k];
        dict[key] += 1;  // operator[] value-initialises a missing entry to 0
        total++;
    }
    return total;
}

// Writes `counts` as two parallel text files: one key per line in keyFile and the
// matching relative frequency (count / total) per line in valFile.
// Nothing is written unless both files could be opened.
static void writeDict(const char *keyFile, const char *valFile,
                      const map<string, float> &counts, float total)
{
    ofstream foutKey(keyFile);
    if (!foutKey)
        cout << keyFile << "创建失败" << endl;
    ofstream foutVal(valFile);
    if (!foutVal)
        cout << valFile << "创建失败" << endl;
    if (!foutKey || !foutVal)
        return;

    map<string, float>::const_iterator it;
    for (it = counts.begin(); it != counts.end(); ++it)
    {
        foutKey << it->first << "\n";
        foutVal << it->second / total << "\n";
    }
}

// Loads a dictionary written by writeDict() into `dict`, replacing its contents.
// Lines are read with std::getline into std::string so that long keys cannot
// overflow a fixed buffer, and the loop ends as soon as either file runs out.
static void loadDict(const char *keyFile, const char *valFile, map<string, float> &dict)
{
    dict.clear();
    ifstream finKey(keyFile);
    if (!finKey)
        cout << keyFile << "不存在" << endl;
    ifstream finVal(valFile);
    if (!finVal)
        cout << valFile << "不存在" << endl;
    if (!finKey || !finVal)
        return;

    string key, val;
    while (getline(finKey, key) && getline(finVal, val))
        dict[key] = static_cast<float>(atof(val.c_str()));
}

// Prints one finished segmentation with its probability and stores it in `best`
// when it is at least as probable as the best one seen so far.
static void reportSegmentation(const vector<string> &wordList, double rate, string &best)
{
    string str = "";
    for (size_t i = 0; i < wordList.size(); i++)
        str += wordList[i] + " ";
    cout << str << "\t" << "概率:" << rate << endl;
    if (rate >= bestRate)
    {
        best = str;
        bestRate = rate;
    }
}

// Reads the stop-word list (one entry per line; here just punctuation marks)
// from `file` into stopWords. Prints a message and returns if the file cannot be opened.
void readStopWords(const char *file)
{
    stopWords.clear();
    ifstream fin(file);
    if (!fin)
    {
        cout << "停用词文件不存在" << endl;
        return;
    }
    string line;
    while (getline(fin, line))
        stopWords.insert(line);
}

// Reads the space-separated corpus in `file`, fills `words` with every token that
// is not a stop word, and writes the unigram dictionary to
// dictKeyUni.txt / dictValUni.txt. Also sets allNum and smallP.
void makeWords(const char *file)
{
    words.clear();
    dictMakeUni.clear();

    ifstream fin(file);
    if (!fin)
    {
        cout << "语料库不存在" << endl;
        return;
    }

    float total = 0;
    string line;
    while (getline(fin, line))
    {
        // Split on single spaces only; runs of spaces produce no empty tokens.
        string::size_type pos = 0;
        while (pos < line.size())
        {
            string::size_type stop = line.find(' ', pos);
            if (stop == string::npos)
                stop = line.size();
            if (stop > pos)
            {
                const string word = line.substr(pos, stop - pos);
                if (stopWords.find(word) == stopWords.end())  // not a stop word
                {
                    dictMakeUni[word] += 1;
                    words.push_back(word);
                    total++;
                }
            }
            pos = stop + 1;
        }
    }

    allNum = static_cast<int>(total);
    // Guard against an empty corpus, which would otherwise make smallP infinite.
    smallP = total > 0 ? static_cast<float>(1.0 / total) : 0.0f;

    writeDict("dictKeyUni.txt", "dictValUni.txt", dictMakeUni, total);
}

// Builds the bigram dictionary from `words` and writes it to dictKeyBi.txt / dictValBi.txt.
void makeDictBi()
{
    const float total = countNgrams(2, dictMakeBi);
    writeDict("dictKeyBi.txt", "dictValBi.txt", dictMakeBi, total);
}

// Builds the trigram dictionary from `words` and writes it to dictKeyTri.txt / dictValTri.txt.
void makeDictTri()
{
    const float total = countNgrams(3, dictMakeTri);
    writeDict("dictKeyTri.txt", "dictValTri.txt", dictMakeTri, total);
}

// Loads the unigram dictionary produced by makeWords() into dictReadUni.
void readDictUni()
{
    loadDict("dictKeyUni.txt", "dictValUni.txt", dictReadUni);
}

// Loads the bigram dictionary produced by makeDictBi() into dictReadBi.
void readDictBi()
{
    loadDict("dictKeyBi.txt", "dictValBi.txt", dictReadBi);
}

// Loads the trigram dictionary produced by makeDictTri() into dictReadTri.
void readDictTri()
{
    loadDict("dictKeyTri.txt", "dictValTri.txt", dictReadTri);
}

// Segments `sentence` with the bigram model by recursively trying every candidate
// next word; the best result is left in stringBi (and its probability in bestRate).
//   sentence  text to segment, consumed 2 bytes per character
//   len       length of `sentence` in bytes
//   end       index of the last byte already consumed; pass -1 on the initial call
//   rate      probability accumulated for the words chosen so far
//   index     number of words chosen so far
//   wordList  scratch list holding the words chosen so far
void biGram(string &sentence, int len, int end, double rate, int index, vector<string> &wordList)
{
    if (len <= 2)  // the sentence is a single character
    {
        stringBi = sentence;
        return;
    }
    if (end >= len - 2)  // the whole sentence has been segmented
    {
        reportSegmentation(wordList, rate, stringBi);
        return;
    }
    if (end == -1)  // initial call: reset the state left by a previous sentence
    {
        wordList.clear();
        stringBi = "";
        bestRate = 0;
    }

    string word = "";
    const int first = end + 1;
    // maxLength bounds the length of the candidate word, not the position in the
    // sentence, so sentences longer than maxLength bytes can still be segmented.
    for (int i = first; i < len && i - first < maxLength; i += 2)
    {
        word += sentence.substr(i, 2);

        // A candidate is accepted when it is a single character or a known unigram.
        if (i != first && dictReadUni.find(word) == dictReadUni.end())
            continue;

        double nextRate = rate;  // the first word contributes no factor
        if (index > 0)
        {
            const string preWord = wordList[index - 1];
            map<string, float>::const_iterator bi = dictReadBi.find(preWord + word);
            if (bi == dictReadBi.end())
            {
                nextRate = rate * smallP;  // unseen bigram
            }
            else
            {
                // Conditional probability P(w_i | w_i-1) = P(w_i-1 w_i) / P(w_i-1);
                // falls back to the joint probability when w_i-1 is not a known unigram.
                map<string, float>::const_iterator uni = dictReadUni.find(preWord);
                if (uni != dictReadUni.end())
                    nextRate = rate * (bi->second / uni->second);
                else
                    nextRate = rate * bi->second;
            }
        }

        wordList.push_back(word);
        biGram(sentence, len, i + 1, nextRate, index + 1, wordList);
        wordList.pop_back();
    }
}

// Segments `sentence` with the trigram model; the best result is left in stringTri.
// Parameters have the same meaning as in biGram().
void triGram(string &sentence, int len, int end, double rate, int index, vector<string> &wordList)
{
    if (len <= 2)  // the sentence is a single character
    {
        stringTri = sentence;
        return;
    }
    if (end >= len - 2)  // the whole sentence has been segmented
    {
        reportSegmentation(wordList, rate, stringTri);
        return;
    }
    if (end == -1)  // initial call: reset the state left by a previous sentence
    {
        wordList.clear();
        stringTri = "";
        bestRate = 0;
    }

    string word = "";
    const int first = end + 1;
    // maxLength bounds the length of the candidate word, not the position in the
    // sentence, so sentences longer than maxLength bytes can still be segmented.
    for (int i = first; i < len && i - first < maxLength; i += 2)
    {
        word += sentence.substr(i, 2);

        // A candidate is accepted when it is a single character or a known unigram.
        if (i != first && dictReadUni.find(word) == dictReadUni.end())
            continue;

        double nextRate;
        if (index == 0 || index == 1)
        {
            // The first two words have no trigram context; apply a fixed 0.9 factor.
            nextRate = rate * 0.9;
        }
        else
        {
            const string preWords = wordList[index - 2] + wordList[index - 1];
            map<string, float>::const_iterator tri = dictReadTri.find(preWords + word);
            if (tri == dictReadTri.end())
            {
                nextRate = rate * smallP;  // unseen trigram
            }
            else
            {
                // P(w_i | w_i-2 w_i-1) = P(w_i-2 w_i-1 w_i) / P(w_i-2 w_i-1);
                // falls back to the joint probability when the context is not a known bigram.
                map<string, float>::const_iterator bi = dictReadBi.find(preWords);
                if (bi != dictReadBi.end())
                    nextRate = rate * (tri->second / bi->second);
                else
                    nextRate = rate * tri->second;
            }
        }

        wordList.push_back(word);
        triGram(sentence, len, i + 1, nextRate, index + 1, wordList);
        wordList.pop_back();
    }
}

// Builds the dictionaries from train.txt, loads them back, then segments every
// whitespace-delimited sentence read from standard input with both models.
int main()
{
    // Dictionary construction. The original note says this is only needed on the
    // first run; be aware that smallP is assigned only inside makeWords().
    readStopWords("stopwords.txt");  // read the stop words
    makeWords("train.txt");          // read the corpus, build the unigram dictionary
    makeDictBi();                    // build the bigram dictionary
    makeDictTri();                   // build the trigram dictionary

    readDictUni();  // load the unigram dictionary
    readDictBi();   // load the bigram dictionary
    readDictTri();  // load the trigram dictionary

    string str;
    cout << "请输入句子:\t";
    while (cin >> str)
    {
        int end = -1;
        int index = 0;
        double rate = 1;
        int len = static_cast<int>(str.size());

        cout << "\n二元模型分词过程:\n";
        biGram(str, len, end, rate, index, wordList);  // segment with the bigram model
        cout << "二元模型切分结果:\t" << stringBi << endl << endl;

        end = -1;
        index = 0;
        rate = 1;
        cout << "三元模型分词过程:\n";
        triGram(str, len, end, rate, index, wordList);  // segment with the trigram model
        cout << "三元模型切分结果:\t" << stringTri << endl << endl;

        cout << endl << endl;
        cout << "请输入句子:\t";
    }
    return 0;
}


 

0 0
原创粉丝点击