N-gram
来源:互联网 发布:淘宝客pid查询 编辑:程序博客网 时间:2024/04/29 20:11
// N-gram (unigram / bigram / trigram) word segmentation demo.
//
// Workflow:
//   1. Build unigram, bigram and trigram probability tables from a
//      space-separated training corpus and store each table as a pair of
//      text files (one key per line / one probability per line).
//   2. Load the tables back.
//   3. For every sentence read from stdin, enumerate the candidate
//      segmentations and print the most probable one under the bigram and
//      the trigram model.
//
// NOTE(review): the segmenter steps through the sentence 2 bytes at a time;
// the original author's note says one Chinese character has length 2, so the
// input is presumably in a fixed 2-byte encoding such as GBK — confirm.
#include <stdio.h>
#include <string.h>

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using namespace std;

vector<string> words;            // corpus tokens in order, stop words removed
set<string> stopWords;           // stop words (only punctuation marks here)
map<string, float> dictMakeUni;  // unigram counts while building the dictionary
map<string, float> dictMakeBi;   // bigram counts while building the dictionary
map<string, float> dictMakeTri;  // trigram counts while building the dictionary
map<string, float> dictReadUni;  // unigram probabilities loaded from disk
map<string, float> dictReadBi;   // bigram probabilities loaded from disk
map<string, float> dictReadTri;  // trigram probabilities loaded from disk
vector<string> wordList;         // words of the segmentation being explored
const int maxLength = 20;        // longest word in bytes: 10 characters of 2 bytes each
const int laplace = 1;           // smoothing value (not referenced by the code below)
string stringBi;                 // best segmentation found by the bigram model
string stringTri;                // best segmentation found by the trigram model
double bestRate = 0;             // probability of the best segmentation so far
float smallP;                    // small probability used for unseen n-grams
int allNum = 0;                  // total number of corpus tokens

// Writes `counts` as relative frequencies (count / total): keys go to
// `keyFile`, values to `valFile`, one entry per line, in the same order.
static void writeDict(const map<string, float> &counts, size_t total,
                      const string &keyFile, const string &valFile)
{
    ofstream foutKey(keyFile.c_str());
    if (!foutKey)
        cout << keyFile << "创建失败";
    ofstream foutVal(valFile.c_str());
    if (!foutVal)
        cout << valFile << "创建失败";
    if (!foutKey || !foutVal)
        return;

    const float denom = static_cast<float>(total);
    for (map<string, float>::const_iterator it = counts.begin(); it != counts.end(); ++it)
    {
        foutKey << it->first << "\n";
        foutVal << it->second / denom << "\n";
    }
}

// Loads a dictionary written by writeDict() into `dict`, replacing its
// previous contents. Reading stops at the end of the shorter file; a missing
// file is reported and leaves `dict` empty.
static void readDict(map<string, float> &dict, const string &keyFile, const string &valFile)
{
    dict.clear();
    ifstream finKey(keyFile.c_str());
    if (!finKey)
        cout << keyFile << "不存在";
    ifstream finVal(valFile.c_str());
    if (!finVal)
        cout << valFile << "不存在";

    string key, val;
    // Testing the getline results (not eof()) ends the loop on any stream
    // failure, including a file that could not be opened.
    while (getline(finKey, key) && getline(finVal, val))
        dict[key] = static_cast<float>(atof(val.c_str()));
}

// Returns the factor by which a segmentation's probability is multiplied
// when a word is appended:
//   - smallP if the joint n-gram is unknown;
//   - P(joint) / P(context) if both are known (and P(context) > 0);
//   - P(joint) otherwise.
static double conditionalRate(const map<string, float> &joint, const map<string, float> &context,
                              const string &jointKey, const string &contextKey)
{
    const map<string, float>::const_iterator j = joint.find(jointKey);
    if (j == joint.end())
        return smallP;
    const map<string, float>::const_iterator c = context.find(contextKey);
    if (c != context.end() && c->second > 0)  // guard against dividing by a zero probability
        return j->second / c->second;
    return j->second;
}

// Prints a completed segmentation with its probability and stores it in
// `best` when it is at least as probable as the best one seen so far.
static void recordSegmentation(const vector<string> &segmentation, double rate, string &best)
{
    string str;
    for (size_t i = 0; i < segmentation.size(); ++i)
    {
        str += segmentation[i];
        str += " ";
    }
    cout << str << "\t" << "概率:" << rate << endl;
    if (rate >= bestRate)
    {
        best = str;
        bestRate = rate;
    }
}

// Reads the stop words (one per line) from `file` into stopWords.
void readStopWords(const char *file)
{
    stopWords.clear();
    ifstream fin(file);
    if (!fin)
    {
        cout << "停用词文件不存在";
        return;
    }
    string line;
    while (getline(fin, line))
        stopWords.insert(line);
}

// Reads the space-separated corpus `file`, fills `words` with every token
// that is not a stop word, and writes the unigram dictionary
// (dictKeyUni.txt / dictValUni.txt). Also sets allNum and smallP.
void makeWords(const char *file)
{
    words.clear();
    dictMakeUni.clear();
    ifstream fin(file);
    if (!fin)
    {
        cout << "语料库不存在";
        return;
    }

    size_t total = 0;
    string line;
    while (getline(fin, line))
    {
        // Split on single spaces, skipping empty tokens (same tokenisation
        // as strtok with " " as the delimiter set).
        size_t pos = 0;
        while (pos < line.size())
        {
            size_t next = line.find(' ', pos);
            if (next == string::npos)
                next = line.size();
            if (next > pos)
            {
                const string word = line.substr(pos, next - pos);
                if (stopWords.find(word) == stopWords.end())
                {
                    dictMakeUni[word] += 1;  // a new key starts at 0
                    words.push_back(word);
                    ++total;
                }
            }
            pos = next + 1;
        }
    }

    allNum = static_cast<int>(total);
    smallP = total > 0 ? 1.0f / static_cast<float>(total) : 0.0f;
    writeDict(dictMakeUni, total, "dictKeyUni.txt", "dictValUni.txt");
}

// Builds the bigram dictionary from `words` and writes it to
// dictKeyBi.txt / dictValBi.txt. A key is the concatenation of two
// consecutive words.
void makeDictBi()
{
    dictMakeBi.clear();
    size_t total = 0;
    // `i + 1 < size()` instead of `i < size() - 1`: no unsigned underflow
    // when the corpus is empty.
    for (size_t i = 0; i + 1 < words.size(); ++i)
    {
        dictMakeBi[words[i] + words[i + 1]] += 1;
        ++total;
    }
    writeDict(dictMakeBi, total, "dictKeyBi.txt", "dictValBi.txt");
}

// Builds the trigram dictionary from `words` and writes it to
// dictKeyTri.txt / dictValTri.txt. A key is the concatenation of three
// consecutive words.
void makeDictTri()
{
    dictMakeTri.clear();
    size_t total = 0;
    for (size_t i = 0; i + 2 < words.size(); ++i)
    {
        dictMakeTri[words[i] + words[i + 1] + words[i + 2]] += 1;
        ++total;
    }
    writeDict(dictMakeTri, total, "dictKeyTri.txt", "dictValTri.txt");
}

// Loads the unigram dictionary produced by makeWords().
void readDictUni()
{
    readDict(dictReadUni, "dictKeyUni.txt", "dictValUni.txt");
}

// Loads the bigram dictionary produced by makeDictBi().
void readDictBi()
{
    readDict(dictReadBi, "dictKeyBi.txt", "dictValBi.txt");
}

// Loads the trigram dictionary produced by makeDictTri().
void readDictTri()
{
    readDict(dictReadTri, "dictKeyTri.txt", "dictValTri.txt");
}

// Recursively enumerates segmentations of `sentence` under the bigram model
// and leaves the most probable one in stringBi.
//
//   sentence  text to segment
//   len       length of `sentence` in bytes
//   end       index of the last byte already consumed; -1 on the first call
//   rate      probability of the partial segmentation in `wordList`
//   index     number of words already in `wordList`
//   wordList  partial segmentation (cleared on the first call)
//
// A candidate word is either a single 2-byte unit or an entry of the unigram
// dictionary, and is at most maxLength bytes long.
void biGram(string &sentence, int len, int end, double rate, int index, vector<string> &wordList)
{
    if (len <= 2)  // the sentence is a single character
    {
        stringBi = sentence;
        return;
    }
    if (end >= len - 2)  // the sentence is fully segmented
    {
        recordSegmentation(wordList, rate, stringBi);
        return;
    }
    if (end == -1)  // first call: reset the search state
    {
        wordList.clear();
        stringBi = "";
        bestRate = 0;
    }

    const int first = end + 1;
    string word;
    // The bound is on the word length (i - first), not on the absolute
    // position, so sentences longer than maxLength bytes are segmented too.
    for (int i = first; i < len && i - first < maxLength; i += 2)
    {
        word += sentence.substr(i, 2);
        if (i != first && dictReadUni.find(word) == dictReadUni.end())
            continue;  // multi-character candidates must be dictionary words

        double nextRate = rate;
        if (index > 0)
        {
            // P(w_i | w_i-1) = P(w_i-1 w_i) / P(w_i-1)
            const string &preWord = wordList[index - 1];
            nextRate = rate * conditionalRate(dictReadBi, dictReadUni, preWord + word, preWord);
        }
        wordList.push_back(word);
        biGram(sentence, len, i + 1, nextRate, index + 1, wordList);
        wordList.pop_back();
    }
}

// Recursively enumerates segmentations of `sentence` under the trigram model
// and leaves the most probable one in stringTri. Parameters are the same as
// for biGram(). The first two words of a segmentation have no trigram
// context and each contribute a fixed factor of 0.9.
void triGram(string &sentence, int len, int end, double rate, int index, vector<string> &wordList)
{
    if (len <= 2)  // the sentence is a single character
    {
        stringTri = sentence;
        return;
    }
    if (end >= len - 2)  // the sentence is fully segmented
    {
        recordSegmentation(wordList, rate, stringTri);
        return;
    }
    if (end == -1)  // first call: reset the search state
    {
        wordList.clear();
        stringTri = "";
        bestRate = 0;
    }

    const int first = end + 1;
    string word;
    for (int i = first; i < len && i - first < maxLength; i += 2)
    {
        word += sentence.substr(i, 2);
        if (i != first && dictReadUni.find(word) == dictReadUni.end())
            continue;  // multi-character candidates must be dictionary words

        double nextRate;
        if (index < 2)
        {
            nextRate = rate * 0.9;
        }
        else
        {
            // P(w_i | w_i-2 w_i-1) = P(w_i-2 w_i-1 w_i) / P(w_i-2 w_i-1)
            const string preWords = wordList[index - 2] + wordList[index - 1];
            nextRate = rate * conditionalRate(dictReadTri, dictReadBi, preWords + word, preWords);
        }
        wordList.push_back(word);
        triGram(sentence, len, i + 1, nextRate, index + 1, wordList);
        wordList.pop_back();
    }
}

int main()
{
    // Building the dictionaries is only required on the first run.
    readStopWords("stopwords.txt");  // load the stop words
    makeWords("train.txt");          // read the corpus, build the unigram dictionary
    makeDictBi();                    // build the bigram dictionary
    makeDictTri();                   // build the trigram dictionary

    readDictUni();  // load the unigram dictionary
    readDictBi();   // load the bigram dictionary
    readDictTri();  // load the trigram dictionary

    string str;
    cout << "请输入句子:\t";
    while (cin >> str)
    {
        const int len = static_cast<int>(str.size());

        cout << "\n二元模型分词过程:\n";
        biGram(str, len, -1, 1, 0, wordList);  // segment with the bigram model
        cout << "二元模型切分结果:\t" << stringBi << endl << endl;

        cout << "三元模型分词过程:\n";
        triGram(str, len, -1, 1, 0, wordList);  // segment with the trigram model
        cout << "三元模型切分结果:\t" << stringTri << endl << endl;

        cout << endl << endl;
        cout << "请输入句子:\t";
    }
    return 0;
}
0 0
- n-Gram
- N-gram
- N-gram
- n-gram
- n-gram
- N-gram模型
- N-gram模型
- N-gram算法
- N-gram模型
- N-gram模型
- N-gram模型
- N-Gram学习笔记
- N-Gram的数据结构
- N-gram模型
- N-gram模型
- N-gram模型
- n-gram模型
- 语言模型n-gram
- centos7安装codeblocks16.01
- git rebase 进阶
- 分布式架构中一致性解决方案——Zookeeper集群搭建
- 通向架构师的道路(第十一天)之Axis2 Web Service(二)
- 李开复给中国计算机系大学生的7点建议
- N-gram
- Java之美[从菜鸟到高手演变]之设计模式四
- 通向架构师的道路(第十二天)之Axis2 Web Service(三)
- CodeForces 359A - Table(思维)
- js 全局变量、局部变量的作用域;变量声明提前;无块级作用域
- 第十三章上机练习
- Linux Driver APIs - interrupt and irq
- 华为组(三)
- Java finalize方法使用