/**
 * Naive Bayes spam/ham e-mail classifier using a bag-of-words model.
 *
 * code list 4-1 : transfer func from docs list to vocabulary list
 * code list 4-2 : training func on Naive Bayes Classifier
 * code list 4-3 : naive bayes classify function
 * add code list 4-4 : naive bayes bag-of-word model
 * add code list 4-5 : text parse : textParse.py and spam email test function : get_error_rate()
 *
 * Input layout: ./email/hamParse/<n>.dat and ./email/spamParse/<n>.dat for
 * n = 1..25, each file holding whitespace-separated words.
 */
#include <iostream>
#include <map>
#include <set>
#include <cmath>
#include <vector>
#include <algorithm>
#include <numeric>
#include <cstring>
#include <string>
#include <sstream>
#include <stdio.h>
#include <cstdlib>
#include <fstream>
#include <stdlib.h>
//#include<unistd.h>
#include <string.h>

using namespace std;

/**
 * Two-class naive Bayes classifier (class 0 = ham, class 1 = spam).
 *
 * Typical call order: construct (loads the training documents), then
 * create_vocab_list(), get_train_matrix(), train_NB0(), and finally
 * classify_NB() / get_error_rate().
 */
class NaiveBayes
{
private:
    // Documents are numbered 1..kDocsPerClass in each of the two directories.
    enum { kDocsPerClass = 25 };

    vector< vector<string> > list_of_docs;  // word list of every training document
    vector<int> list_classes;               // class label of every training document (0 or 1)
    map<string, int> my_vocab_list;         // word -> 1-based vocabulary index
    vector<int> return_vec;                 // word-count vector produced by beg_of_words_to_vec()
    vector< vector<int> > train_mat;        // one word-count row per training document
    vector<float> p0vect;                   // log P(word | class 0), indexed by vocabulary index
    vector<float> p1vect;                   // log P(word | class 1), indexed by vocabulary index
    float p_abusive;                        // fraction of training documents labelled 1
    int test_data_num;                      // documents 1..test_data_num of each class are held out

    /** Builds "./email/<dir>/<number>.dat" without any fixed-size buffer. */
    static string doc_path(const char *dir, int number)
    {
        ostringstream path;
        path << "./email/" << dir << "/" << number << ".dat";
        return path.str();
    }

    /** Reads all whitespace-separated words of a file; returns false if it cannot be opened. */
    static bool read_words(const string &filename, vector<string> &words)
    {
        ifstream in(filename.c_str());
        if (!in)
        {
            return false;
        }
        words.clear();
        string word;
        while (in >> word)
        {
            words.push_back(word);
        }
        return true;
    }

    /**
     * Loads documents test_data_num+1 .. kDocsPerClass from ./email/<dir>/
     * and labels each with `label`. Exits with status 1 if a file is missing,
     * so the classifier is never trained on a silently empty document.
     */
    void load_training_docs(const char *dir, int label)
    {
        for (int i = test_data_num + 1; i <= kDocsPerClass; i++)
        {
            const string filename = doc_path(dir, i);
            vector<string> words;
            if (!read_words(filename, words))
            {
                cerr << "open the file " << filename << " error" << endl;
                exit(1);
            }
            list_of_docs.push_back(words);
            list_classes.push_back(label);
        }
    }

    /**
     * Converts a word list into a count vector of length vocabulary size + 1.
     * Slot 0 is unused because vocabulary indices start at 1. Words that are
     * not in the vocabulary are ignored and are NOT inserted into it.
     */
    vector<int> count_words(const vector<string> &words) const
    {
        vector<int> counts(my_vocab_list.size() + 1, 0);
        for (size_t k = 0; k < words.size(); k++)
        {
            map<string, int>::const_iterator hit = my_vocab_list.find(words[k]);
            if (hit != my_vocab_list.end() && hit->second != 0)
            {
                counts[hit->second] += 1;
            }
        }
        return counts;
    }

public:
    /**
     * Asks on stdin how many documents per class to hold out for testing,
     * then loads the remaining ham and spam documents as training data.
     * Exits with status 1 on invalid input or if a training file is missing.
     */
    NaiveBayes() : p_abusive(0.0f), test_data_num(0)
    {
        cout << "please input the num of test data which should be less than 24 : " << endl;
        // At least one training document per class must remain, and negative
        // values would ask for non-existent files.
        if (!(cin >> test_data_num) || test_data_num < 0 || test_data_num >= kDocsPerClass)
        {
            cerr << "invalid number of test documents; expected an integer in [0, "
                 << (kDocsPerClass - 1) << "]" << endl;
            exit(1);
        }
        load_training_docs("hamParse", 0);
        load_training_docs("spamParse", 1);
    }

    /** Assigns a unique 1-based index to every distinct word of the training documents. */
    void create_vocab_list()
    {
        int next_index = static_cast<int>(my_vocab_list.size()) + 1;
        for (size_t d = 0; d < list_of_docs.size(); d++)
        {
            const vector<string> &words = list_of_docs[d];
            for (size_t k = 0; k < words.size(); k++)
            {
                if (my_vocab_list.find(words[k]) == my_vocab_list.end())
                {
                    my_vocab_list[words[k]] = next_index++;
                }
            }
        }
    }//create_vocab_list

    /**
     * Stores the bag-of-words count vector of training document `idx`
     * (1-based) in return_vec. An out-of-range idx yields an all-zero vector.
     */
    void beg_of_words_to_vec(int idx)
    {
        if (idx < 1 || static_cast<size_t>(idx) > list_of_docs.size())
        {
            return_vec.assign(my_vocab_list.size() + 1, 0);
            return;
        }
        return_vec = count_words(list_of_docs[idx - 1]);
    }//beg_of_words_to_vec

    /** Rebuilds train_mat: one word-count row per training document. */
    void get_train_matrix()
    {
        cout << "get train matrix begin : " << endl;
        train_mat.clear();
        train_mat.reserve(list_of_docs.size());
        for (size_t d = 0; d < list_of_docs.size(); d++)
        {
            train_mat.push_back(count_words(list_of_docs[d]));
        }
    }//get train matrix

    /** Prints the training matrix, one document per line. */
    void print()
    {
        cout << "print the train matrix begin : " << endl;
        for (size_t r = 0; r < train_mat.size(); r++)
        {
            const vector<int> &row = train_mat[r];
            for (size_t c = 0; c < row.size(); c++)
            {
                cout << row[c] << " ";
            }
            cout << endl;
        }
    }//print()

    /**
     * Estimates p_abusive and the per-word log-likelihood vectors p0vect /
     * p1vect from train_mat and list_classes. Does nothing (besides printing
     * a diagnostic) if get_train_matrix() has not produced any rows.
     */
    void train_NB0()
    {
        const size_t num_train_docs = train_mat.size();
        cout << "num_train_docs = " << num_train_docs << endl;
        if (num_train_docs == 0 || list_classes.size() != num_train_docs)
        {
            cerr << "train_NB0: empty or inconsistent training matrix; "
                 << "call get_train_matrix() first" << endl;
            return;
        }

        /* number of training documents labelled 1 */
        const int sum = accumulate(list_classes.begin(), list_classes.end(), 0);
        cout << "sum = " << sum << endl;
        p_abusive = static_cast<float>(sum) / static_cast<float>(num_train_docs);
        cout << "p_abusive = " << p_abusive << endl;

        // Every word count starts at 1 (add-one smoothing) so that no
        // likelihood is zero. assign() also resets values from a previous run.
        const size_t vec_len = train_mat[0].size();
        p0vect.assign(vec_len, 1.0f);
        p1vect.assign(vec_len, 1.0f);
        cout << "p0num.size() = " << p0vect.size()
             << " , p1num.size() = " << p1vect.size() << "\n";

        // Per-class total word counts, started at 2 to match the smoothing above.
        float p0Denom = 2.0f;
        float p1Denom = 2.0f;

        for (size_t i = 0; i < num_train_docs; i++)
        {
            const vector<int> &row = train_mat[i];
            const bool abusive = (list_classes[i] == 1);
            vector<float> &numerators = abusive ? p1vect : p0vect;
            float &denominator = abusive ? p1Denom : p0Denom;
            for (size_t j = 0; j < vec_len && j < row.size(); j++)
            {
                // Bag-of-words: a word seen k times contributes k to both the
                // word's numerator and the class's total word count.
                numerators[j] += row[j];
                denominator += row[j];
            }
        }

        // Store log-likelihoods so classification can add instead of multiply.
        for (size_t j = 0; j < vec_len; j++)
        {
            p0vect[j] = log(p0vect[j] / p0Denom);
            p1vect[j] = log(p1vect[j] / p1Denom);
        }
        cout << endl;
    }

    /**
     * Classifies the document stored in `filename`.
     * @return 1 if the class-1 log-score is strictly greater, otherwise 0.
     * Exits with status 1 if the file cannot be opened or the model is untrained.
     */
    int classify_NB(const char *filename)
    {
        vector<string> words;
        if (!read_words(filename, words))
        {
            cerr << "fail to open the file " << filename << endl;
            exit(1);
        }
        const vector<int> counts = count_words(words);
        if (p0vect.size() != counts.size() || p1vect.size() != counts.size())
        {
            cerr << "classify_NB: classifier is not trained for the current vocabulary" << endl;
            exit(1);
        }
        cout << endl;

        // The 0.0f initial value makes inner_product accumulate in float; an
        // int initial value would truncate every partial sum.
        const float p1 = inner_product(p1vect.begin() + 1, p1vect.end(),
                                       counts.begin() + 1, 0.0f) + log(p_abusive);
        const float p0 = inner_product(p0vect.begin() + 1, p0vect.end(),
                                       counts.begin() + 1, 0.0f) + log(1 - p_abusive);
        cout << "p1 = " << p1 << "  " << "p0 = " << p0 << endl;
        return (p1 > p0) ? 1 : 0;
    }

    /**
     * Classifies the held-out documents 1..test_data_num of both classes and
     * prints the fraction that were misclassified.
     */
    void get_error_rate()
    {
        if (test_data_num <= 0)
        {
            cerr << "get_error_rate: no test documents were held out" << endl;
            return;
        }
        int error_count = 0;
        for (int i = 1; i <= test_data_num; i++)
        {
            if (classify_NB(doc_path("hamParse", i).c_str()) != 0)
            {
                error_count++;
            }
            if (classify_NB(doc_path("spamParse", i).c_str()) != 1)
            {
                error_count++;
            }
        }
        cout << "the error rate is : "
             << static_cast<float>(error_count) / static_cast<float>(2 * test_data_num) << endl;
    }
};

int main()
{
    // Load the tokenized training documents; each entry of list_of_docs is
    // the word list of one file.
    NaiveBayes nb;

    // Build the vocabulary (std::map): key = word, value = word index.
    nb.create_vocab_list();

    // Build the training matrix: one row per document, one column per
    // distinct word, each cell = occurrences of that word in that document.
    nb.get_train_matrix();
    //nb.print();
    nb.train_NB0();

    const char *doc1_to_classify = "./email/hamParse/1.dat";
    const char *doc2_to_classify = "./email/spamParse/1.dat";
    cout << "doc1 classified as : " << nb.classify_NB(doc1_to_classify) << endl;
    cout << "doc2 classified as : " << nb.classify_NB(doc2_to_classify) << endl;

    nb.get_error_rate();
    return 0;
}

