Naive Bayes Spam Filtering: A Detailed Code Example


1. Problem Description

Filter spam email.

2. Approach

(1) Collect data: text files are provided.

(2) Prepare data: parse the text files into token vectors.

Here we need to build our own word list from the given text documents, splitting the text into words and filtering out what we do not need; in other words, we need text-parsing rules and a filter that fit the actual data. (Python's support for this is particularly convenient, though other languages such as C++ work as well.) In the implementation this is best written as a separate function, as sketched below.
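
In the original pipeline this parsing is done by a Python script (textParse.py), whose output is the .dat files that the classifier below reads. Here is a minimal C++ sketch of the same idea, assuming the usual rules of splitting on non-alphanumeric characters, lowercasing, and dropping tokens of two characters or fewer (the function name text_parse is an illustrative choice):

#include <cctype>
#include <string>
#include <vector>

// Split raw text into lowercase alphanumeric tokens, dropping very
// short ones such as "a" or "is".
std::vector<std::string> text_parse(const std::string &raw)
{
    std::vector<std::string> tokens;
    std::string cur;
    for (char ch : raw)
    {
        unsigned char c = static_cast<unsigned char>(ch);
        if (std::isalnum(c))
        {
            cur += static_cast<char>(std::tolower(c));
        }
        else
        {
            if (cur.size() > 2)     // filter tokens shorter than 3 characters
                tokens.push_back(cur);
            cur.clear();
        }
    }
    if (cur.size() > 2)
        tokens.push_back(cur);      // flush the final token
    return tokens;
}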

(3) Analyze data: inspect the tokens to make sure the parsing is correct.

(4) Train the algorithm: train with a training function. In essence this just counts how often each token occurs, then turns those observed frequencies into probability estimates via maximum-likelihood or Bayesian estimation; in short, it computes probabilities from the word vectors.

Apply Bayes' theorem: for a word vector $w$ and class $c$ (0 = ham, 1 = spam),

$$P(c \mid w) = \frac{P(w \mid c)\,P(c)}{P(w)}$$

where the naive independence assumption factorizes the likelihood as $P(w \mid c) = \prod_i P(w_i \mid c)$.
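
The conditional probabilities are estimated from the training counts with Laplace (add-one) smoothing so that no estimate is ever zero. In the standard form,

$$\hat{P}(w_i \mid c) = \frac{N_{ic} + 1}{N_c + 2}$$

where $N_{ic}$ is the count of word $w_i$ in class-$c$ documents and $N_c$ is a per-class normalizing count; the prior is estimated as the fraction of spam documents in the training set. The code below follows this initialization (word counts start at 1, denominators at 2), though, as its inline comment concedes, the way it accumulates the denominators is a debatable variant of the standard one.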
(5) Test the algorithm: use the trained classifier to classify documents, and build a test function that computes the error rate over the document set.
(6) Use the algorithm: build a complete program that classifies input documents.
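
Since multiplying many small probabilities underflows floating-point arithmetic, the classifier works with log-probabilities. For a document with word counts $x_i$, the decision rule implemented in classify_NB below is

$$\log \hat{P}(c{=}1) + \sum_i x_i \log \hat{P}(w_i \mid 1) \;\gtrless\; \log \hat{P}(c{=}0) + \sum_i x_i \log \hat{P}(w_i \mid 0)$$

and the shared term $\log P(w)$ cancels from both sides, so it never needs to be computed.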





3. Code Example and Walkthrough

/*
 * code list 4-1 : transfer function from docs list to vocabulary list
 * code list 4-2 : training function of the Naive Bayes classifier
 * code list 4-3 : naive Bayes classify function
 * code list 4-4 : naive Bayes bag-of-words model
 * code list 4-5 : text parse (textParse.py) and spam email test function get_error_rate()
 */
#include <iostream>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <cstdlib>

using namespace std;

class NaiveBayes                                // naive Bayes classifier
{
private:
    vector< vector<string> > list_of_docs;      // one token vector per document
    vector<int> list_classes;                   // class label per document: 0 = ham, 1 = spam
    map<string, int> my_vocab_list;             // word -> position in the vocabulary (positions start at 1)
    int *return_vec;                            // word-count vector of the document being processed
    vector< vector<int> > train_mat;            // training matrix
    vector<float> p0vect;                       // log P(word | ham)
    vector<float> p1vect;                       // log P(word | spam)
    float p_abusive;                            // prior probability P(spam)
    ifstream fin;
    ofstream fout;
    int test_data_num;

public:
    NaiveBayes()
    {
        cout << "please input the num of test data which should be less than 24 : " << endl;
        cin >> test_data_num;

        // Load the pre-parsed word vectors into list_of_docs; each element
        // corresponds to the contents of one file. Files 1..test_data_num of
        // each class are held out for testing, so training uses the rest.
        vector<string> vec;
        string word;
        string filename;
        for (int i = test_data_num + 1; i <= 25; i++)
        {
            vec.clear();
            filename = "./email/hamParse/" + to_string(i) + ".dat";
            fin.open(filename.c_str());
            if (!fin)
            {
                cerr << "open the file " << filename << " error" << endl;
                exit(1);
            }
            while (fin >> word)
                vec.push_back(word);
            list_of_docs.push_back(vec);
            list_classes.push_back(0);          // ham
            fin.close();
        }
        for (int i = test_data_num + 1; i <= 25; i++)
        {
            vec.clear();
            filename = "./email/spamParse/" + to_string(i) + ".dat";
            fin.open(filename.c_str());
            if (!fin)
            {
                cerr << "open the file " << filename << " error" << endl;
                exit(1);
            }
            while (fin >> word)
                vec.push_back(word);
            list_of_docs.push_back(vec);
            list_classes.push_back(1);          // spam
            fin.close();
        }
    }

    ~NaiveBayes()
    {
        fin.close();
        fout.close();
        list_of_docs.clear();
        list_classes.clear();
        my_vocab_list.clear();
        train_mat.clear();
        p0vect.clear();
        p1vect.clear();
    }

    // Build the vocabulary over all documents.
    void create_vocab_list()
    {
        int index = 1;
        for (vector< vector<string> >::iterator it = list_of_docs.begin();
             it != list_of_docs.end(); ++it)
        {
            for (vector<string>::iterator tmp_it = it->begin(); tmp_it != it->end(); ++tmp_it)
            {
                if (my_vocab_list[*tmp_it] == 0)
                    my_vocab_list[*tmp_it] = index++;   // index is the position of the word in the vocabulary
            }
        }
    } // create_vocab_list

    // Convert document idx (1-based) into a word-count vector. Despite the
    // set-of-words naming, this is the bag-of-words model: each entry counts
    // how often the word occurs, not merely whether it occurs.
    void beg_of_words_to_vec(int idx)
    {
        int len = my_vocab_list.size() + 1;
        return_vec = new int[len]();    // note: "new int[len]()" zero-initializes, unlike "new int[len]"
        vector< vector<string> >::iterator it = list_of_docs.begin() + idx - 1;
        for (vector<string>::iterator itt = it->begin(); itt != it->end(); ++itt)
        {
            int pos = my_vocab_list[*itt];
            if (pos != 0)
                return_vec[pos] += 1;
        }
    } // beg_of_words_to_vec

    void get_train_matrix()
    {
        cout << "get train matrix begin : " << endl;
        train_mat.clear();
        for (int i = 1; i <= (int)list_of_docs.size(); i++)
        {
            beg_of_words_to_vec(i);
            vector<int> vec(return_vec, return_vec + my_vocab_list.size() + 1);
            train_mat.push_back(vec);
            delete [] return_vec;
        }
    } // get_train_matrix

    void print()
    {
        cout << "print the train matrix begin : " << endl;
        for (vector< vector<int> >::iterator it = train_mat.begin(); it != train_mat.end(); ++it)
        {
            for (vector<int>::iterator itt = it->begin(); itt != it->end(); ++itt)
                cout << *itt << " ";
            cout << endl;
        }
    } // print

    void train_NB0()
    {
        int num_train_docs = train_mat.size();
        cout << "num_train_docs = " << num_train_docs << endl;

        // the number of spam documents
        int sum = accumulate(list_classes.begin(), list_classes.end(), 0);
        cout << "sum = " << sum << endl;
        p_abusive = (float)sum / (float)num_train_docs;     // prior probability P(spam)
        cout << "p_abusive = " << p_abusive << endl;

        // Initialize every word count to lambda = 1 (Laplace smoothing) so
        // that no conditional probability estimate is zero.
        p0vect.resize(train_mat[0].size(), 1);  // word frequencies in ham docs
        p1vect.resize(train_mat[0].size(), 1);  // word frequencies in spam docs
        cout << "p0vect.size() = " << p0vect.size()
             << " , p1vect.size() = " << p1vect.size() << endl;
        float p0Denom = 2.0;    // total word count in ham docs, initialized to 2 (the number of classes)
        float p1Denom = 2.0;    // total word count in spam docs

        // accumulate per-class word counts and denominators
        for (int i = 0; i < (int)list_classes.size(); i++)
        {
            if (list_classes[i] == 1)           // spam doc
            {
                for (int j = 0; j < (int)p1vect.size(); j++)
                {
                    // a debatable point: it is not entirely clear how the
                    // denominators p1Denom and p0Denom should be accumulated here
                    p1vect[j] += train_mat[i][j];
                    if (train_mat[i][j] == 1)
                        p1Denom += 1;
                }
            }
            else                                // ham doc
            {
                for (int j = 0; j < (int)p0vect.size(); j++)
                {
                    p0vect[j] += train_mat[i][j];
                    if (train_mat[i][j] == 1)
                        p0Denom += 1;
                }
            }
        }
        // take logs of the conditional probabilities to avoid floating-point underflow
        for (int i = 0; i < (int)p1vect.size(); i++)
        {
            p0vect[i] = log(p0vect[i] / p0Denom);
            p1vect[i] = log(p1vect[i] / p1Denom);
        }
        cout << endl;
    } // train_NB0

    int classify_NB(const char *filename)
    {
        return_vec = new int[my_vocab_list.size() + 1]();
        fin.open(filename);
        if (!fin)
        {
            cerr << "fail to open the file " << filename << endl;
            exit(1);
        }
        string word;
        while (fin >> word)
        {
            // use find() so that unseen words are skipped rather than
            // inserted into the vocabulary
            map<string, int>::iterator pos = my_vocab_list.find(word);
            if (pos != my_vocab_list.end())
                return_vec[pos->second] += 1;
        }
        fin.close();
        cout << endl;
        // log-space scores; the initial value must be a float (0.0f), otherwise
        // inner_product accumulates in int and truncates every product
        float p1 = inner_product(p1vect.begin() + 1, p1vect.end(), return_vec + 1, 0.0f)
                   + log(p_abusive);
        float p0 = inner_product(p0vect.begin() + 1, p0vect.end(), return_vec + 1, 0.0f)
                   + log(1 - p_abusive);
        delete [] return_vec;
        cout << "p1 = " << p1 << "  " << "p0 = " << p0 << endl;
        return p1 > p0 ? 1 : 0;
    } // classify_NB

    void get_error_rate()
    {
        string filename;
        int error_count = 0;
        for (int i = 1; i <= test_data_num; i++)
        {
            filename = "./email/hamParse/" + to_string(i) + ".dat";
            if (classify_NB(filename.c_str()) != 0)
                error_count++;
            filename = "./email/spamParse/" + to_string(i) + ".dat";
            if (classify_NB(filename.c_str()) != 1)
                error_count++;
        }
        cout << "the error rate is : "
             << (float)error_count / (float)(2 * test_data_num) << endl;
    } // get_error_rate
};

int main()
{
    // The constructor loads the pre-parsed documents: each element of
    // list_of_docs holds the tokens of one file.
    NaiveBayes nb;
    // Build the word list, stored in a red-black tree (std::map):
    // key = word, value = word index over the whole corpus.
    nb.create_vocab_list();
    /* Build the training matrix:
     *   rows         = number of documents
     *   columns      = vocabulary size (distinct words over all documents)
     *   entry (i, j) = how many times word j occurs in document i
     */
    nb.get_train_matrix();
    // nb.print();
    nb.train_NB0();
    char doc1_to_classify[] = "./email/hamParse/1.dat";
    char doc2_to_classify[] = "./email/spamParse/1.dat";
    cout << "doc1 classified as : " << nb.classify_NB(doc1_to_classify) << endl;
    cout << "doc2 classified as : " << nb.classify_NB(doc2_to_classify) << endl;
    nb.get_error_rate();
    return 0;
}
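
Assuming the parsed .dat files are laid out as ./email/hamParse/1.dat through 25.dat and ./email/spamParse/1.dat through 25.dat, the program can be built and run along these lines (the source file name naive_bayes.cpp is an illustrative choice; -std=c++11 is needed for to_string):

g++ -std=c++11 naive_bayes.cpp -o naive_bayes
./naive_bayes

If, for example, 5 is entered as the number of test documents, files 6 through 25 of each class are used for training, and the reported error rate is computed over the 10 held-out files.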

