朴素贝叶斯文本分类的 C++ 实现

来源:互联网 发布:淘宝店铺怎么发布微淘 编辑:程序博客网 时间:2024/06/11 09:03
//NaiveBayes.h
#ifndef NAIVEBAYES_H_
#define NAIVEBAYES_H_

#include<iostream>
#include<map>
#include<set>
#include<cmath>
#include<vector>
#include<string>   // std::string is used below; do not rely on <iostream> pulling it in
#include<algorithm>
#include<numeric>
#include<cstring>
#include<stdio.h>
#include<cstdlib>

// NOTE: kept for backward compatibility -- NaiveBayes.cpp and main.cpp use
// unqualified standard-library names and rely on this directive.
using namespace std;

/**
 * Two-class naive Bayes text classifier (class 1 = abusive, class 0 = not)
 * trained on a small built-in corpus of tokenized posts.
 *
 * Call order used by main.cpp:
 *   create_vocab_list() -> get_train_matrix() -> train_NB0() -> classify_NB()
 */
class NaiveBayes
{
private:
    /// Tokenized training posts, one inner vector per post.
    vector< vector<string> > list_of_posts;
    /// Class label of each training post (1 = abusive, 0 = not).
    vector<int> list_classes;
    /// Maps each vocabulary word to its 1-based slot; 0 means "not in vocabulary".
    map<string, int>  my_vocab_list;
    /// Scratch word-presence buffer of (vocabulary size + 1) ints, heap-allocated
    /// by the implementation; slot 0 is unused.
    int *return_vec;
    /// One word-presence row per training post.
    vector< vector<int> > train_mat;
    /// Per-slot log-probabilities for class 0 (non-abusive).
    vector<float> p0vect;
    /// Per-slot log-probabilities for class 1 (abusive).
    vector<float> p1vect;
    /// Fraction of training posts labeled abusive.
    float p_abusive;

public:
    /// Loads the built-in training posts and their labels.
    NaiveBayes();

    /// Assigns a 1-based index to every distinct word of the training posts.
    void create_vocab_list();

    /// Builds the word-presence vector of training post `idx` into return_vec.
    void set_of_words_to_vec(int idx);

    /// Builds train_mat: one word-presence row per training post.
    void get_train_matrix();

    /// Prints train_mat to stdout, one row per line.
    void print();

    /// Estimates p_abusive, p0vect and p1vect from train_mat and list_classes.
    void train_NB0();

    /// Classifies a document given as an array of words terminated by the
    /// sentinel string "null". Returns 1 for abusive, 0 otherwise.
    int classify_NB(string *doc_to_classify);
};

#endif // !NAIVEBAYES_H_

//NaiveBayes.cpp#include"stdafx.h"#include"NaiveBayes.h"string posting_list[6][10] = {{ "my", "dog", "has", "flea", "problems", "help", "please", "null" },{ "maybe", "not", "take", "him", "to", "dog", "park", "stupid", "null" },{ "my", "dalmation", "is", "so", "cute", "I", "love", "him", "null" },{ "stop", "posting", "stupid", "worthless", "garbage", "null" },{ "mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him", "null" },{ "quit", "buying", "worthless", "dog", "food", "stupid", "null" }};int class_vec[6] = { 0, 1, 0, 1, 0, 1 };//1 is abusive ,0 notNaiveBayes::NaiveBayes(){vector<string> vec;for (int i = 0; i<6; i++){vec.clear();for (int j = 0; posting_list[i][j] != "null"; j++){vec.push_back(posting_list[i][j]);}list_of_posts.push_back(vec);}for (int i = 0; i<sizeof(class_vec) / sizeof(class_vec[0]); i++){list_classes.push_back(class_vec[i]);}}void NaiveBayes::create_vocab_list(){vector< vector<string> > ::iterator it = list_of_posts.begin();int index = 1;while (it != list_of_posts.end()){//vector<string> vec( *it.begin(),*it.end() );vector<string> vec = *it;vector<string> ::iterator tmp_it = vec.begin();while (tmp_it != vec.end()){//cout<<*tmp_it<<" ";if (my_vocab_list[*tmp_it] == 0){my_vocab_list[*tmp_it] = index++; //index is the location of the vovabulary}tmp_it++;}it++;}}//create_vocab_list//set some one word to vec with 0 and 1.void NaiveBayes::set_of_words_to_vec(int idx){cout << "set of words to vec begin the document id is : " << idx << endl;int len = my_vocab_list.size() + 1;return_vec = new int[len](); //pay attention to the difference between "new int[len]". 
initalize all the element to zero.fill(return_vec, return_vec + len, 0);for (int i = 0; i<len; i++)cout << return_vec[i] << " ";for (int i = 0; posting_list[idx][i] != "null"; i++){//cout<<posting_list[idx][i]<<" ";int pos = my_vocab_list[posting_list[idx][i]];if (pos != 0){return_vec[pos] = 1;}}cout << endl;}//set_of_words_to_vecvoid NaiveBayes::get_train_matrix(){cout << "get train matrix begin : " << endl;train_mat.clear();for (int i = 0; i<6; i++){set_of_words_to_vec(i);vector<int> vec(return_vec, return_vec + my_vocab_list.size() + 1);train_mat.push_back(vec);delete[]return_vec;}}//get train matrixvoid NaiveBayes::print(){cout << "print the train matrix begin : " << endl;vector< vector<int> > ::iterator it = train_mat.begin();while (it != train_mat.end()){vector<int> vec = *it;vector<int> ::iterator itt = vec.begin();while (itt != vec.end()){cout << *itt << " ";itt++;}cout << endl;it++;}}//print()void NaiveBayes::train_NB0(){int num_train_docs = train_mat.size();//sizeof(posting_lists)/sizeof(posting_lists[0]);cout << "num_train_docs = " << num_train_docs << endl;int num_words = train_mat[0].size() - 1;/* calculatr the sum of the abusive classes */int sum = accumulate(list_classes.begin(), list_classes.end(), 0);cout << "sum = " << sum << endl;//float p_abusive = (float)sum/(float)num_train_docs;p_abusive = (float)sum / (float)num_train_docs;cout << "p_abusive = " << p_abusive << endl;//vector<float> p0vect(train_mat[0].size(),1); //the frequency of each word in non-absusive docsp0vect.resize(train_mat[0].size(), 1);//vector<float> p1vect(train_mat[0].size(),1); //the frequency of each word in abusive docsp1vect.resize(train_mat[0].size(), 1);printf("p0num.size() = %d , p1num.size() = %d\n", p0vect.size(), p1vect.size());float p0Denom = 2.0; //the total number of words in non-abusive docsfloat p1Denom = 2.0; //the total number of words in abusive docs/* calculate the p0num,p1num,p0Denom,p1Denom */for (int i = 0; i<list_classes.size(); i++){if (list_classes[i] 
== 1)  //abusive doc{for (int j = 0; j<p1vect.size(); j++){p1vect[j] += train_mat[i][j];if (train_mat[i][j] == 1)p1Denom++;}}else   //non-abusive doc{for (int j = 0; j<p0vect.size(); j++){p0vect[j] += train_mat[i][j];if (train_mat[i][j] == 1)p0Denom++;}}}for (int i = 0; i<p1vect.size(); i++){p0vect[i] = log(p0vect[i] / p0Denom);p1vect[i] = log(p1vect[i] / p1Denom);}cout << "print the p0vect values : " << endl;for (int i = 0; i<p0vect.size(); i++)cout << p0vect[i] << " ";cout << "\nprint the p1vect values : " << endl;for (int i = 0; i<p1vect.size(); i++)cout << p1vect[i] << " ";cout << endl;}int NaiveBayes::classify_NB(string *doc_to_classify){return_vec = new int[my_vocab_list.size() + 1]();for (int i = 0; doc_to_classify[i] != "null"; i++){int pos = my_vocab_list[doc_to_classify[i]];if (pos != 0){return_vec[pos] = 1;}}//forfor (int i = 0; i<my_vocab_list.size() + 1; i++)cout << return_vec[i] << " ";cout << endl;float p1 = inner_product(p1vect.begin() + 1, p1vect.end(), return_vec + 1, 0) + log(p_abusive);float p0 = inner_product(p0vect.begin() + 1, p0vect.end(), return_vec + 1, 0) + log(1 - p_abusive);cout << "p1 = " << p1 << endl;cout << "p0 = " << p0 << endl;if (p1>p0){return 1;}else{return 0;}}

//main.cpp#include"stdafx.h"#include"stdlib.h "#include<iostream>#include"NaiveBayes.h"using namespace std;int main(){NaiveBayes nb;nb.create_vocab_list();nb.get_train_matrix();nb.print();nb.train_NB0();string doc1_to_classify[] = { "love", "my", "dalmation", "null" };string doc2_to_classify[] = { "stupid", "garbage", "null" };cout << "doc1 classified as : " << nb.classify_NB(doc1_to_classify) << endl;cout << "doc2 classified as : " << nb.classify_NB(doc2_to_classify) << endl;system("pause");return 0;}