openCv学习笔记（十三）—贝叶斯分类器的实现

来源：互联网发布：网易邮箱数据库下载编辑：程序博客网时间：2024/05/22 22:09

第一个是用 C 语言实现的文本分类程序，主要是计算待分类文本中所有单词在各类别训练样本（模板）中的后验概率。算法比较简单，代码是从网上下载的（没记下地址，若作者不愿意公开，请留言，自当处理），我稍作了一点修改。等有时间可以利用斯坦福机器学习公开课中的方法，统计高频词，用朴素贝叶斯实现垃圾邮件的分类，到时再和大家分享。

[cpp] view plaincopyprint?

#include <stdio.h>
#include <string.h>
#include <direct.h> //_getcwd(), _chdir()
#include <stdlib.h> //_MAX_PATH, system()
#include <io.h> //_finddata_t, _findfirst(), _findnext(), _findclose()
#include<iostream>
using namespace std;
//#include<fstream>
char vocabulary[1000][20]; /* Global word table: up to 1000 words of the text being classified, each at most 19 characters plus the terminating NUL. */
/*================ Split the text to classify into words stored in vocabulary ================*/
// @param text  NUL-terminated text to classify. It is modified in place, because
//              strtok() overwrites every separator it finds with '\0'.
// @return      Number of words stored in vocabulary (never more than its 1000 rows).
//
// Words are separated by ',', ' ', '.' and '\n'. A word that does not fit in one
// row is truncated to 19 characters instead of overflowing into the next row, and
// words beyond the table capacity are ignored.
int SplitToWord(char text[])
{
    const int maxWords = (int)(sizeof(vocabulary) / sizeof(vocabulary[0]));
    const size_t maxWordLen = sizeof(vocabulary[0]) - 1; /* leave room for the NUL */
    const char seps[] = ", .\n";                         /* word separators */
    int count = 0;

    if (text == NULL)
        return 0;

    for (char *word = strtok(text, seps);
         word != NULL && count < maxWords;
         word = strtok(NULL, seps))
    {
        /* strncpy does not terminate the destination when the source is too long,
           so the terminator is written explicitly. */
        strncpy(vocabulary[count], word, maxWordLen);
        vocabulary[count][maxWordLen] = '\0';
        count++;
    }
    return count;
}
/*================ Count the .txt files in the current working directory ================*/
// @return  Number of directory entries matching "*.txt"; 0 when there is none.
int CountDirectory()
{
    int count = 0;
    _finddata_t fileinfo;

    /* _findfirst returns an intptr_t search handle; keeping it in a long would
       truncate it on 64-bit Windows. */
    intptr_t hFile = _findfirst("*.txt", &fileinfo);
    if (hFile == -1)
        return 0;

    do
    {
        count++;
    } while (_findnext(hFile, &fileinfo) == 0);

    _findclose(hFile); /* release the search handle */
    return count;
}
/*================ Compute the product of P(ai|vj) for one category ================*/
// Reads the next word from fp into buf. Words are delimited by ' ', '\t' and '\n';
// any run of delimiters before the word is skipped, so blank lines and repeated
// spaces do not produce tokens. Characters that do not fit in buf are dropped.
// @return 1 when a word was read, 0 at end of file.
static int ReadNextWord(FILE *fp, char *buf, size_t bufSize)
{
    int ch; /* int, not char: fgetc() returns EOF as an out-of-band int value */

    do
    {
        ch = fgetc(fp);
    } while (ch == ' ' || ch == '\t' || ch == '\n');
    if (ch == EOF)
        return 0;

    size_t len = 0;
    do
    {
        if (len + 1 < bufSize)
            buf[len++] = static_cast<char>(ch);
        ch = fgetc(fp);
    } while (ch != EOF && ch != ' ' && ch != '\t' && ch != '\n');
    buf[len] = '\0';
    return 1;
}

// @param wordCount  number of words of the text being classified, i.e. how many
//                   rows of the global vocabulary table are in use
// @return           the product, over those words, of
//                   (countSame + 1) / (wordCount + countAll) * 300
//
// For every word, all "*.txt" files of the current working directory (one
// category's training set) are scanned: countAll is the number of words they
// contain and countSame how many of them equal the word. The "+ 1" keeps a word
// that never occurs from zeroing the whole product. The constant 300 only scales
// each factor (the original comment said 380, but the code has always used 300).
// The process exits with EXIT_FAILURE when a training file cannot be opened.
float CalculateWordProbability(int wordCount)
{
    float wordProbability = 1; /* neutral element of the running product */
    _finddata_t fileinfo;

    for (int j = 0; j < wordCount; j++)
    {
        int countSame = 0; /* occurrences of vocabulary[j] in the training files */
        int countAll = 0;  /* total number of words in the training files */

        intptr_t hFile = _findfirst("*.txt", &fileinfo);
        if (hFile != -1)
        {
            do
            {
                FILE *fp = fopen(fileinfo.name, "r");
                if (fp == NULL)
                {
                    printf("Sorry!Cannot open the file!\n");
                    _findclose(hFile);
                    exit(EXIT_FAILURE);
                }

                char keyword[1024];
                while (ReadNextWord(fp, keyword, sizeof(keyword)))
                {
                    countAll++;
                    if (strcmp(keyword, vocabulary[j]) == 0)
                        countSame++;
                }
                fclose(fp);
            } while (_findnext(hFile, &fileinfo) == 0);
            _findclose(hFile); /* one handle per word would otherwise leak */
        }

        wordProbability *= static_cast<float>(countSame + 1)
                         / static_cast<float>(wordCount + countAll) * 300;
    }
    return wordProbability;
}
/*================ Compute every category's final probability and print the result ================*/
// @param wordCount  number of words of the text being classified (rows of the
//                   global vocabulary table in use)
// @param num        number of categories to rank and print; values outside
//                   [0, 10] are clamped to that range
//
// Steps:
//   1. Read the category names, one per line, from ClassList.txt in the current
//      working directory.
//   2. For each of the ten training directories <root>\1 .. <root>\10, change into
//      it, count its .txt files and compute its word-probability product. A
//      directory that cannot be entered is reported and contributes a count and
//      a probability of 0.
//   3. For the first `num` categories print prior * word probability, where the
//      prior is that category's share of all training files found in step 2, and
//      finally print the name of the category with the largest value.
// The working directory is left at the last training directory that was entered.
void CalculateProbability(int wordCount, int num)
{
    enum { kMaxClasses = 10, kMaxClassNameLen = 20 };
    /* Root of the training set; the category directories are named 1..10. */
    static const char kTrainingRoot[] =
        "D:\\openCV\\openCVProject\\openCVtext\\贝叶斯（文本分类）—c语言\\example\\";

    /********* Load the category names *********/
    /* Zero-initialised, so every row is a valid (possibly empty) string even when
       the file has fewer lines than categories or no trailing newline. */
    char classList[kMaxClasses][kMaxClassNameLen] = {{0}};
    FILE *fp = fopen("ClassList.txt", "r");
    if (fp == NULL)
    {
        printf("Failed to open the file: ClassList.txt.\n");
        return; /* nothing can be labelled without the list */
    }
    int index = 0;       /* row of classList being filled */
    int className_c = 0; /* next column in that row */
    int ch;              /* int so that EOF stays distinguishable from data bytes */
    while (index < kMaxClasses && (ch = fgetc(fp)) != EOF)
    {
        if (ch == '\n')
        {
            index++;
            className_c = 0;
        }
        else if (className_c < kMaxClassNameLen - 1) /* over-long names are truncated */
        {
            classList[index][className_c++] = static_cast<char>(ch);
        }
    }
    fclose(fp);

    /********* Per-category file counts and word-probability products *********/
    int txtCount[kMaxClasses] = {0};          /* training files per category */
    float wordProbability[kMaxClasses] = {0}; /* product of P(ai|vj) per category */
    int countAll = 0;                         /* training files over all categories */
    for (int k = 0; k < kMaxClasses; k++)
    {
        char dirPath[512]; /* comfortably larger than the root plus two digits */
        sprintf(dirPath, "%s%d", kTrainingRoot, k + 1);
        if (_chdir(dirPath))
        {
            printf("系统找不到指定路径!\n");
        }
        else
        {
            txtCount[k] = CountDirectory();
            countAll += txtCount[k];
            wordProbability[k] = CalculateWordProbability(wordCount);
        }
    }

    /********* Priors, final probabilities and the winning category *********/
    if (num < 0)
        num = 0;
    if (num > kMaxClasses)
        num = kMaxClasses;

    float best = 0;
    int classNo = 0;
    for (int i = 0; i < num; i++)
    {
        /* Without any training file there is no evidence: use 0 rather than 0/0. */
        const float prior = countAll > 0
            ? static_cast<float>(txtCount[i]) / static_cast<float>(countAll)
            : 0.0f;
        const float finalProbability = prior * wordProbability[i];
        if (finalProbability > best)
        {
            best = finalProbability;
            classNo = i;
        }
        printf("该文本为类别%s的概率为:%.5e\n", classList[i], finalProbability);
    }
    printf("\n经分析，该文本最有可能为%s类文本!\n", classList[classNo]);
}
/*================ Classifier entry point: split the text, then score the categories ================*/
// @param text  text to classify; handed to SplitToWord, which splits it in place
// @param num   number of categories, forwarded unchanged to CalculateProbability
void NaiveBayesClassifier(char text[], int num)
{
    // SplitToWord fills the global vocabulary table and returns how many words it
    // stored; that count is the only thing CalculateProbability needs from it.
    CalculateProbability(SplitToWord(text), num);
}
/*================ Program entry point ================*/
// Loads text.txt from the current working directory into memory, echoes it to
// standard output and classifies it against two categories.
// @return 0 on success, 1 when text.txt cannot be opened.
int main()
{
    FILE *fp = fopen("text.txt", "r");
    if (fp == NULL)
    {
        printf("Failed to open the file: text.txt.\n");
        return 1;
    }

    /* First pass: count the characters so the buffer can be sized exactly. */
    size_t length = 0;
    while (fgetc(fp) != EOF)
        length++;

    /* new char[n] allocates an array; new char(n) would allocate a single char. */
    char *text = new char[length + 1];

    /* Second pass: copy (and echo) every character, starting with the first one. */
    fseek(fp, 0, SEEK_SET);
    size_t stored = 0;
    int ch; /* int so that EOF is not confused with a data byte */
    while (stored < length && (ch = fgetc(fp)) != EOF)
    {
        std::cout << static_cast<char>(ch);
        text[stored++] = static_cast<char>(ch);
    }
    text[stored] = '\0'; /* strtok() in SplitToWord needs a terminated string */
    fclose(fp);

    int num = 2; /* number of categories to rank */
    NaiveBayesClassifier(text, num); /* prints the per-category scores and the verdict */

    delete[] text;
    return 0;
}

#include <stdio.h>#include <string.h>#include <direct.h> //_getcwd(), _chdir()#include <stdlib.h> //_MAX_PATH, system()#include <io.h> //_finddata_t, _findfirst(), _findnext(), _findclose()#include<iostream>using namespace std;//#include<fstream>char vocabulary[1000][20];/*声明公有二维数组，用来存储分割好的单词*//*=================将要分类的文本分割成单词存储在二维数组vocabulary中================*///@输入参数：要分类的文本//@输出参数：该文本中总单词数int SplitToWord(char text[]){int i=0;char seps[]=", .\n"; /*定义单词的分隔符*/ char *substring; /******利用分隔符将文本内容分割成单词并存储******/substring=strtok(text,seps); while(substring!=NULL) {      strcpy(vocabulary[i],substring);//将单词存储到vocabulary数组中    substring=strtok(NULL,seps);    i++;}return i; //返回一共多少个单词}/*===============================计算该目录下的文件数================================*///@输入参数：无//@输出参数：该目录下.txt文件数int CountDirectory(){int count=0; //txt文件计数器long hFile;    _finddata_t fileinfo;/********查找.txt文件，记录文件数**********/    if ((hFile=_findfirst("*.txt",&fileinfo))!=-1L)    {        do        {                count++;        } while (_findnext(hFile,&fileinfo) == 0);}return count;}/*===================================计算某类别中∏P(ai|vj)===================================*///@输入参数：分类文本中单词数//@输出参数：该类别下∏P(ai|vj)float CalculateWordProbability(int wordCount){int countSame; //分类文本中的某单词在所有训练样本中出现次数int countAll=0; //训练样本中总单词数char token;FILE *fp;float wordProbability=1; //为后面联乘做准备int i,j;long hFile;    _finddata_t fileinfo;for(j=0;j<wordCount;j++) //对于分类样本中的每一个单词{   countSame=0;   countAll=0;   if((hFile=_findfirst("*.txt",&fileinfo))!=-1L) //对于该类别下每一个.txt文本   {    do{     if((fp=fopen(fileinfo.name,"r"))==NULL) //是否能打开该文本     {      printf("Sorry!Cannot open the file!\n");      exit(0);     }     /********存储此.txt文件中每个单词并与分类文本的单词作比较*******/     while((token = fgetc(fp)) != EOF)  {      char keyword[1024];       i = 0;            keyword[0] = token; // 将每个词第一个字符赋给数组第一个元素      while ((keyword[++i] = fgetc(fp)) != ' ' && keyword[i] != '\t' && keyword[i] != EOF && keyword[i] != '\n'); // 开始读字符，直到遇到空白符，说明找到一个词 
      keyword[i] = '\0';// 加结束符      countAll++;      if (strcmp(keyword,vocabulary[j]) == 0) //比较两个单词是否相同       countSame++;     }     fclose(fp);    }while (_findnext(hFile,&fileinfo) == 0);    }   wordProbability*=(float)(countSame+1)/(float)(wordCount+countAll)*300; //计算∏P(wj|vi)，为了扩大效果而*380}return wordProbability;}  /*============================计算每个类别的最终概率输出结果===============================*///@输入参数：分类文本中单词数  void CalculateProbability(int wordCount,int num){/*********将类别表存储在二维数组中*********/FILE *fp;char classList[10][20]; //类别列表    char ch;    //临时读取字符使用    int index=0; //classList的行标    int className_c=0; //classList的列标if((fp=fopen("ClassList.txt","r"))==NULL)    {        printf("Failed to open the file: ClassList.txt.\n");    }    ch = fgetc(fp);    while(ch!=EOF)    {        if(ch!='\n')        {            classList[index][className_c]=ch;            className_c++;        }        else        {            classList[index][className_c]='\0';            index++;            className_c=0;        }   ch = fgetc(fp);}/********计算总文本数和每个类别下的文本数、∏P(ai|vj)********/int txtCount[10]; //每个类别下的训练文本数int countAll=0; //训练集中总文本数float wordProbability[10]; //每个类别的单词概率，即∏P(ai|vj)if(_chdir("D:\\openCV\\openCVProject\\openCVtext\\贝叶斯（文本分类）—c语言\\example\\1")) //更改当前绝对路径     printf("系统找不到指定路径!\n");else{   txtCount[0]=CountDirectory(); //获取该类别下.txt文件数   countAll+=txtCount[0];   wordProbability[0]=CalculateWordProbability(wordCount); //获取该类别下∏P(wj|vi)}if(_chdir("D:\\openCV\\openCVProject\\openCVtext\\贝叶斯（文本分类）—c语言\\example\\2")) //更改当前绝对路径   printf("系统找不到指定路径!\n");else{   txtCount[1]=CountDirectory(); //获取该类别下.txt文件数   countAll+=txtCount[1];   wordProbability[1]=CalculateWordProbability(wordCount); //获取该类别下∏P(wj|vi)}if(_chdir("D:\\openCV\\openCVProject\\openCVtext\\贝叶斯（文本分类）—c语言\\example\\3")) //更改当前绝对路径     printf("系统找不到指定路径!\n");else{   txtCount[2]=CountDirectory(); //获取该类别下.txt文件数   countAll+=txtCount[2];   wordProbability[2]=CalculateWordProbability(wordCount); 
//获取该类别下∏P(wj|vi)}if(_chdir("D:\\openCV\\openCVProject\\openCVtext\\贝叶斯（文本分类）—c语言\\example\\4")) //更改当前绝对路径     printf("系统找不到指定路径!\n");else{   txtCount[3]=CountDirectory(); //获取该类别下.txt文件数   countAll+=txtCount[3];   wordProbability[3]=CalculateWordProbability(wordCount); //获取该类别下∏P(wj|vi)}if(_chdir("D:\\openCV\\openCVProject\\openCVtext\\贝叶斯（文本分类）—c语言\\example\\5")) //更改当前绝对路径     printf("系统找不到指定路径!\n");else{   txtCount[4]=CountDirectory(); //获取该类别下.txt文件数   countAll+=txtCount[4];   wordProbability[4]=CalculateWordProbability(wordCount); //获取该类别下∏P(wj|vi)}if(_chdir("D:\\openCV\\openCVProject\\openCVtext\\贝叶斯（文本分类）—c语言\\example\\6")) //更改当前绝对路径     printf("系统找不到指定路径!\n");else{   txtCount[5]=CountDirectory(); //获取该类别下.txt文件数   countAll+=txtCount[5];   wordProbability[5]=CalculateWordProbability(wordCount); //获取该类别下∏P(wj|vi)}if(_chdir("D:\\openCV\\openCVProject\\openCVtext\\贝叶斯（文本分类）—c语言\\example\\7")) //更改当前绝对路径     printf("系统找不到指定路径!\n");else{   txtCount[6]=CountDirectory(); //获取该类别下.txt文件数   countAll+=txtCount[6];   wordProbability[6]=CalculateWordProbability(wordCount); //获取该类别下∏P(wj|vi)}if(_chdir("D:\\openCV\\openCVProject\\openCVtext\\贝叶斯（文本分类）—c语言\\example\\8")) //更改当前绝对路径     printf("系统找不到指定路径!\n");else{   txtCount[7]=CountDirectory(); //获取该类别下.txt文件数   countAll+=txtCount[7];   wordProbability[7]=CalculateWordProbability(wordCount); //获取该类别下∏P(wj|vi)}if(_chdir("D:\\openCV\\openCVProject\\openCVtext\\贝叶斯（文本分类）—c语言\\example\\9")) //更改当前绝对路径     printf("系统找不到指定路径!\n");else{   txtCount[8]=CountDirectory(); //获取该类别下.txt文件数   countAll+=txtCount[8];   wordProbability[8]=CalculateWordProbability(wordCount); //获取该类别下∏P(wj|vi)}if(_chdir("D:\\openCV\\openCVProject\\openCVtext\\贝叶斯（文本分类）—c语言\\example\\10")) //更改当前绝对路径     printf("系统找不到指定路径!\n");else{   txtCount[9]=CountDirectory(); //获取该类别下.txt文件数   countAll+=txtCount[9];   wordProbability[9]=CalculateWordProbability(wordCount); //获取该类别下∏P(wj|vi)}/*******计算先验概率和最终概率并输出分类结果*******/float max=0;int classNo=0;float 
priorProbability[10];float finalProbability[10];for(int i=0;i<num;i++) {   priorProbability[i]=(float)txtCount[i]/(float)countAll; //先验概率   finalProbability[i]=priorProbability[i]*wordProbability[i]; //最终概率   if(finalProbability[i]>max) //找到最大概率并记录   {    max=finalProbability[i];    classNo=i;   }   printf("该文本为类别%s的概率为:%.5e\n",classList[i],finalProbability[i]); //输出每个类别的最终概率}printf("\n经分析，该文本最有可能为%s类文本!\n",classList[classNo]); //输出最后分类结果}/*===================调用文本分割函数和计算最终概率函数======================*///@输入参数：分类文本void NaiveBayesClassifier(char text[],int num){int vocabularyCount;//分类样本中单词数vocabularyCount=SplitToWord(text); //对要分类的文本进行单词分割，结果存储在vocabulary数组中，返回分类样本中单词数CalculateProbability(vocabularyCount,num); //计算最终概率}/*===================程序入口====================*/int main(){   FILE *fp;   if((fp=fopen("text.txt","r"))==NULL)   {        printf("Failed to open the file: ClassList.txt.\n");   }   char ch = fgetc(fp);   int i=0;   while(ch!=EOF)   {       ch = fgetc(fp);   i++;   }   char *text=new char(i+1);   fseek(fp,0,SEEK_SET);//   ch = fgetc(fp);   int j=0;   while(ch!=EOF)   {       ch = fgetc(fp);   cout<<ch;   text[j]=ch;   j++;   }  // char text[]=new char(i);;   int num = 2;   NaiveBayesClassifier(text,num); /*调用朴素贝叶斯分类函数，返回最终分类结果*/return 1;}

第二个是 OpenCV 自带的贝叶斯分类器（CvNormalBayesClassifier）的使用示例，摘自：http://blog.csdn.net/carson2005/article/details/6854024

[cpp] view plaincopyprint?

//openCV中贝叶斯分类器的API函数用法举例
//运行环境：winXP + VS2008 + openCV2.3.0
//摘自：http://blog.csdn.net/carson2005/article/details/6854024
//#include <stdafx.h>
/*
openCV中与贝叶斯分类器相关的API函数有以下几个：
(1)CvNormalBayesClassifier::CvNormalBayesClassifier();
该函数为默认构造函数；
(2)CvNormalBayesClassifier::CvNormalBayesClassifier(const Mat& trainData, const Mat& response, const Mat& varIdx=Mat(),const Mat& sampleIdx=Mat());
该构造函数在内部调用train()函数进行分类器训练；
(3)bool CvNormalBayesClassifier::train(const Mat& trainData, const Mat& response, const Mat& varIdx=Mat(),const Mat& sampleIdx=Mat());
该函数进行贝叶斯分类器的训练，输入向量必须为行向量；变量response必须为整数，但其在初始化时类型可被设置为CV_32FC1;
所有特征向量必须完整，不允许训练样本集的某一个向量存在数据缺失；
(4)float CvNormalBayesClassifier::predict(const Mat& samples, Mat* result=0);
该函数根据用户输入的测试样本的特征向量，返回其所属的类别；注意，如果输入是很多个测试样本的特征向量组成的矩阵，返回值为result矩阵；
*/
#include "opencv.hpp"
#include <iostream>
using namespace cv;
using namespace std;
// Training set: 10 samples, one per row. Column 0 holds the sample's class label
// (1 or -1 in this data) and columns 1..12 hold its 12 feature values.
double inputArr[10][13] =
{
    {  1, 0.708333,  1,  1,        -0.320755,  -0.105023, -1,  1, -0.419847,  -1, -0.225806,  0,  1        },
    { -1, 0.583333, -1,  0.333333, -0.603774,   1,        -1,  1,  0.358779,  -1, -0.483871,  0, -1        },
    {  1, 0.166667,  1, -0.333333, -0.433962,  -0.383562, -1, -1,  0.0687023, -1, -0.903226, -1, -1        },
    { -1, 0.458333,  1,  1,        -0.358491,  -0.374429, -1, -1, -0.480916,   1, -0.935484,  0, -0.333333 },
    { -1, 0.875,    -1, -0.333333, -0.509434,  -0.347032, -1,  1, -0.236641,   1, -0.935484, -1, -0.333333 },
    { -1, 0.5,       1,  1,        -0.509434,  -0.767123, -1, -1,  0.0534351, -1, -0.870968, -1, -1        },
    {  1, 0.125,     1,  0.333333, -0.320755,  -0.406393,  1,  1,  0.0839695,  1, -0.806452,  0, -0.333333 },
    {  1, 0.25,      1,  1,        -0.698113,  -0.484018, -1,  1,  0.0839695,  1, -0.612903,  0, -0.333333 },
    {  1, 0.291667,  1,  1,        -0.132075,  -0.237443, -1,  1,  0.51145,   -1, -0.612903,  0,  0.333333 },
    {  1, 0.416667, -1,  1,         0.0566038,  0.283105, -1,  1,  0.267176,  -1,  0.290323,  0,  1        }
};
// Feature vector (12 values, no label) of the single sample to classify.
double testArr[] =
{
    0.25, 1, 1, -0.226415, -0.506849, -1, -1, 0.374046, -1, -0.83871, 0, -1
};
// Trains a normal Bayes classifier on inputArr, saves it to disk, reloads it into
// a second classifier object and prints that object's prediction for testArr.
int main()
{
    const int sampleCount = 10;  // rows of inputArr
    const int featureCount = 12; // feature columns per row (the label column excluded)

    // Split each inputArr row into its label (column 0) and its feature vector
    // (columns 1..12), both stored as single-channel 32-bit float matrices.
    Mat trainData(sampleCount, featureCount, CV_32FC1);
    Mat trainResponse(sampleCount, 1, CV_32FC1);
    for (int row = 0; row < sampleCount; ++row)
    {
        trainResponse.at<float>(row, 0) = inputArr[row][0];
        for (int col = 0; col < featureCount; ++col)
        {
            trainData.at<float>(row, col) = inputArr[row][col + 1];
        }
    }

    // Train; give up with exit status -1 when training reports failure.
    CvNormalBayesClassifier nbc;
    if (!nbc.train(trainData, trainResponse))
    {
        cout<<"train error..."<<endl;
        system("pause");
        exit(-1);
    }
    cout<<"train over..."<<endl;
    nbc.save("d:/normalBayes.txt");

    // Reload the saved model into a fresh classifier and use that one to predict.
    CvNormalBayesClassifier testNbc;
    testNbc.load("d:/normalBayes.txt");

    // Build the 1 x 12 test sample from testArr.
    Mat testSample(1, featureCount, CV_32FC1);
    for (int col = 0; col < featureCount; ++col)
    {
        testSample.at<float>(0, col) = testArr[col];
    }

    const float flag = testNbc.predict(testSample); // value returned by predict() for the sample
    cout<<"flag = "<<flag<<endl;
    system("pause");
    return 0;
}