A Python Demo of Naive Bayes Classification


How the naive Bayes classifier works

P(H) is the prior (unconditional) probability of an event H, and P(H|X) is the posterior probability of H given that X has occurred. Here H usually stands for a class and X for a sample, so P(H|X) is the probability that sample X belongs to class H. Given N classes H1 .. HN, the class Hk with the largest P(Hk|X) is the one we are after, because it is the class most likely to hold once X is observed. Bayes' theorem gives

P(H|X) = [P(X|H) * P(H)] / P(X)

The classifier is built under the following assumptions:

1. Each tuple has n attribute values X = {x1, x2, ..., xn} for attributes {A1 ... An}, and the attributes are mutually independent.
2. There are M training tuples whose class labels are known.
3. There are K classes C1 ... Ck whose prior probabilities are known; if they are unknown, assume each prior is 1/K.

The task is: for any new tuple Xgive, find the class Cget that maximizes P(Cget|Xgive) relative to all other classes.

Derivation: P(Ci|X) = [P(X|Ci) * P(Ci)] / P(X). P(X) is the same (and in fact unknown) for every class, so maximizing P(Ci|X) only requires maximizing P(X|Ci) * P(Ci); if P(Ci) is taken to be 1/K, it suffices to maximize P(X|Ci). Because X = {x1, x2, ..., xn} and the attributes {A1 ... An} are assumed independent,

P(X|Ci) * P(Ci) = P(x1|Ci) * P(x2|Ci) * ... * P(xn|Ci) * P(Ci)

Each P(xi|Ci) is estimated from the training tuples in one of two ways:
- If Ai is discrete, P(xi|Ci) = (number of training tuples of class Ci with Ai = xi) / (total number of training tuples of class Ci).
- If Ai is continuous, assume Ai follows a Gaussian distribution within class Ci, estimate the mean and standard deviation from the training data, and take P(xi|Ci) to be the Gaussian density at xi.

Question 1: where do the training tuples come from?

The classifier below reads them from three plain-text files:

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# Load the training data.
# File format: attribute id, continuous? [yes|no], attribute description
attribute_file_dest = 'F:\\bayes_categorize\\attribute.dat'
attribute_file = open(attribute_file_dest)

# File format: rec_id,attr1_value,attr2_value,...,attrn_value,class_id
trainning_data_file_dest = 'F:\\bayes_categorize\\trainning_data.dat'
trainning_data_file = open(trainning_data_file_dest)

# File format: class_id,class_desc
class_desc_file_dest = 'F:\\bayes_categorize\\class_desc.dat'
class_desc_file = open(class_desc_file_dest)

# Attribute id -> (continuous?, description)
attr_dict = {}
for line in attribute_file:
    fld_list = line.strip().split(',')
    attr_dict[int(fld_list[0])] = tuple(fld_list[1:])

# Class id -> description
class_dict = {}
for line in class_desc_file:
    fld_list = line.strip().split(',')
    class_dict[int(fld_list[0])] = fld_list[1]

# rec_id -> attribute values, and class id -> set of member rec_ids
trainning_data_dict = {}
class_member_set_dict = {}
for line in trainning_data_file:
    fld_list = line.strip().split(',')
    rec_id = int(fld_list[0])
    a1 = int(fld_list[1])
    a2 = int(fld_list[2])
    a3 = float(fld_list[3])
    c_id = int(fld_list[4])
    if c_id not in class_member_set_dict:
        class_member_set_dict[c_id] = set()
    class_member_set_dict[c_id].add(rec_id)
    trainning_data_dict[rec_id] = (a1, a2, a3)

attribute_file.close()
class_desc_file.close()
trainning_data_file.close()

# Prior probability of each class: its share of the training tuples.
class_possibility_dict = {}
for c_id in class_member_set_dict:
    class_possibility_dict[c_id] = (len(class_member_set_dict[c_id]) + 0.0) / len(trainning_data_dict)

# Data waiting to be classified (the last field is the known class id, kept to check the result).
data_to_classify_file_dest = 'F:\\bayes_categorize\\trainning_data_new.dat'
data_to_classify_file = open(data_to_classify_file_dest)
data_to_classify_dict = {}
for line in data_to_classify_file:
    fld_list = line.strip().split(',')
    rec_id = int(fld_list[0])
    a1 = int(fld_list[1])
    a2 = int(fld_list[2])
    a3 = float(fld_list[3])
    c_id = int(fld_list[4])
    data_to_classify_dict[rec_id] = (a1, a2, a3, c_id)
data_to_classify_file.close()

# For every tuple to be classified, compute P(X|Ci)*P(Ci) for every class
# and keep the class for which it is largest.
diff_cnt = 0
for rec_id in data_to_classify_dict:
    res_class_id = 0
    max_P_X_Ci = 0.0
    a1_x1 = data_to_classify_dict[rec_id][0]
    a2_x2 = data_to_classify_dict[rec_id][1]
    a3_x3 = data_to_classify_dict[rec_id][2]
    for c_id in class_possibility_dict:
        P_Ci = class_possibility_dict[c_id]
        cnt_Ci = len(class_member_set_dict[c_id])

        # P(x1|Ci): a1 is discrete, so count matching tuples inside the class
        cnt_x1_Ci = len([tmp_rec_id for tmp_rec_id in trainning_data_dict
                         if trainning_data_dict[tmp_rec_id][0] == a1_x1
                         and tmp_rec_id in class_member_set_dict[c_id]])
        P_x1_Ci = (cnt_x1_Ci + 0.0) / cnt_Ci

        # P(x2|Ci): a2 is discrete as well
        cnt_x2_Ci = len([tmp_rec_id for tmp_rec_id in trainning_data_dict
                         if trainning_data_dict[tmp_rec_id][1] == a2_x2
                         and tmp_rec_id in class_member_set_dict[c_id]])
        P_x2_Ci = (cnt_x2_Ci + 0.0) / cnt_Ci

        # P(x3|Ci): a3 is continuous, so treat it as Gaussian within the class
        # (mean and standard deviation estimated from the class members) and use the density at a3_x3.
        a3_data = [trainning_data_dict[tmp_rec_id][2] for tmp_rec_id in trainning_data_dict
                   if tmp_rec_id in class_member_set_dict[c_id]]
        a3_std_err = np.sqrt(np.var(a3_data))
        a3_mean = np.mean(a3_data)
        P_x3_Ci = mlab.normpdf(a3_x3, a3_mean, a3_std_err)

        res = P_x1_Ci * P_x2_Ci * P_x3_Ci * P_Ci
        if res > max_P_X_Ci:
            max_P_X_Ci = res
            res_class_id = c_id

    if res_class_id == 0:
        print('error 2')

    if res_class_id != data_to_classify_dict[rec_id][3]:
        print('different')
        print(res_class_id)
        print(data_to_classify_dict[rec_id])
        diff_cnt += 1

print(diff_cnt)
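A note on matplotlib.mlab.normpdf, which the loop above uses for P(x3|Ci): this helper is no longer available in recent matplotlib releases. If the demo is run against a current matplotlib, the same Gaussian density can be computed directly with NumPy; the gaussian_pdf helper below is an illustrative stand-in, not part of the original script.

import numpy as np

def gaussian_pdf(x, mean, std):
    # Density of a normal distribution with the given mean and standard deviation,
    # evaluated at x; a drop-in substitute for mlab.normpdf(x, mean, std).
    return np.exp(-((x - mean) ** 2) / (2.0 * std ** 2)) / (std * np.sqrt(2.0 * np.pi))

# e.g. P_x3_Ci = gaussian_pdf(a3_x3, a3_mean, a3_std_err)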
The script that generates the test data (presumably run once with the output path changed to trainning_data.dat to produce the training set, and once as shown to produce trainning_data_new.dat):

# -*- coding: utf-8 -*-
import numpy as np
from random import random as rdn

# Attributes: a1 discrete [1 -- 10], a2 discrete [1 -- 10], a3 continuous [1 -- 100], normally distributed
# Classes: c1, c2, c3, c4, c5, c6, c7, c8
# Records: 1 - 1000

'''
c1 : a1[1 - 3]  a2[4 - 10] a3[<= 50]
c2 : a1[1 - 3]  a2[4 - 10] a3[> 50]
c3 : a1[1 - 3]  a2[1 - 3]  a3[> 30]
c4 : a1[1 - 3]  a2[1 - 3]  a3[<= 30]
c5 : a1[4 - 10] a2[4 - 10] a3[<= 50]
c6 : a1[4 - 10] a2[4 - 10] a3[> 50]
c7 : a1[4 - 10] a2[1 - 3]  a3[> 30]
c8 : a1[4 - 10] a2[1 - 3]  a3[<= 30]
'''

data_file = open('F:\\bayes_categorize\\trainning_data_new.dat', 'w')
# a3 is drawn from a normal distribution with mean 50 and standard deviation 30
a3_data = np.random.randn(1000) * 30 + 50

for i in range(1, 1001):
    rec_id = i
    a1 = int(rdn() * 10) + 1
    if a1 > 10:
        a1 = 10
    a2 = int(rdn() * 10) + 1
    if a2 > 10:
        a2 = 10
    a3 = a3_data[i - 1]

    # Assign the class id according to the rules above
    c_id = 0
    if a1 <= 3 and a2 >= 4 and a3 <= 50:
        c_id = 1
    elif a1 <= 3 and a2 >= 4 and a3 > 50:
        c_id = 2
    elif a1 <= 3 and a2 < 4 and a3 > 30:
        c_id = 3
    elif a1 <= 3 and a2 < 4 and a3 <= 30:
        c_id = 4
    elif a1 > 3 and a2 >= 4 and a3 <= 50:
        c_id = 5
    elif a1 > 3 and a2 >= 4 and a3 > 50:
        c_id = 6
    elif a1 > 3 and a2 < 4 and a3 > 30:
        c_id = 7
    elif a1 > 3 and a2 < 4 and a3 <= 30:
        c_id = 8
    else:
        print('error')

    str_line = str(rec_id) + ',' + str(a1) + ',' + str(a2) + ',' + str(a3) + ',' + str(c_id) + '\n'
    data_file.write(str_line)
data_file.close()

Configuration file (the attribute description file, attribute.dat):

1,no,
2,no,
3,yes,
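The post itself does not use scikit-learn, but as a rough sanity check the same two data files can be fed to sklearn.naive_bayes.GaussianNB. Note that GaussianNB treats every attribute as Gaussian, so the two discrete attributes a1 and a2 are only approximated and the error count may differ from the hand-rolled classifier above. A minimal sketch, assuming scikit-learn is installed:

import numpy as np
from sklearn.naive_bayes import GaussianNB

def load(path):
    # Each line: rec_id,a1,a2,a3,class_id
    rows = np.loadtxt(path, delimiter=',')
    return rows[:, 1:4], rows[:, 4].astype(int)

X_train, y_train = load('F:\\bayes_categorize\\trainning_data.dat')
X_test, y_test = load('F:\\bayes_categorize\\trainning_data_new.dat')

clf = GaussianNB().fit(X_train, y_train)
print((clf.predict(X_test) != y_test).sum())  # number of misclassified tuples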

