朴素贝叶斯 Python

来源:互联网 发布:psv如何重构数据库 编辑:程序博客网 时间:2024/06/18 08:19

朴素贝叶斯分类
1. 预处理:OneHotEncoder(sklearn)
2. 计算特征属性的条件概率时，分母（各类别的样本计数）的初始值设为1

#coding=utf-8from numpy import *class NaiveBayesClassifier(object):    def __init__(self):        self.dataMat = list()        self.labelMat = list()        self.pLabel1 = 0        self.p0Vec = list()        self.p1Vec = list()    def loadDataSet(self):        self.dataset = mat([            ['overcast', 'hot', 'high', 'FALSE', 'yes'],            ['rainy', 'mild', 'high', 'FALSE', 'yes'],            ['rainy', 'cool', 'normal', 'FALSE', 'yes'],            ['overcast', 'cool', 'normal', 'TRUE', 'yes'],            ['sunny', 'cool', 'normal', 'FALSE', 'yes'],            ['rainy', 'mild', 'normal', 'FALSE', 'yes'],            ['sunny', 'mild', 'normal', 'TRUE', 'yes'],            ['overcast', 'mild', 'high', 'TRUE', 'yes'],            ['overcast', 'hot', 'normal', 'FALSE', 'yes'],            ['sunny', 'hot', 'high', 'FALSE', 'no'],            ['sunny', 'hot', 'high', 'TRUE', 'no'],            ['rainy', 'cool', 'normal', 'TRUE', 'no'],            ['sunny', 'mild', 'high', 'FALSE', 'no'],            ['rainy', 'mild', 'high', 'TRUE', 'no']])        self.transform()        self.OneHotEncoder()        self.dataMat = self.dataset[:, 0:-1]        self.labelMat = self.dataset[:, -1]    def transform(self):        m, n = shape(self.dataset)        for i in range(n):            types = []            for j in self.dataset[:, i]:                types.append(j[0, 0])            type = list(set(types))            type.sort()            for index in range(len(type)):                self.dataset[:, i][self.dataset[:, i] == type[index]] = index        self.type = type    def OneHotEncoder(self):        from sklearn import preprocessing        self.enc = preprocessing.OneHotEncoder()        self.enc.fit(self.dataset)        self.dataset = mat(self.enc.transform(self.dataset).toarray())    def train(self):        dataNum, featureNum = shape(self.dataMat)        self.pLabel1 = sum(self.labelMat) / float(dataNum)        p0Num = zeros((1, featureNum))        p1Num = zeros((1, featureNum))    
    p0Denom = 1.0        p1Denom = 1.0        for i in range(dataNum):            if self.labelMat[i] == 1:                p1Num += self.dataMat[i]                p1Denom += 1            else:                p0Num += self.dataMat[i]                p0Denom += 1        self.p0Vec = p0Num / p0Denom        self.p1Vec = p1Num / p1Denom    def classify(self, data):        p = multiply(data, self.p1Vec)        p1 = 1        for i in nonzero(p)[1]:            p1 *= p[0, i]        p1 *= self.pLabel1        p = multiply(data, self.p0Vec)        p0 = 1        for i in nonzero(p)[1]:            p0 *= p[0, i]        p0 *= (1.0 - self.pLabel1)        if p1 > p0:            # return self.type[1]            return 1        else:            # return self.type[0]            return 0    def test(self):        self.loadDataSet()        self.train()        sum = 0.0        m, n = shape(self.dataMat)        for i in arange(m):            sum += self.classify(self.dataMat[i, :] == self.labelMat[i, 0])        print("正确率:",sum/m)if __name__ == '__main__':    NB = NaiveBayesClassifier()    NB.test()

实验结果:

正确率: 0.7857142857142857

0 0
原创粉丝点击