西瓜书 "Machine Learning" Exercise Solutions: chapter7_7.3

7.3 Write a program that implements a naive Bayes classifier with Laplacian correction, train it on watermelon dataset 3.0, and use it to classify the "测1" test sample on p.151.
Solution:
Training a naive Bayes classifier amounts to estimating the class prior probabilities and the per-attribute class-conditional probabilities from the training set. At test time these estimates are multiplied into an (un-normalized) posterior for each class, and the class with the larger value is returned. Trained on watermelon dataset 3.0, the Laplacian-corrected naive Bayes classifier predicts the test sample as positive, i.e. a good melon.
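For reference, the estimates computed during training are the Laplace-corrected frequencies, with a Gaussian density for the two continuous attributes (density and sugar ratio):

\hat{P}(c) = \frac{|D_c| + 1}{|D| + N}, \qquad
\hat{P}(x_i \mid c) = \frac{|D_{c,x_i}| + 1}{|D_c| + N_i}, \qquad
p(x_i \mid c) = \frac{1}{\sqrt{2\pi}\,\sigma_{c,i}} \exp\!\left( -\frac{(x_i - \mu_{c,i})^2}{2\sigma_{c,i}^2} \right)

where N is the number of classes, N_i the number of possible values of attribute i, D_c the training samples of class c, and D_{c,x_i} the samples in D_c taking value x_i on attribute i; \mu_{c,i} and \sigma_{c,i}^2 are the class-conditional mean and variance of continuous attribute i.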

Code:

# -*- coding: utf-8 -*-
"""
@Author: Victoria
@Date: 2017.10.17 10:30
"""
import math

import xlrd  # note: reading .xlsx files requires xlrd < 2.0 (or switch to openpyxl)


class LaplacianNB:
    """
    Laplacian-corrected naive Bayes for a binary classification problem.
    """
    def __init__(self):
        pass

    def train(self, X, y):
        """
        Train the Laplacian naive Bayes classifier on the training set (X, y).
        Input:
            X: list of instances. Each instance is represented by
               (color, root, knock sound, texture, navel, touch, density, sugar ratio);
               the first six attributes are discrete (integer-coded), the last two continuous.
            y: list of labels. 0 represents a bad melon, 1 a good melon.
        """
        N = len(y)
        self.classes = self.count_list(y)
        self.class_num = len(self.classes)
        # Laplace-corrected class priors: P(c) = (|D_c| + 1) / (|D| + number of classes).
        self.classes_p = {}
        for c, n in self.classes.items():
            self.classes_p[c] = float(n + 1) / (N + self.class_num)
        # Laplace-corrected conditional probabilities for the six discrete attributes.
        self.discrete_attris_with_good_p = []
        self.discrete_attris_with_bad_p = []
        for i in range(6):
            attr_with_good = []
            attr_with_bad = []
            for j in range(N):
                if y[j] == 1:
                    attr_with_good.append(X[j][i])
                else:
                    attr_with_bad.append(X[j][i])
            unique_with_good = self.count_list(attr_with_good)
            unique_with_bad = self.count_list(attr_with_bad)
            self.discrete_attris_with_good_p.append(self.discrete_p(unique_with_good, self.classes[1]))
            self.discrete_attris_with_bad_p.append(self.discrete_p(unique_with_bad, self.classes[0]))
        # Per-class mean and variance for the two continuous attributes (density, sugar ratio).
        self.good_mus = []
        self.good_vars = []
        self.bad_mus = []
        self.bad_vars = []
        for i in range(2):
            attr_with_good = []
            attr_with_bad = []
            for j in range(N):
                if y[j] == 1:
                    attr_with_good.append(X[j][i + 6])
                else:
                    attr_with_bad.append(X[j][i + 6])
            good_mu, good_var = self.mu_var_of_list(attr_with_good)
            bad_mu, bad_var = self.mu_var_of_list(attr_with_bad)
            self.good_mus.append(good_mu)
            self.good_vars.append(good_var)
            self.bad_mus.append(bad_mu)
            self.bad_vars.append(bad_var)

    def predict(self, x):
        """
        Return the two un-normalized class scores and the predicted label for instance x.
        """
        p_good = self.classes_p[1]
        p_bad = self.classes_p[0]
        for i in range(6):
            p_good *= self.discrete_attris_with_good_p[i][x[i]]
            p_bad *= self.discrete_attris_with_bad_p[i][x[i]]
        for i in range(2):
            p_good *= self.continuous_p(x[i + 6], self.good_mus[i], self.good_vars[i])
            p_bad *= self.continuous_p(x[i + 6], self.bad_mus[i], self.bad_vars[i])
        if p_good >= p_bad:
            return p_good, p_bad, 1
        else:
            return p_good, p_bad, 0

    def count_list(self, l):
        """
        Get the unique elements of a list and their counts.
        """
        unique_dict = {}
        for e in set(l):
            unique_dict[e] = l.count(e)
        return unique_dict

    def discrete_p(self, d, N_class):
        """
        Laplace-corrected conditional probability for each observed value of a
        discrete attribute: (count + 1) / (N_class + number of observed values).
        Note: the textbook formula divides by N_class + N_i, where N_i is the number
        of possible values of the attribute over the whole dataset, not just the
        values seen within this class.
        """
        new_d = {}
        for a, n in d.items():
            new_d[a] = float(n + 1) / (N_class + len(d))
        return new_d

    def continuous_p(self, x, mu, var):
        # Gaussian density with mean mu and variance var.
        p = 1.0 / (math.sqrt(2 * math.pi) * math.sqrt(var)) * math.exp(-(x - mu) ** 2 / (2 * var))
        return p

    def mu_var_of_list(self, l):
        # Mean and (population) variance of a list of numbers.
        mu = sum(l) / float(len(l))
        var = 0
        for i in range(len(l)):
            var += (l[i] - mu) ** 2
        var = var / float(len(l))
        return mu, var


if __name__ == "__main__":
    lnb = LaplacianNB()
    # The spreadsheet stores one sample per column: rows 1-6 hold the integer-coded
    # discrete attributes, rows 7-8 density and sugar ratio, row 9 the label.
    workbook = xlrd.open_workbook("../../数据/3.0.xlsx")
    sheet = workbook.sheet_by_name("Sheet1")
    X = []
    for i in range(17):
        x = sheet.col_values(i)
        for j in range(6):
            x[j] = int(x[j])
        x.pop()  # drop the label from the feature vector
        X.append(x)
    y = sheet.row_values(8)
    y = [int(i) for i in y]
    lnb.train(X, y)
    # Integer-coded "测1" test sample from p.151 (density 0.697, sugar ratio 0.460).
    label = lnb.predict([1, 1, 1, 1, 1, 1, 0.697, 0.460])
    print("predict result: ", label)

Result:

predict result:  (0.03191920486294201, 4.9158340214165893e-05, 1)  # the un-normalized score for the positive class, the un-normalized score for the negative class, and the predicted label
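
As a small follow-up (not part of the original script), the two un-normalized scores can be turned into posterior probabilities by dividing each by their sum; a minimal sketch in Python using the values printed above:

# Hypothetical helper (not in the original script): normalize the two
# un-normalized class scores returned by predict() so they sum to 1.
def to_posteriors(p_good, p_bad):
    total = p_good + p_bad
    return p_good / total, p_bad / total

print(to_posteriors(0.03191920486294201, 4.9158340214165893e-05))
# roughly (0.9985, 0.0015): the posterior mass overwhelmingly favors the good-melon class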