机器学习实战第四章朴素贝叶斯算法照葫芦画瓢算法实践

来源：互联网发布：国外域名注册商有那些编辑：程序博客网时间：2024/06/01 14:16
完成了朴素贝叶斯分类器的主要算法，并且利用其过滤了垃圾邮件，以及从个人发布的大量广告中学习分类器，并将学习结果转换成可以理解的信息。
用到了feedparser库中相关的函数来访问RSS源，如果是在Windows下，且装有Anaconda的情况下，可以不需要去官网上下包，解压再安装，直接在命令行中
输入conda install feedparser一般就能安装成功，非常方便。
# -*- coding: utf-8 -*-
"""Naive Bayes text classification (Machine Learning in Action, chapter 4).

Follow-along implementation, completed 2017-04-23 20:25.

Overall idea: count how often each word occurs in documents of each class,
turn the counts into conditional probabilities and classify new text by
comparing the class scores. The steps are:

  1. Build word vectors from text.
  2. Compute probabilities from the word vectors.
  3. Build the bag-of-words document model.
  4. Split and parse text, build a training set and classify the held-out
     test documents with naive Bayes.

Author: zzt941006
"""
import io
import operator
import re
from collections import Counter

import numpy as np

# Tokens are separated by runs of non-word characters (whitespace, punctuation).
_TOKEN_SPLIT = re.compile(r'\W+')


def loadDataSet():
    """Return the toy message-board corpus and its labels.

    Returns:
        (postingList, classVec): postingList is a list of already tokenized
        documents; classVec is the parallel list of labels, where 1 marks an
        abusive document and 0 a normal one.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Return the list of distinct words that occur in any document.

    The list is sorted so that word vectors (and therefore trained models)
    are reproducible from run to run.
    """
    vocabSet = set()
    for document in dataSet:
        vocabSet.update(document)
    return sorted(vocabSet)


def _vocabIndex(vocabList):
    """Map each word to its first position in vocabList (like list.index)."""
    index = {}
    for position, word in enumerate(vocabList):
        index.setdefault(word, position)
    return index


def setOfWords2Vec(vocabList, inputSet):
    """Return the set-of-words vector of a document.

    Args:
        vocabList: the vocabulary; position i of the result refers to word i.
        inputSet: iterable of the words of one document.

    Returns:
        A list of 0/1 flags, 1 where the vocabulary word occurs in the
        document. Words missing from the vocabulary are reported on stdout
        and otherwise ignored.
    """
    index = _vocabIndex(vocabList)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = index.get(word)
        if position is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    """Estimate the naive Bayes parameters from labelled word vectors.

    Worked example on the toy corpus (ignoring the smoothing below): 'my'
    occurs 3 times in the normal documents, and those documents contain
    7 + 8 + 9 = 24 words in total, so p(my | normal) = 3 / 24 = 1 / 8.

    Args:
        trainMatrix: one word vector per row, all of the vocabulary length.
        trainCategory: one label per row; 1 selects class 1, any other value
            is counted as class 0.

    Returns:
        (p0Vect, p1Vect, pAbusive): the per-word log-probabilities given
        class 0 and class 1, and the prior probability of class 1.

    Raises:
        ValueError: if trainMatrix holds no documents.
    """
    trainMatrix = np.asarray(trainMatrix)
    trainCategory = np.asarray(trainCategory)
    numTrainDocs = len(trainMatrix)
    if numTrainDocs == 0:
        raise ValueError("trainNB0 needs at least one training document")
    pAbusive = np.sum(trainCategory) / float(numTrainDocs)

    isClass1 = trainCategory == 1
    class1Docs = trainMatrix[isClass1]
    class0Docs = trainMatrix[~isClass1]
    # Counts start at 1 and denominators at 2.0 so that a word never seen in
    # a class does not produce a zero probability (and log(0)).
    p1Num = 1.0 + class1Docs.sum(axis=0)
    p0Num = 1.0 + class0Docs.sum(axis=0)
    p1Denom = 2.0 + class1Docs.sum()
    p0Denom = 2.0 + class0Docs.sum()
    # Logs keep the later product of many small probabilities from
    # underflowing; the product becomes a sum.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one word vector with the parameters from trainNB0.

    By Bayes' rule p(c | w) = p(w | c) * p(c) / p(w). The denominator is the
    same for both classes, so only the numerators are compared, in log form:
    log(p(w | c) * p(c)) = sum_j log(p(w_j | c)) + log(p(c)), with
    p(c = 0) = 1 - p(c = 1). Because trainNB0 already returns logs, the sum
    is an element-wise product with the word vector followed by a sum.

    Returns:
        1 if class 1 scores strictly higher, otherwise 0 (ties go to 0).
    """
    vec2Classify = np.asarray(vec2Classify)
    p1 = np.sum(vec2Classify * np.asarray(p1Vec)) + np.log(pClass1)
    p0 = np.sum(vec2Classify * np.asarray(p0Vec)) + np.log(1 - pClass1)
    return 1 if p1 > p0 else 0


def bagOfWords2VecMN(vocabList, inputSet):
    """Return the bag-of-words vector of a document.

    Unlike setOfWords2Vec, which only records presence (0 or 1), each entry
    here counts how many times the vocabulary word occurs in the document.
    Words missing from the vocabulary are silently ignored.
    """
    index = _vocabIndex(vocabList)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = index.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec


def testingNB():
    """Train on the toy corpus and print the class of two sample posts."""
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    testEntries = (
        ['love', 'my', 'dalmation'],
        ['stupid', 'love', 'my', 'to', 'cute', 'please'],
    )
    for testEntry in testEntries:
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        print("%s %s classified as:  %s"
              % (thisDoc, testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))


def textParse(bigString):
    """Split one big string into a list of lower-cased tokens.

    The string is split on runs of non-word characters, which drops
    whitespace and punctuation; tokens of two characters or fewer are
    discarded.
    """
    return [tok.lower() for tok in _TOKEN_SPLIT.split(bigString)
            if len(tok) > 2]


# Backward-compatible alias: the function was originally defined under this
# misspelled name while every caller used textParse.
textPrase = textParse


def _readText(path):
    """Return the content of a text file, closing the file afterwards."""
    # Undecodable bytes are dropped so one stray byte does not abort a run.
    with io.open(path, encoding='utf-8', errors='ignore') as handle:
        return handle.read()


def _splitTrainTest(numDocs, numTest):
    """Randomly move numTest of the indices 0..numDocs-1 into a test set.

    Returns:
        (trainingSet, testSet): two disjoint lists of document indices.
    """
    trainingSet = list(range(numDocs))
    testSet = []
    for _ in range(min(numTest, numDocs)):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        # Removing the pick keeps the test indices distinct.
        testSet.append(trainingSet.pop(randIndex))
    return trainingSet, testSet


def _holdOutTest(vocabList, docList, classList, numTest, showErrors):
    """Train on a random subset, test on the rest and print the error rate.

    Returns:
        (p0V, p1V, errorRate) for the model trained on the training subset.
    """
    trainingSet, testSet = _splitTrainTest(len(docList), numTest)
    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        predicted = classifyNB(np.array(wordVector), p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1
            if showErrors:
                print("classification error %s" % (docList[docIndex],))
    errorRate = float(errorCount) / len(testSet) if testSet else 0.0
    print('the error rate is: %s' % errorRate)
    return p0V, p1V, errorRate


def spamTest():
    """Hold-out test of the classifier on the spam/ham e-mail corpus.

    Reads email/spam/1.txt .. 25.txt (label 1) and email/ham/1.txt .. 25.txt
    (label 0), holds out 10 random documents, trains on the other 40 and
    prints each misclassified document followed by the error rate.

    Returns:
        The error rate on the 10 held-out documents.
    """
    docList = []    # tokenized documents, like postingList in loadDataSet
    classList = []  # label of each document; spam and ham alternate
    for i in range(1, 26):
        docList.append(textParse(_readText('email/spam/%d.txt' % i)))
        classList.append(1)
        docList.append(textParse(_readText('email/ham/%d.txt' % i)))
        classList.append(0)
    vocabList = createVocabList(docList)
    _, _, errorRate = _holdOutTest(vocabList, docList, classList, 10, True)
    return errorRate


def calcMostFreq(vocabList, fullText):
    """Return the 30 most frequent vocabulary words.

    Args:
        vocabList: the words to rank.
        fullText: flat list of every token of every document (duplicates
            kept).

    Returns:
        Up to 30 (word, count) pairs ordered by descending count.
    """
    counts = Counter(fullText)
    freqDict = {token: counts[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1),
                        reverse=True)
    return sortedFreq[:30]


def localWords(feed1, feed0):
    """Train and test the classifier on the entries of two RSS feeds.

    Works like spamTest, but the documents are the 'summary' fields of two
    already parsed feeds (for example the results of feedparser.parse), and
    the 30 most frequent words are removed from the vocabulary first; those
    are mostly filler words, and dropping them tends to improve accuracy.

    Args:
        feed1: parsed feed whose entries are labelled 1.
        feed0: parsed feed whose entries are labelled 0.

    Returns:
        (vocabList, p0V, p1V): the reduced vocabulary and the per-word
        log-probabilities for class 0 and class 1.

    Raises:
        ValueError: if either feed has no entries.
    """
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    if minLen == 0:
        raise ValueError("both feeds must contain at least one entry")

    docList = []
    classList = []
    fullText = []
    for i in range(minLen):
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            # extend, not append: word frequencies need a flat token list.
            fullText.extend(wordList)
            classList.append(label)

    vocabList = createVocabList(docList)
    tooFrequent = set(word for word, _ in calcMostFreq(vocabList, fullText))
    vocabList = [word for word in vocabList if word not in tooFrequent]

    # Hold out 20 documents, but always leave at least one to train on.
    numTest = min(20, len(docList) - 1)
    p0V, p1V, _ = _holdOutTest(vocabList, docList, classList, numTest, False)
    return vocabList, p0V, p1V


def getTopWords(ny, sf):
    """Print the most characteristic words of two RSS feeds.

    Trains on the two parsed feeds with localWords and, for each class,
    prints every word whose log-probability exceeds -6.0, most probable
    first, followed by the whole reduced vocabulary.

    Args:
        ny: parsed feed used as class 1.
        sf: parsed feed used as class 0.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []
    topSF = []
    for word, p0, p1 in zip(vocabList, p0V, p1V):
        if p0 > -6.0:
            topSF.append((word, p0))
        if p1 > -6.0:
            topNY.append((word, p1))

    sortedSF = sorted(topSF, key=operator.itemgetter(1), reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=operator.itemgetter(1), reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])
    print("VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**")
    print(vocabList)
0 0