Naive Bayes: Principles and a Python Implementation


Strengths and Weaknesses of the Naive Bayes Classifier

Strengths: remains effective with relatively little training data; handles multi-class problems.
Weaknesses: sensitive to how the input data is prepared.
Works with: nominal (categorical) data.

How the Algorithm Works

The naive Bayes classifier is built on Bayes' theorem: from known prior and conditional probabilities we infer the posterior probability that an event holds given the observed evidence.
First, a quick review of Bayes' theorem. For two events A and B:

p(A|B) = p(B|A)p(A)/p(B)

Two things to note about the formula:
1. The probability of event A given event B is in general not equal to the probability of event B given event A, but the two are linked in a definite way.
2. Given any three of the four probabilities, we can compute the fourth, i.e. we can reason backward from an observed result to its source (so-called inverse probability); a small numeric sketch follows below.
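
For instance, here is a minimal numeric sketch of point 2 in Python (all of the numbers are invented for illustration): given p(B|A), p(A), and p(B), we recover p(A|B).

p_A = 0.2                  # p(A): prior probability that an email is spam (made up)
p_B_given_A = 0.5          # p(B|A): the word "free" appears in half of all spam (made up)
p_B = 0.25                 # p(B): "free" appears in a quarter of all email (made up)
p_A_given_B = p_B_given_A * p_A / p_B
print(p_A_given_B)         # 0.4: observing "free" doubles the spam probability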

For a sample described by a multi-dimensional feature vector w, Bayes' theorem for class ci reads:

p(ci|w) = p(w|ci)p(ci)/p(w)

The classifier is called naive because it rests on two assumptions:
1. The sample's features are independent of one another, i.e. p(AB) = p(A)p(B).
2. Every feature is equally important.
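
These two assumptions are what make the computation tractable. Under assumption 1, the likelihood of an N-dimensional word vector factorizes into per-word terms:

p(w|ci) = p(w1|ci)p(w2|ci)...p(wN|ci)

so training reduces to estimating one probability per word and class, which is exactly what trainNB0 below computes. Since p(w) is the same for every class, it can be dropped when comparing classes, and taking logarithms turns the product into a sum that will not underflow:

log p(ci|w) ∝ log p(ci) + log p(w1|ci) + ... + log p(wN|ci)

This is the quantity classifyNB compares across the two classes.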

Python Implementation

from numpy import *

def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive post, 0 = normal post
    return postingList, classVec

# Build a list of every unique word that appears across the documents
def createVocabList(dataSet):
    vocabSet = set()  # a set holds no duplicate elements
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # | takes the union of two sets
        # (the same symbol that denotes bitwise OR is used for set union)
    return list(vocabSet)

# Set-of-words model: record only whether each vocabulary word occurs
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my vocabulary!" % word)
    return returnVec

# Bag-of-words model: record how many times each vocabulary word occurs
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

# Training function for the naive Bayes classifier
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior of the abusive class
    # p0Num = zeros(numWords); p1Num = zeros(numWords)  # one count per vocabulary word
    # p0Denom = 0.0; p1Denom = 0.0                      # raw denominators
    # If any single word probability were 0, the whole product of probabilities
    # would be 0, so initialize counts to 1 and denominators to 2 instead:
    p0Num = ones(numWords); p1Num = ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # p1Vect = p1Num/p1Denom
    # p0Vect = p0Num/p0Denom
    # Multiplying many tiny probabilities underflows to 0, so take the natural
    # log and store log probabilities; the two lines above become:
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

# Classification function: compare the log posterior of each class
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

# Convenience function
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))

# File parsing and the full spam-filter test
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)  # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        # latin-1 avoids decode errors if a sample email is not valid UTF-8
        wordList = textParse(open('email/spam/%d.txt' % i, encoding='latin-1').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i, encoding='latin-1').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50)); testSet = []
    for i in range(10):  # hold out 10 documents at random for testing
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print(docList[docIndex])
    print('the error rate is:', float(errorCount) / len(testSet))

def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

# feed1 and feed0 are RSS feeds already parsed with feedparser.parse()
def localWords(feed1, feed0):
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)            # create vocabulary
    top30Words = calcMostFreq(vocabList, fullText)  # remove the 30 most frequent words
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen)); testSet = []  # create test set
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) with trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V

def getTopWords(ny, sf):
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])
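
A quick way to exercise the code (a minimal sketch; it assumes the email/spam/1.txt through email/ham/25.txt sample files from the Machine Learning in Action dataset sit next to the script, otherwise only testingNB() will run):

if __name__ == '__main__':
    testingNB()  # classifies two hand-built posts against the toy data
    spamTest()   # holds out 10 random emails and prints the error rate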