Naive Bayes

# -*- coding: utf-8 -*-
from numpy import *

import random
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # 1 means abusive text, 0 means normal
    classVec=[0,1,0,1,0,1]
    return postingList,classVec
# Build the vocabulary: the list of unique words appearing across all documents
def createVocabList(dataset):
    # set(dataset) would raise an error because the documents are (unhashable) lists,
    # so build the union document by document instead
    vocabset=set([])
    for document in dataset:
        # union with the words of this document
        vocabset=vocabset | set(document)
    return list(vocabset)
# Convert an input document (a list of tokens) into a 0/1 vector marking which vocabulary words appear in it
def setOfWords2Vec(vocablist,inputset):
    returnVec=[0]*len(vocablist)
    for word in inputset:
        if word in vocablist:
            returnVec[vocablist.index(word)]=1
        else:
            print "the word :%s is not in the Vocabulary!" %word
    return returnVec

# Bag-of-words model: counts how many times each vocabulary word occurs
def bagOfWords2Vec(vocablist,inputset):
    returnVec=[0]*len(vocablist)
    for word in inputset:
        if word in vocablist:
            returnVec[vocablist.index(word)]+=1
        else:
            print "the word :%s is not in the Vocabulary!" %word
    return returnVec
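
# A quick illustration on a made-up vocabulary and document (not part of the
# data above): setOfWords2Vec only records presence, while bagOfWords2Vec
# counts occurrences.
# print(setOfWords2Vec(['dog', 'stupid', 'my'], ['stupid', 'stupid', 'dog']))  # expected [1, 1, 0]
# print(bagOfWords2Vec(['dog', 'stupid', 'my'], ['stupid', 'stupid', 'dog']))  # expected [1, 2, 0]
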
listOfPosts,listClasses=loadDataSet()
myVocabList=createVocabList(listOfPosts)
# print(len(myVocabList))
returnVec=setOfWords2Vec(myVocabList, listOfPosts[0])
# print(returnVec)
# print(sum(listClasses))
# Naive Bayes classifier training function
def trainNBO(trainMatrix,trainCategory):
    # number of training documents
    numTrain=len(trainMatrix)
    # number of words in the vocabulary
    numWords=len(trainMatrix[0])
    # prior probability that a document is abusive
    pAbusive=sum(trainCategory)/float(numTrain)
    # word-count vectors for the two classes, initialized to ones
    # (Laplace smoothing, so no word ends up with probability zero)
    p0Num=ones(numWords)
    p1Num=ones(numWords)
    # denominators start at 2 to match the smoothing above
    p0Denom=2.0
    p1Denom=2.0
    
    for i in range(numTrain):
        if trainCategory[i]==1:
            p1Num += trainMatrix[i]
            p1Denom +=sum(trainMatrix[i])
        else:
            p0Num +=trainMatrix[i]
            p0Denom +=sum(trainMatrix[i])
    
    # take logs to prevent underflow when many small probabilities are multiplied
    p1Vect=log(p1Num/p1Denom)
    p0Vect=log(p0Num/p0Denom)
    
    return p0Vect,p1Vect,pAbusive

trainMatrix=[]
for data in listOfPosts:
    trainMatrix.append(setOfWords2Vec(myVocabList, data))
p0v,p1v,pa=trainNBO(trainMatrix, listClasses)
# print(pa, p0v)
# print(p1v)
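
# What trainNBO computes, written out: with the +1/+2 smoothing above,
#   p1Vect[i] = log( (1 + summed count of word i over class-1 docs) / (2 + total word count over class-1 docs) )
# and analogously for p0Vect. The helper below is a hypothetical sketch (not in
# the original listing) that recomputes this estimate for a single word so it
# can be compared against the corresponding entry of trainNBO's output.
def checkWordEstimate(word, trainMat, trainCategory, vocabList, pVect, targetClass=1):
    idx=vocabList.index(word)
    classDocs=[trainMat[i] for i in range(len(trainMat)) if trainCategory[i]==targetClass]
    numerator=1.0+sum([vec[idx] for vec in classDocs])      # +1 smoothing on the word count
    denominator=2.0+sum([sum(vec) for vec in classDocs])    # +2 on the class word total
    return log(numerator/denominator), pVect[idx]

# print(checkWordEstimate('stupid', trainMatrix, listClasses, myVocabList, p1v))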

# Naive Bayes classification function
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    # the denominator of Bayes' rule is ignored because we only compare the two posteriors
    # taking logs turns the products into sums
    p1=sum(vec2Classify * p1Vec) + log(pClass1)
    p0=sum(vec2Classify * p0Vec) + log(1.0-pClass1)
    if p1>p0:
        return 1
    else:
        return 0
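
# In formula terms, classifyNB compares the two log-posteriors
#   log P(class=1 | doc) = log(pClass1)   + sum_i vec2Classify[i]*p1Vec[i] + const
#   log P(class=0 | doc) = log(1-pClass1) + sum_i vec2Classify[i]*p0Vec[i] + const
# and picks the larger one; the shared evidence term (the constant) drops out
# because only the comparison matters.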
    
# Test the classifier
# vec2Classify=['love','my','dalmation']
# vec2Classify_2=['stupid','garbage']
# thisDoc=array(setOfWords2Vec(myVocabList, vec2Classify))
# thisDoc_2=array(setOfWords2Vec(myVocabList, vec2Classify_2))
# print(classifyNB(thisDoc, p0v, p1v, pa))
# print(classifyNB(thisDoc_2, p0v, p1v, pa))

def textParse(bigString):
    import re
    # split on non-word characters
    listOfTokens=re.split(r'\W',bigString)
    return [tok.lower() for tok in listOfTokens if len(tok)>2]
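
# A quick check of textParse on a made-up string: it splits on non-word
# characters, lower-cases the tokens, and drops tokens shorter than three
# characters.
# print(textParse('This URL: http://example.com is NOT spam!!'))
# expected: ['this', 'url', 'http', 'example', 'com', 'not', 'spam']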

def spamTest():
    docList=[]
    classList=[]
    fullText=[]
    # load 25 spam and 25 ham emails
    for i in range(1,26):
        fr=open('D:/learn/Ch02/email/spam/%d.txt'%i)
        wordList=textParse(fr.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        
        fr=open('D:/learn/Ch02/email/ham/%d.txt'%i)
        wordList=textParse(fr.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
#     print(len(classList))
    vocabList=createVocabList(docList)
    # 50 emails in total (25 spam + 25 ham); adjust if the dataset changes
    trainingSet=list(range(50))
    testSet=[]

    # randomly hold out 10 emails as the test set
    for i in range(10):
        # pick a random index into what remains of the training set
        randIndex=int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
        
    trainMat=[]
    trainClasses=[]
    
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
        
    p0V,p1V,pSpam=trainNBO(array(trainMat), array(trainClasses))
    errorCount=0
    
    for docIndex in testSet:
        wordVector=bagOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam)!=classList[docIndex]:
            errorCount+=1
    print('the error rate is', float(errorCount)/len(testSet))
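    # return the error rate as well (a small addition, assumed here so the
    # averaging sketch after this function can reuse it)
    return float(errorCount)/len(testSet)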

# spamTest()
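
# Because spamTest() draws a random train/test split on each call, a single run
# gives a noisy error estimate; averaging several runs is steadier. This helper
# is a sketch that relies on the return value added to spamTest() above.
def averageErrorRate(numRuns=10):
    total=0.0
    for _ in range(numRuns):
        total+=spamTest()
    return total/numRuns

# print(averageErrorRate())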
            
        

        
    
        