朴素贝叶斯
来源:互联网 发布:数控螺纹编程实例 编辑:程序博客网 时间:2024/05/22 11:50
# -*- coding: utf-8 -*-
from numpy import *
import random
def loadDataSet():
    """Return the toy message-board corpus together with its labels.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` holds six
        tokenized posts; ``classVec`` is the parallel list of labels,
        where 1 marks an abusive post and 0 a normal one.
    """
    # Each entry pairs a label with the tokens of one post, so a post and
    # its label cannot drift apart when the sample data is edited.
    labelledPosts = (
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    )
    postingList = [tokens for _, tokens in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec
# Build the vocabulary: every distinct word that occurs in any document.
def createVocabList(dataset):
    """Return a list of the unique tokens found across all documents.

    Args:
        dataset: iterable of documents, each an iterable of hashable
            tokens.

    Returns:
        A list containing each distinct token once. The order comes from
        a set and is therefore unspecified.
    """
    # set(dataset) would try to hash each document (a list) and fail, so
    # the tokens are flattened into the set one document at a time.
    uniqueWords = {word for document in dataset for word in document}
    return list(uniqueWords)
# Set-of-words model: record which vocabulary words occur in a document.
def setOfWords2Vec(vocablist, inputset):
    """Encode a document as a 0/1 presence vector over ``vocablist``.

    Args:
        vocablist: sequence of vocabulary words; a word's position is
            its slot in the returned vector. Words must be hashable.
        inputset: iterable of the words making up the document.

    Returns:
        A list of ``len(vocablist)`` ints. Slot ``i`` is 1 when
        ``vocablist[i]`` occurs in ``inputset`` and 0 otherwise. If a
        word is repeated in ``vocablist`` only its first slot is used.

    Words that are not in the vocabulary are reported on stdout and
    otherwise ignored.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list twice (``in`` + ``index``) for every input word.
    slotOf = {}
    for slot, vocabWord in enumerate(vocablist):
        slotOf.setdefault(vocabWord, slot)

    returnVec = [0] * len(vocablist)
    for word in inputset:
        slot = slotOf.get(word)
        if slot is None:
            # Parenthesized single-argument print works on Python 2 and 3.
            print("the word :%s is not in the Vocabulary!" % word)
        else:
            returnVec[slot] = 1
    return returnVec
# Bag-of-words model: count how often each vocabulary word occurs.
def bagOfWords2Vec(vocablist, inputset):
    """Encode a document as a vector of word counts over ``vocablist``.

    Args:
        vocablist: sequence of vocabulary words; a word's position is
            its slot in the returned vector. Words must be hashable.
        inputset: iterable of the words making up the document.

    Returns:
        A list of ``len(vocablist)`` ints. Slot ``i`` holds the number of
        times ``vocablist[i]`` occurs in ``inputset``. If a word is
        repeated in ``vocablist`` only its first slot is counted.

    Words that are not in the vocabulary are reported on stdout and
    otherwise ignored.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list twice (``in`` + ``index``) for every input word.
    slotOf = {}
    for slot, vocabWord in enumerate(vocablist):
        slotOf.setdefault(vocabWord, slot)

    returnVec = [0] * len(vocablist)
    for word in inputset:
        slot = slotOf.get(word)
        if slot is None:
            # Parenthesized single-argument print works on Python 2 and 3.
            print("the word :%s is not in the Vocabulary!" % word)
        else:
            returnVec[slot] += 1
    return returnVec
# Module-level demo state: load the toy corpus and build its vocabulary.
# NOTE: these statements run at import time; listOfPosts, listClasses and
# myVocabList are read again by the training statements later in the file.
listOfPosts,listClasses=loadDataSet()
myVocabList=createVocabList(listOfPosts)
# Debug output (disabled): vocabulary size.
# print len(myVocabList)
# Set-of-words vector for the first post of the corpus.
returnVec=setOfWords2Vec(myVocabList, listOfPosts[0])
# Debug output (disabled): the vector above and the sum of the labels.
# print returnVec
# print sum(listClasses)
# Training routine for the naive Bayes classifier.
def trainNBO(trainMatrix, trainCategory):
    """Estimate per-class word log-probabilities and the class-1 prior.

    Args:
        trainMatrix: non-empty sequence of equal-length numeric word
            vectors, one per training document.
        trainCategory: sequence of labels parallel to ``trainMatrix``.
            A label equal to 1 selects class 1; any other value is
            counted as class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the log of the smoothed per-word
        frequencies for class 0 and class 1, and
        ``sum(trainCategory) / len(trainMatrix)`` (the share of class-1
        documents when labels are 0/1).
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Index 0 accumulates class 0, index 1 accumulates class 1.
    # Word counts start at 1 and totals at 2.0 so that a word never seen
    # in a class still gets a non-zero probability (and a finite log).
    wordCounts = [ones(vocabSize), ones(vocabSize)]
    wordTotals = [2.0, 2.0]
    for docIndex, docVector in enumerate(trainMatrix):
        classIndex = 1 if trainCategory[docIndex] == 1 else 0
        wordCounts[classIndex] += docVector
        wordTotals[classIndex] += sum(docVector)

    # Return logs so the classifier can add terms instead of multiplying
    # many small probabilities, which would underflow.
    p0Vect = log(wordCounts[0] / wordTotals[0])
    p1Vect = log(wordCounts[1] / wordTotals[1])
    return p0Vect, p1Vect, pAbusive
# Encode every toy post as a set-of-words vector and fit the classifier
# on the resulting matrix.
trainMatrix = [setOfWords2Vec(myVocabList, data) for data in listOfPosts]
p0v, p1v, pa = trainNBO(trainMatrix, listClasses)
# Debug output (disabled): the prior and the two log-probability vectors.
# print pa,p0v
# print p1v
# Naive Bayes decision rule.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if class 1 scores strictly higher than class 0, else 0.

    Args:
        vec2Classify: numeric word vector of the document; it is
            multiplied elementwise with the probability vectors, so it
            should be a numpy array of the same length.
        p0Vec: per-word log-probabilities for class 0.
        p1Vec: per-word log-probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 or 0. A tie between the two scores yields 0.
    """
    # The denominator of Bayes' rule is identical for both classes, so it
    # is dropped; only the relative size of the two scores matters.
    # In log space the product of per-word probabilities becomes a sum.
    logScore1 = sum(vec2Classify * p1Vec) + log(pClass1)
    logScore0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if logScore1 > logScore0 else 0
#测试算法
# vec2Classify=['love','my','dalmation']
# vec2Classify_2=['stupid','garbage']
# thisDoc=array(setOfWords2Vec(myVocabList, vec2Classify))
# thisDoc_2=array(setOfWords2Vec(myVocabList, vec2Classify_2))
# print classifyNB(thisDoc, p0v, p1v, pa)
# print classifyNB(thisDoc_2, p0v, p1v, pa)
def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenize.

    Returns:
        A list of tokens in order of appearance. The text is cut at every
        non-word character; pieces of length 0, 1 or 2 are discarded and
        the rest are lower-cased.
    """
    import re
    # Splitting on single non-word characters leaves empty strings between
    # adjacent separators; the length filter below removes them as well.
    pieces = re.split(r'\W', bigString)
    keptPieces = (piece for piece in pieces if len(piece) > 2)
    return [piece.lower() for piece in keptPieces]
def spamTest(emailDir='D:/learn/Ch02/email', numPerClass=25, numTest=10):
    """Train and evaluate the naive Bayes spam filter on a random hold-out.

    Reads ``<emailDir>/spam/<i>.txt`` and ``<emailDir>/ham/<i>.txt`` for
    ``i`` in ``1..numPerClass``, sets ``numTest`` randomly chosen messages
    aside, trains on the remaining ones using bag-of-words vectors and
    prints the error rate measured on the held-out messages.

    Args:
        emailDir: directory that contains the ``spam`` and ``ham``
            folders. Defaults to the location used originally.
        numPerClass: number of numbered files read from each folder.
        numTest: number of messages held out for testing.

    Returns:
        None. The error rate is printed to stdout.

    Raises:
        ValueError: if ``numTest`` is not at least 1 and smaller than the
            number of messages read (at least one message must remain
            for training).
        IOError: if an e-mail file cannot be opened (``OSError`` on
            Python 3).
    """
    docList = []
    classList = []
    for i in range(1, numPerClass + 1):
        # Same order as before: spam i (label 1), then ham i (label 0).
        for folder, label in (('spam', 1), ('ham', 0)):
            # The ham file used to be "opened" with `fr==open(...)`, a
            # comparison, so `fr` still pointed at the exhausted spam file
            # and every ham message was read as empty. Opening each file
            # in its own `with` fixes that and closes the handle.
            with open('%s/%s/%d.txt' % (emailDir, folder, i)) as fr:
                docList.append(textParse(fr.read()))
            classList.append(label)

    if not 0 < numTest < len(docList):
        raise ValueError('numTest must be between 1 and %d, got %r'
                         % (len(docList) - 1, numTest))

    vocabList = createVocabList(docList)

    # Sized from the data instead of a hard-coded 50; list(...) because a
    # Python 3 range object does not support item removal.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(numTest):
        # Move one random document index from the training to the test set.
        testSet.append(trainingSet.pop(random.randrange(len(trainingSet))))

    trainMat = [bagOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNBO(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    # Single-argument print runs on Python 2 and 3; the '%s %s' join
    # reproduces the separator the two-item print statement emitted.
    print('%s %s' % ('the error rate is ', float(errorCount) / len(testSet)))
# spamTest()
0 0