Naive Bayes Classification


Code for building the basic classifier:

# coding:utf-8
from numpy import *

def loadDataSet():
    # Toy data set: message-board posts and their labels.
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'position', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quite', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]   # 1 = abusive text, 0 = normal text
    return postingList, classVec

def createVocabList(dataSet):
    vocabSet = set([])                        # start with an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)   # union with each document's words
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    # Set-of-words model: each word is recorded at most once.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print "the word: %s is not in my Vocabulary!" % word
    return returnVec

def bagOfWord2VecMN(vocabList, inputSet):
    # Bag-of-words model: count how many times each word appears.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

# listOPosts, listClasses = loadDataSet()
# myVocabList = createVocabList(listOPosts)
# print myVocabList
# print setOfWords2Vec(myVocabList, listOPosts[0])
# print setOfWords2Vec(myVocabList, listOPosts[3])

def trainNB0(trainMatrix, trainCategory):
    # Estimate p(word|class) for both classes and the prior p(abusive).
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    p0Num = ones(numWords); p1Num = ones(numWords)   # start counts at 1 (Laplace smoothing)
    p0Denom = 2.0; p1Denom = 2.0                     # and denominators at 2
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num / p1Denom)   # log-probabilities to avoid underflow
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
# print pAb
# print p0V
# print p1V

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # Compare the log-posterior of the two classes and return the larger one.
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb)

testingNB()
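
Two details in trainNB0 and classifyNB are worth spelling out: the counts start from ones(numWords) with denominators of 2.0 so that a word never seen in one class does not force the whole product to zero (Laplace smoothing), and the probabilities are returned as logs so the many small factors are added rather than multiplied, which avoids numerical underflow. classifyNB then just compares log p(w|class 1) + log p(class 1) against log p(w|class 0) + log p(class 0). The snippet below is only an illustrative sketch, assuming it runs right after the block above so that myVocabList, p0V, p1V and pAb are already defined; it spells out that comparison for one test post.

testEntry = ['stupid', 'garbage']                      # sample post to score
vec = array(setOfWords2Vec(myVocabList, testEntry))    # 0/1 word vector over the vocabulary
score1 = sum(vec * p1V) + log(pAb)        # log p(w|abusive) + log p(abusive)
score0 = sum(vec * p0V) + log(1.0 - pAb)  # log p(w|normal)  + log p(normal)
print 'abusive score:', score1, 'normal score:', score0
print 'classified as:', (1 if score1 > score0 else 0)  # same answer classifyNB gives

Only the words that actually occur in the post contribute to the scores, because the zeros in vec cancel every other entry of the log-probability vectors.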

To use the classifier above to filter email, add the following code:

# Text parsing and the full spam-filter test
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):                     # load 25 spam and 25 ham messages
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = range(50); testSet = []
    for i in range(10):                        # hold out 10 documents at random for testing
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:               # train on the remaining 40 documents
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:                   # count misclassified test documents
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is:', float(errorCount) / len(testSet)

spamTest()
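
Because the ten test documents are chosen at random, the printed error rate changes from run to run. A quick way to get a more stable number (not part of the code above) is to repeat the hold-out split several times and average the results; the sketch below assumes spamTest() has been modified to also return float(errorCount)/len(testSet) instead of only printing it.

# Rough sketch: average the error rate over several random train/test splits.
# Assumption: spamTest() has been changed to end with
#     return float(errorCount) / len(testSet)
numRuns = 10
errorSum = 0.0
for run in range(numRuns):
    errorSum += spamTest()          # each call draws a fresh random test set
print 'average error rate over %d runs: %f' % (numRuns, errorSum / numRuns)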


