朴素贝叶斯算法学习笔记(二)使用算法进行交叉验证

来源:互联网 发布:淘宝里猜你喜欢在哪里 编辑:程序博客网 时间:2024/06/05 18:54
import re

import bayes
from numpy import *  # star import kept as in the original; provides numpy's `random`


def bagOfWords2VecMN(vocabList, inputSet):
    """Return a bag-of-words count vector for one document.

    Args:
        vocabList: Sequence of vocabulary words. Position i of the result
            counts the occurrences of vocabList[i].
        inputSet: Iterable of words making up the document.

    Returns:
        A list of ints with len(vocabList) entries. Words that are not in
        vocabList are ignored.
    """
    # Map each word to its first position (same result as list.index) so the
    # per-word lookup does not rescan the whole vocabulary.
    firstIndex = {}
    for position, word in enumerate(vocabList):
        firstIndex.setdefault(word, position)

    # Must be a list of zeros: the original `0*len(vocabList)` was the int 0,
    # which cannot be indexed.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec


def textParse(bigString):
    """Split bigString into lower-cased tokens longer than two characters.

    Args:
        bigString: The raw text to tokenize.

    Returns:
        A list of lower-cased tokens, split on runs of non-word characters.
    """
    # r'\W+' rather than r'\W*': a pattern that can match the empty string
    # makes Python 3.7+ split between every character.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def _readText(path):
    """Return the full contents of the file at path, closing it afterwards."""
    with open(path) as handle:
        return handle.read()


def spamTest():
    """Run one random hold-out evaluation of the classifier in `bayes`.

    Reads email/spam/1.txt..25.txt (labelled 1) and email/ham/1.txt..25.txt
    (labelled 0), moves 10 randomly chosen documents into a test set, trains
    on the rest with bayes.trainNB0, classifies the held-out documents with
    bayes.classifyNB, and prints the fraction that were misclassified.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Same order as before: spam document i, then ham document i.
        for pattern, label in (('email/spam/%d.txt', 1),
                               ('email/ham/%d.txt', 0)):
            docList.append(textParse(_readText(pattern % i)))
            classList.append(label)

    vocabList = bayes.createVocabList(docList)

    # A real list is required: Python 3 range objects do not support deletion.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [bayes.setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0v, p1v, pSpam = bayes.trainNB0(trainMat, trainClasses)

    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(wordVector, p0v, p1v, pSpam) != classList[docIndex]:
            errorCount += 1

    # Single pre-formatted argument so the output text is the same whether
    # print is a statement (Python 2) or a function (Python 3).
    print('the error rate is : %s' % (float(errorCount) / len(testSet)))


if __name__ == '__main__':
    # Two runs, as in the original script; each run draws its own random split.
    spamTest()
    spamTest()
阅读全文
0 0
原创粉丝点击