朴素贝叶斯算法学习笔记(三)显示地域相关的用词

来源:互联网 发布:查询pid占用了哪个端口 编辑:程序博客网 时间:2024/06/17 20:55
#coding=utf-8
"""Naive Bayes demo: find words characteristic of two regional RSS feeds.

Trains a classifier on the entry summaries of two feeds (labelled 1 and 0),
reports the error rate on a random hold-out set, and prints the words whose
per-class scores exceed a threshold.
"""
import operator
from collections import Counter

import feedparser
from numpy import *  # provides ``random`` (numpy.random) used in localwords

import bayes
import bayes_email


# Compute the most frequent words (thirty by default).
def calMostFreq(vocabList, fullText, topN=30):
    """Return the ``topN`` most frequent vocabulary words in ``fullText``.

    Args:
        vocabList: iterable of hashable tokens to rank.
        fullText: iterable of tokens (with repetitions) to count in.
        topN: maximum number of (token, count) pairs to return.

    Returns:
        A list of ``(token, count)`` tuples sorted by descending count,
        at most ``topN`` long.
    """
    # Count every token once instead of scanning fullText per vocab word.
    counts = Counter(fullText)
    freqDict = dict((token, counts[token]) for token in vocabList)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1),
                        reverse=True)
    return sortedFreq[:topN]


def localwords(feed1, feed0, numTest=20):
    """Train and evaluate a naive Bayes classifier on two parsed feeds.

    Documents are taken pairwise from ``feed1`` (class 1) and ``feed0``
    (class 0), using only as many entries as the shorter feed has. The most
    frequent words are dropped from the vocabulary, ``numTest`` randomly
    chosen documents are held out for testing, and the rest are used for
    training. The error rate and the words of the misclassified documents
    are printed.

    Args:
        feed1: mapping with an ``'entries'`` list whose items have a
            ``'summary'`` key; documents get class label 1.
        feed0: same structure as ``feed1``; documents get class label 0.
        numTest: number of documents held out for testing.

    Returns:
        ``(vocabList, p0V, p1V)`` where ``p0V`` and ``p1V`` are the first two
        values returned by ``bayes.trainNB0``.

    Raises:
        ValueError: if ``numTest`` is not between 1 and the number of
            documents minus one (at least one document must remain for
            training).
    """
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    docCount = 2 * minLen
    # Guard clause: the original sampled 20 documents unconditionally and
    # failed with an opaque IndexError on short or empty feeds.
    if not 0 < numTest < docCount:
        raise ValueError(
            'numTest must be between 1 and %d (document count - 1), got %d'
            % (docCount - 1, numTest))

    docList = []
    classList = []
    fullText = []
    for i in range(minLen):
        # Interleave the feeds: class 1 document first, then class 0.
        for label, feed in ((1, feed1), (0, feed0)):
            # NOTE(review): textParse presumably tokenizes the summary into
            # a list of words -- confirm against bayes_email.
            wordList = bayes_email.textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = bayes.createVocabList(docList)
    # Drop the most frequent words from the vocabulary.
    frequentWords = set(pair[0] for pair in calMostFreq(vocabList, fullText))
    vocabList = [word for word in vocabList if word not in frequentWords]

    # Randomly move numTest document indices from the training set to the
    # test set. list(...) keeps this working where range() is not a list.
    trainingSet = list(range(docCount))
    testSet = []
    for _ in range(numTest):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = bayes.trainNB0(trainMat, trainClasses)

    errorCount = 0
    errorData = []
    for docIndex in testSet:
        # Build test vectors with the same helper used for training; the
        # original used setOfWords2Vec here, so test features did not match
        # the features the model was trained on.
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            errorData.extend(docList[docIndex])
    # Single-argument print calls produce the same output on Python 2 and 3.
    print('the error rate is : %s' % (float(errorCount) / len(testSet),))
    print('the error data is : %s' % (errorData,))
    return vocabList, p0V, p1V


def getTopWords(ny, sf, threshold=-6.0):
    """Print the words scoring above ``threshold`` for each feed.

    Calls ``localwords(ny, sf)`` (``ny`` is class 1, ``sf`` is class 0) and
    prints, under an SF banner and then an NY banner, the vocabulary words
    whose corresponding ``p0V`` / ``p1V`` entry is greater than
    ``threshold``, highest score first.

    Args:
        ny: parsed feed passed to ``localwords`` as ``feed1``.
        sf: parsed feed passed to ``localwords`` as ``feed0``.
        threshold: minimum score (exclusive) for a word to be listed.
            NOTE(review): the negative default suggests the scores are log
            probabilities -- confirm against bayes.trainNB0.
    """
    vocabList, p0V, p1V = localwords(ny, sf)
    topSF = []
    topNY = []
    for word, p0, p1 in zip(vocabList, p0V, p1V):
        if p0 > threshold:
            topSF.append((word, p0))
        if p1 > threshold:
            topNY.append((word, p1))

    score = operator.itemgetter(1)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sorted(topSF, key=score, reverse=True):
        print(item[0])
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sorted(topNY, key=score, reverse=True):
        print(item[0])


if __name__ == '__main__':
    # Fetch the feeds only when run as a script, not on import.
    ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    getTopWords(ny, sf)
阅读全文
0 0
原创粉丝点击