Naive Bayes in Practice

from numpy import *

### Create some sample training documents ###
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]    # 1 is abusive, 0 is not
    return postingList, classVec

### Build a vocabulary list of unique words ###
def createVocabList(dataSet):
    vocabSet = set([])                       # start with an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # the | operator takes the union of two sets
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    """Take the vocabulary list and a document; return the document vector (set-of-words model)."""
    returnVec = [0] * len(vocabList)         # a vector of zeros, one slot per vocabulary word
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1   # list.index(obj) returns the position of obj
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)      # number of rows (documents)
    numWords = len(trainMatrix[0])       # number of columns (vocabulary size)
    pAbusive = sum(trainCategory) / float(numTrainDocs)   # prior probability of class 1
    p0Num = ones(numWords); p1Num = ones(numWords)        # changed to ones(): Laplace smoothing
    p0Denom = 2.0; p1Denom = 2.0                          # changed to 2.0 for the same reason
    # Accumulate, for each vocabulary word, its count within each class
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:                # check the document's class label
            p1Num += trainMatrix[i]              # per-word counts in class-1 documents
            p1Denom += sum(trainMatrix[i])       # total word count in class-1 documents
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num / p1Denom)        # changed to log(): avoids floating-point underflow
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

### Naive Bayes classification function ###
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)          # element-wise multiplication
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

### Naive Bayes bag-of-words model ###
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1   # count occurrences instead of mere presence
    return returnVec

def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

# print(testingNB())

listOPosts, listClass = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = trainNB0(array(trainMat), array(listClass))   # was pOV (letter O), a typo for p0V
print(p1V)
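Two implementation details in trainNB0 deserve a note. Initializing the counts with ones() and the denominators with 2.0 is Laplace (add-one) smoothing: without it, a word that never appears in one class would contribute a probability of zero and wipe out the whole product. Taking log() of the conditional probabilities lets classifyNB add log-terms instead of multiplying many tiny numbers, which would otherwise underflow. A minimal standalone sketch (not from the original post) showing the underflow the log transform prevents:

from numpy import array, log, prod

probs = array([0.01] * 200)     # 200 small conditional probabilities
print(prod(probs))              # 0.0 -- the raw product underflows float64
print(log(probs).sum())         # about -921.0 -- the log-sum stays perfectly usable

With the toy posts above, testingNB() should report ['love', 'my', 'dalmation'] classified as 0 and ['stupid', 'garbage'] classified as 1.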
################## Filtering spam with Naive Bayes ##################

### Parse a big string into a list of lowercase tokens ###
def textParse(bigString):    # input is a big string, output is a word list
    import re
    listOfTokens = re.split(r'\W+', bigString)   # was r'\W*', which splits wrongly in Python 3
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i, 'r').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)            # build the vocabulary
    trainingSet = list(range(50)); testSet = []     # hold out a random test set
    for i in range(10):                             # pick 10 documents for testing
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:        # train the classifier (get probs) with trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:            # classify the held-out documents
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
    # return vocabList, fullText
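Because the ten test documents are drawn at random, the error rate spamTest() prints fluctuates from run to run. A common refinement is to repeat the random split and average; a sketch of that, under the assumptions that spamTest() is modified to return float(errorCount)/len(testSet) rather than only printing it, and that the book's email/spam/*.txt and email/ham/*.txt files sit next to the script:

# hypothetical driver: assumes spamTest() returns its error rate
rates = [spamTest() for _ in range(10)]    # ten independent random train/test splits
print('mean error rate over 10 runs:', sum(rates) / len(rates))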
################## Using Naive Bayes to find region-specific wording in personal ads ##################

### Count each vocabulary word's occurrences in the text, sort descending, and return the most frequent ones ###
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:5]    # the book removes the top 30; this version keeps only the top 5

def localWords(feed1, feed0):
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)    # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)            # create the vocabulary
    top30Words = calcMostFreq(vocabList, fullText)  # drop the most frequent words (top 5 here, despite the name)
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen)); testSet = []    # create the test set
    for i in range(5):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:        # train the classifier (get probs) with trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:            # classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V

def getTopWords(ny, sf):
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])

# The two feeds below stand in for the book's original Craigslist NY/SF RSS feeds,
# which are no longer available; any two RSS feeds with 'summary' fields will do.
# import feedparser
# ny = feedparser.parse("http://www.nasa.gov/rss/dyn/image_of_the_day.rss")
# sf = feedparser.parse("http://sports.yahoo.com/nba/teams/hou/rss.xml")
# # print(len(ny['entries']))
# # print(ny['entries'][1])
# print(getTopWords(ny, sf))
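The -6.0 threshold in getTopWords keeps every word whose smoothed log-probability exceeds it, so the two printed lists overlap heavily. An alternative, not from the original post, is to rank words by the difference of the two log-probability vectors (the log odds between the classes). A sketch, assuming feedparser is installed and both feeds are reachable and carry 'summary' fields:

import feedparser
ny = feedparser.parse("http://www.nasa.gov/rss/dyn/image_of_the_day.rss")
sf = feedparser.parse("http://sports.yahoo.com/nba/teams/hou/rss.xml")
vocabList, p0V, p1V = localWords(ny, sf)
logOdds = p1V - p0V    # > 0 leans toward class 1 (the ny feed), < 0 toward class 0 (the sf feed)
ranked = sorted(zip(vocabList, logOdds), key=lambda pair: pair[1])
print('most class-0-like:', [w for w, _ in ranked[:10]])
print('most class-1-like:', [w for w, _ in ranked[-10:]])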