基于朴素贝叶斯算法——进行文档分类

来源:互联网 发布:控制台窗口界面编程 编辑:程序博客网 时间:2024/06/14 06:36

使用贝叶斯进行文档分类

贝叶斯的核心思想:选择具有最高概率的决策
应用贝叶斯准则得到:

p(ci|x,y) = p(x,y|ci) p(ci) / p(x,y)

如果p(c1|x,y)>p(c2|x,y),那么属于类别c1
如果p(c1|x,y)<p(c2|x,y),那么属于类别c2

准备数据:从文本中构建词向量

def loadDataSet():
    """Return a small toy corpus and its labels.

    Returns:
        postingList: list of tokenized documents (each a list of words).
        classVec: parallel list of labels (1 = abusive, 0 = not abusive).
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'promblems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Build the list of unique words seen across all documents.

    Parameters:
        dataSet: iterable of tokenized documents.
    Returns:
        list of the distinct words (order follows set iteration).
    """
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)
    return list(vocab)


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 word-presence vector over vocabList.

    Parameters:
        vocabList: list of known words.
        inputSet: tokenized document to encode.
    Returns:
        list of 0/1 flags, one per vocabList entry.
    """
    vec = [0] * len(vocabList)
    for word in inputSet:
        # EAFP: look the word up; unknown words just trigger a warning.
        try:
            vec[vocabList.index(word)] = 1
        except ValueError:
            print("the word: %s is not in my Vocabulary!"%word)
    return vec

训练算法:从词向量计算概率

def trainNB0(trainMatrix, trainCategory):
    """Naive Bayes trainer: estimate log word probabilities per class.

    Parameters:
        trainMatrix: 2-D array of 0/1 document word vectors.
        trainCategory: 1-D array of 0/1 class labels, one per document.
    Returns:
        p0Vect, p1Vect: per-word log conditional probabilities for
            class 0 and class 1 respectively.
        pAbusive: prior probability of class 1.
    """
    # Fix: this script never imports numpy at module level (the only
    # `import numpy as np` is local to testingNB), so a bare global `np`
    # raised NameError; import it here.
    import numpy as np

    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = np.sum(trainCategory) / numTrainDocs
    # Laplace smoothing: start counts at 1 and denominators at 2 so no
    # word ever gets an estimated probability of exactly 0.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += np.sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += np.sum(trainMatrix[i])
    # Work in log space so later products of many small probabilities
    # become sums and do not underflow.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

测试算法:根据现实情况修改分类器

def classifyNB(vec2Classify,p0Vect,p1Vect,pClass1):    '''函数classifyNB是贝叶斯分类函数    parameters:vec2Classify:想测试的词汇list    p0Vect,p1Vect:分别是在分类0,1情况下的条件概况list    pClass1:分类为1的概率    return:最后分类    '''    p1 = np.sum(vec2Classify*p1Vect)+np.log(pClass1)    p0 = np.sum(vec2Classify*p0Vect)+np.log(1-pClass1)    if p1>p0:        return 1    else:        return 0def testingNB():    '''函数testingNB利用之前的所有函数对新的词汇list进行分类    parameters: 无    return:print输出    '''    import numpy as np    dataSet,classVec = loadDataSet()    myvocabList = createVocabList(dataSet)    trainMat=[]    for postinDoc in dataSet:        trainMat.append(setOfWords2Vec(myvocabList,postinDoc))    p0V,p1V,pAb = trainNB0(np.array(trainMat),np.array(classVec))    testEntry = ['love','my','dalmation']    thisDoc = np.array(setOfWords2Vec(myvocabList,testEntry))    print(testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb))    testEntry = ['stupid','garbage']    thisDoc = np.array(setOfWords2Vec(myvocabList,testEntry))    print(testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb))

最后的测试

# Script entry point: run the toy-corpus demo when executed directly.
if __name__ == "__main__":    testingNB()

这里写图片描述

测试算法:使用朴素贝叶斯进行交叉验证

import bayes
import numpy as np
# Fix: the `imp` module is deprecated; importlib.reload is the supported API.
from importlib import reload

reload(bayes)


def textParse(bigString):
    """Split a raw text blob into lowercase tokens longer than 2 chars.

    Parameters:
        bigString: full text of a file (e.g. an email body).
    Returns:
        list of lowercase tokens with length > 2.
    """
    import re
    # Fix: r'\W+' instead of r'\W*' — a pattern that can match the empty
    # string makes re.split cut between every character (Python >= 3.7),
    # destroying the tokens.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Hold-out validation of naive Bayes on the spam/ham email corpus.

    Reads email/spam/1..25.txt and email/ham/1..25.txt, randomly holds
    out 10 of the 50 documents as a test set, trains on the rest, and
    prints the test error rate.

    Parameters: none.
    Returns: None (prints the error rate).
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # Fix: `with` closes each file; the original leaked 50 handles
        # via open(...).read() with no close().
        with open("email/spam/%d.txt" % i) as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        with open("email/ham/%d.txt" % i) as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    # Randomly move 10 document indices from the training set to the
    # held-out test set.
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is:", errorCount / len(testSet))


spamTest()

输出:

这里写图片描述

以上过程重复多次,比如说10次,然后求平均值,获取平均错误率

原创粉丝点击