朴素贝叶斯算法学习笔记(一)

来源:互联网 发布:手机wifi控制器软件 编辑:程序博客网 时间:2024/05/16 10:50
# coding=utf-8
"""Naive Bayes study notes: a tiny bag-of-words classifier for forum posts.

The script builds a vocabulary from six hard-coded posts, converts each
post into a 0/1 word-presence vector, trains per-class word
log-probabilities and classifies two sample posts.
"""
import numpy as np


def loadDataSet():
    """Return the toy training set.

    Returns:
        (postingList, classVec): ``postingList`` is a list of tokenized
        posts, ``classVec`` the matching labels (1 = abusive, 0 = normal).
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Return a list of the unique words found across all documents.

    Args:
        dataSet: iterable of documents, each an iterable of words.

    Returns:
        A list with each distinct word exactly once. The order is the
        iteration order of a ``set`` and therefore not guaranteed.
    """
    vocabSet = set()
    for document in dataSet:
        vocabSet |= set(document)  # union with the words of this document
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a word-presence vector over ``vocabList``.

    Args:
        vocabList: the vocabulary (list of words).
        inputSet: the document (iterable of words).

    Returns:
        A list of ``len(vocabList)`` ints; position ``i`` is 1 when
        ``vocabList[i]`` occurs in the document and 0 otherwise. Words
        missing from the vocabulary are reported on stdout and skipped.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            # The original message never interpolated the word into "%s".
            print("the word:%s is not in my Vocabulary!" % word)
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    """Estimate the naive Bayes parameters from word vectors.

    Args:
        trainMatrix: document matrix, one word vector per row.
        trainCategory: label for each row (1 = abusive, 0 = normal).

    Returns:
        (p0Vect, p1Vect, pAbusive): the per-word *log* conditional
        probabilities for class 0 and class 1, and the prior probability
        of class 1.
    """
    numTrainDocs = len(trainMatrix)  # number of documents
    numWords = len(trainMatrix[0])   # vocabulary size
    pAbusive = np.sum(trainCategory) / float(numTrainDocs)
    # Counts start at 1 and denominators at 2.0 so that a word never seen
    # in a class does not get probability zero.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += np.sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += np.sum(trainMatrix[i])
    # classifyNB adds these values to a log prior, so they must be
    # log-probabilities; summing raw probabilities is not a likelihood.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector using trained log-probabilities.

    Args:
        vec2Classify: word vector of the document to classify.
        p0Vec: per-word log-probabilities for class 0 (from trainNB0).
        p1Vec: per-word log-probabilities for class 1 (from trainNB0).
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 log-posterior is strictly larger, else 0.
    """
    vec = np.asarray(vec2Classify)
    p1 = np.sum(vec * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    return 0


def testingNB():
    """Train on the toy data set and print the class of two sample posts."""
    listOposts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOposts)
    trainMat = []
    for postinDoc in listOposts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        # Single-argument print keeps the output identical on Python 2 and 3.
        print("%s classified as: %s"
              % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))


# --- Demo: prepare the data and build the word vectors ---
listOposts, listClasses = loadDataSet()
print(len(listOposts))
myVocabList = createVocabList(listOposts)
print(myVocabList)
print(setOfWords2Vec(myVocabList, listOposts[3]))

# --- Demo: word vectors for every training post ---
trainMat = []
for postinDoc in listOposts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))

# --- Demo: train and classify ---
testingNB()

# Output recorded in the original post (vocabulary order may differ
# between runs because it comes from a set):
# ['cute', 'love', 'help', 'garbage', 'quit', 'I', 'problems', 'is',
#  'park', 'stop', 'flea', 'dalmation', 'licks', 'food', 'not', 'him',
#  'buying', 'posting', 'has', 'worthless', 'ate', 'to', 'maybe', 'please',
#  'dog', 'how', 'stupid', 'so', 'take', 'mr', 'steak', 'my']
# [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
#  0, 0, 1, 0, 0, 0, 0, 0]
# ['love', 'my', 'dalmation'] classified as: 0
# ['stupid', 'garbage'] classified as: 1
阅读全文
1 0
原创粉丝点击