手写朴素贝叶斯文本分类

来源:互联网 发布:mac磁盘空间不足在哪看 编辑:程序博客网 时间:2024/05/17 04:17
"""Hand-written naive Bayes text classifier over 10 categories.

Documents are pre-tokenised (whitespace-separated) text files laid out as
F:/train_data/<category>/<file>.  The pipeline:

1. ``get_traindata``  - read every document, build a TF-IDF-pruned vocabulary.
2. ``setOfWords2Vec`` - set-of-words (0/1 presence) vectors.
3. ``trainNBO``       - multinomial NB with Laplace smoothing (log space).
4. ``classify``       - argmax of posterior log-probability.
5. ``Training``       - trains and then evaluates on the same documents,
                        printing a confusion matrix and per-class metrics.
"""

import os

from numpy import array, log, ones

# Fixed category order used by trainNBO's return tuple, classify and the
# confusion matrix.  Must match the folder names under TRAIN_DIR; any folder
# name not listed here is treated as the last ('sexual') class, matching the
# original else-branches.
CLASSES = ['baby', 'car', 'food', 'health', 'legend', 'life',
           'love', 'news', 'science', 'sexual']

TRAIN_DIR = 'F:/train_data'


def _read_tokens(file_path):
    """Return all whitespace-separated tokens of one pre-segmented file."""
    tokens = []
    with open(file_path, 'r', encoding='utf-8') as fp:
        for line in fp:
            tokens.extend(line.strip().split(' '))
    return tokens


def get_traindata():
    """Read the training tree and build the pruned vocabulary.

    Returns:
        sum_list:   list of documents, each a list of tokens.
        sum_dict:   {category folder name: 0/1 membership vector over all
                     documents, in document order}.
        vocab_list: vocabulary kept after per-category TF-IDF pruning
                    (roughly the top-6 weighted words of each category).
    """
    # Imported locally so the module can be imported without sklearn.
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    folder_list = os.listdir(TRAIN_DIR)
    sum_list = []        # every document as a token list
    corpus = []          # one space-joined string per category, for TF-IDF
    folder_sizes = []    # (folder name, number of documents) in read order
    print('正在生成训练集总文本向量...')
    # Single pass over the tree (the original read every file twice).
    for folder in folder_list:
        folder_path = TRAIN_DIR + '/' + str(folder)
        files = os.listdir(folder_path)
        class_tokens = []
        for name in files:
            tokens = _read_tokens(folder_path + '/' + name)
            sum_list.append(tokens)
            class_tokens.extend(tokens)
        corpus.append(' '.join(class_tokens))
        folder_sizes.append((str(folder), len(files)))
    print(len(sum_list))
    print('生成完毕!')

    # Membership vector per category: 1 for the contiguous run of documents
    # that belong to it, 0 elsewhere.
    sum_num = len(sum_list)
    sum_dict = {}
    past_num = 0
    for folder, count in folder_sizes:
        sum_dict[folder] = ([0] * past_num + [1] * count
                            + [0] * (sum_num - past_num - count))
        past_num += count

    print('正在生成词典...')
    vectorizer = CountVectorizer()
    tfidf = TfidfTransformer().fit_transform(vectorizer.fit_transform(corpus))
    # NOTE(review): renamed get_feature_names_out() on sklearn >= 1.0;
    # kept the original call — confirm against the installed version.
    words = vectorizer.get_feature_names()
    weight = tfidf.toarray()
    kept = set()
    for row in weight:
        # Keep every word at least as heavy as the 6th-largest weight in
        # this category (guarding against tiny vocabularies).
        ranked = sorted(row, reverse=True)
        threshold = ranked[min(5, len(ranked) - 1)]
        for j in range(len(words)):
            if row[j] >= threshold:
                kept.add(j)
    vocab_list = [words[j] for j in kept]

    print('生成的词典为:%s' % str(vocab_list))
    with open('F:/myVocab.txt', 'w') as vocab_file:
        vocab_file.write(str(vocab_list))
    # Total document list, per-category membership vectors, vocabulary.
    return sum_list, sum_dict, vocab_list


def createVocablist(dataSet):
    """Return the deduplicated word list over all documents in dataSet."""
    vocabSet = set()
    for document in dataSet:
        vocabSet |= set(document)
    return list(vocabSet)


def setOfWords2Vec(vocablist, inputSet):
    """Return the 0/1 set-of-words presence vector of inputSet over vocablist.

    Uses a word->index map so each lookup is O(1) (the original called
    list.index per word, which is quadratic over a document).
    """
    index = {}
    for i, word in enumerate(vocablist):
        index.setdefault(word, i)   # keep first occurrence, like list.index
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        pos = index.get(word)
        if pos is not None:
            returnVec[pos] = 1
    return returnVec


def trainNBO(trainMatrix, sum_dic):
    """Train multinomial naive Bayes with Laplace smoothing.

    Args:
        trainMatrix: 2-D array of set-of-words vectors, one row per document.
        sum_dic:     category -> 0/1 membership vector (from get_traindata).

    Returns:
        20-tuple: ten log-probability word vectors followed by ten class
        priors, both in CLASSES order (p0Vect..p9Vect, pClass0..pClass9).
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Laplace smoothing: start counts at 1 and denominators at |vocabulary|.
    pNum = [ones(numWords) for _ in range(10)]
    pDenom = [numWords] * 10
    docCount = [0] * 10
    for i in range(numTrainDocs):
        # First membership hit wins; anything unmatched falls into the last
        # class, mirroring the original if/elif/else chain.
        k = 9
        for c, name in enumerate(CLASSES[:9]):
            if sum_dic[name][i] == 1:
                k = c
                break
        docCount[k] += 1
        pNum[k] += trainMatrix[i]
        pDenom[k] += sum(trainMatrix[i])
    # Priors derived from the membership vectors instead of re-reading the
    # directory tree as the original did — same values, no disk access.
    pClass = [docCount[k] / float(numTrainDocs) for k in range(10)]
    pVect = [log(pNum[k] / pDenom[k]) for k in range(10)]
    return tuple(pVect) + tuple(pClass)


def classify(vec2Classify, p0Vec, p1Vec, p2Vec, p3Vec, p4Vec, p5Vec, p6Vec,
             p7Vec, p8Vec, p9Vec, pClass0, pClass1, pClass2, pClass3, pClass4,
             pClass5, pClass6, pClass7, pClass8, pClass9):
    """Return the CLASSES name maximising log P(class) + sum(log P(word|class))."""
    vecs = (p0Vec, p1Vec, p2Vec, p3Vec, p4Vec, p5Vec, p6Vec, p7Vec, p8Vec, p9Vec)
    priors = (pClass0, pClass1, pClass2, pClass3, pClass4, pClass5, pClass6,
              pClass7, pClass8, pClass9)
    scores = [sum(vec2Classify * v) + log(p) for v, p in zip(vecs, priors)]
    # max() returns the first maximum, matching the original if/elif order.
    best = max(range(10), key=lambda k: scores[k])
    return CLASSES[best]


def Result(str1, str2, r):
    """Increment confusion matrix r at (true=str1, predicted=str2); return r.

    Unknown names map to the last class, as in the original else-branches.
    """
    def _idx(name):
        return CLASSES.index(name) if name in CLASSES else 9
    r[_idx(str1)][_idx(str2)] += 1
    return r


def Output_result(r):
    """Print per-class precision / recall / F-measure and overall accuracy.

    r is the 10x10 confusion matrix with rows = true class, cols = predicted.
    BUG FIX: the original divided every class's rate by sum(r[0]) (the first
    row) instead of sum(r[i]).  Zero denominators now yield a 0.0 rate
    instead of raising ZeroDivisionError.
    """
    sum_right = sum(r[i][i] for i in range(10))
    sum_all = sum(sum(row) for row in r)
    p_rate = [0.0] * 10
    r_rate = [0.0] * 10
    f_meas = [0.0] * 10
    for i in range(10):
        row_total = sum(r[i])                        # documents truly in class i
        col_total = sum(r[j][i] for j in range(10))  # documents predicted as i
        p_rate[i] = r[i][i] / row_total if row_total else 0.0
        r_rate[i] = r[i][i] / col_total if col_total else 0.0
        denom = p_rate[i] + r_rate[i]
        f_meas[i] = (2 * p_rate[i] * r_rate[i]) / denom if denom else 0.0
    for i, name in enumerate(CLASSES):
        print('%s类的分类正确率为:%.2f%%,召回率为:%.2f%%,F-Measure为:%.2f'
              % (name, p_rate[i] * 100, r_rate[i] * 100, f_meas[i] * 100))
    # sum/10 * 100 == sum * 10
    print('平均准确率为:%.2f%%' % (sum(p_rate) * 10))
    print('平均召回率为:%.2f%%' % (sum(r_rate) * 10))
    print('总的分类正确率为:%d%%' % ((sum_right / sum_all) * 100) if sum_all
          else '总的分类正确率为:0%')


def Training():
    """Train on F:/train_data, then classify the same documents and report."""
    sum_list, sum_dict, myVocabList = get_traindata()
    print('维度为:%d' % len(myVocabList))
    print(len(sum_list))
    trainMat = [setOfWords2Vec(myVocabList, doc) for doc in sum_list]
    # 20-tuple: p0Vect..p9Vect, pClass0..pClass9 — exactly classify's tail args.
    params = trainNBO(array(trainMat), sum_dict)
    r = [[0] * 10 for _ in range(10)]
    for folder in os.listdir(TRAIN_DIR):
        print('正在对%s类文本进行分类......' % folder)
        folder_path = TRAIN_DIR + '/' + str(folder)
        for name in os.listdir(folder_path):
            tokens = _read_tokens(folder_path + '/' + name)
            test_vect = setOfWords2Vec(myVocabList, tokens)
            predicted = classify(test_vect, *params)
            r = Result(str(folder), predicted, r)
        print('对%s类文本分类完毕!' % folder)
    print(*r, sep='\n')
    Output_result(r)


if __name__ == '__main__':
    Training()


原创粉丝点击