手写朴素贝叶斯文本分类
来源:互联网 发布:mac磁盘空间不足在哪看 编辑:程序博客网 时间:2024/05/17 04:17
# -*- coding: utf-8 -*-
"""Hand-written naive Bayes text classification.

Trains a 10-class naive Bayes model on pre-segmented (space-separated
token) text files laid out as F:/train_data/<class>/<doc>, selects a small
vocabulary per class via TF-IDF, then evaluates the model (on the training
set itself) and prints a confusion matrix plus per-class precision /
recall / F-measure.
"""
import os

from numpy import array, log, ones

# Class order: index i in every probability vector and confusion-matrix row.
# The first nine are matched by folder name; any other folder falls into the
# last ("sexual") slot, mirroring the original if/elif chains.
CLASS_NAMES = ['baby', 'car', 'food', 'health', 'legend',
               'life', 'love', 'news', 'science', 'sexual']

TRAIN_DATA_DIR = 'F:/train_data'   # NOTE(review): hard-coded Windows path
VOCAB_FILE = 'F:/myVocab.txt'


def _read_tokens(path):
    """Read one pre-segmented document: space-separated tokens, any number of lines."""
    tokens = []
    with open(path, 'r', encoding='utf-8') as fp:
        for line in fp:
            tokens.extend(line.strip().split(' '))
    return tokens


def get_traindata():
    """Scan the training directory and build the training inputs.

    Returns:
        sum_list:   list of token lists, one per training document.
        sum_dict:   folder name -> 0/1 membership vector over all documents.
        vocab_list: vocabulary selected by keeping, for each class, every
                    term whose TF-IDF weight is >= the 6th-largest weight
                    of that class (roughly the top-6 terms per class).
    """
    # sklearn is only needed here; import locally so the rest of the module
    # can be used without it.
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    folder_list = os.listdir(TRAIN_DATA_DIR)
    sum_list = []       # every document as a token list, in folder order
    corpus = []         # one space-joined string per class, for TF-IDF
    class_sizes = []    # (folder name, document count) in the same order
    print('正在生成训练集总文本向量...')
    # Single pass over the files (the original read every file twice).
    for folder in folder_list:
        class_dir = os.path.join(TRAIN_DATA_DIR, str(folder))
        files = os.listdir(class_dir)
        class_tokens = []
        for name in files:
            tokens = _read_tokens(os.path.join(class_dir, name))
            sum_list.append(tokens)
            class_tokens.extend(tokens)
        # Join with spaces; str(list) (as in the original) would also feed
        # brackets, quotes and commas to the vectorizer.
        corpus.append(' '.join(class_tokens))
        class_sizes.append((str(folder), len(files)))
    print(len(sum_list))
    print('生成完毕!')

    # Membership vectors: class c owns a contiguous slice of sum_list.
    sum_num = len(sum_list)
    sum_dict = {}
    past_num = 0
    for folder, count in class_sizes:
        sum_dict[folder] = ([0] * past_num + [1] * count
                            + [0] * (sum_num - past_num - count))
        past_num += count

    print('正在生成词典...')
    vectorizer = CountVectorizer()
    tfidf = TfidfTransformer().fit_transform(vectorizer.fit_transform(corpus))
    try:
        words = vectorizer.get_feature_names_out()   # sklearn >= 1.0
    except AttributeError:
        words = vectorizer.get_feature_names()       # older sklearn
    weight = tfidf.toarray()
    keep = set()
    for row in weight:
        ranked = sorted(row, reverse=True)           # don't shadow builtin `list`
        # Threshold at the 6th-largest weight (guard tiny vocabularies).
        threshold = ranked[5] if len(ranked) > 5 else ranked[-1]
        for j, w in enumerate(row):
            if w >= threshold:
                keep.add(j)
    vocab_list = [words[j] for j in keep]
    print('生成的词典为:%s' % str(vocab_list))
    with open(VOCAB_FILE, 'w', encoding='utf-8') as vocab_file:
        vocab_file.write(str(vocab_list))
    return sum_list, sum_dict, vocab_list


def createVocablist(dataSet):
    """Return the deduplicated union of all words across the documents."""
    vocabSet = set()
    for document in dataSet:
        vocabSet |= set(document)
    return list(vocabSet)


def setOfWords2Vec(vocablist, inputSet):
    """Set-of-words model: 1 where a vocabulary word occurs in the document.

    Words outside the vocabulary are silently ignored.
    """
    # O(1) dict lookups instead of the original O(n) list.index per word.
    position = {word: i for i, word in enumerate(vocablist)}
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        i = position.get(word)
        if i is not None:
            returnVec[i] = 1
    return returnVec


def trainNBO(trainMatrix, sum_dic):
    """Train the naive Bayes model.

    Args:
        trainMatrix: 2-D numpy array, one set-of-words row per document.
        sum_dic:     folder name -> 0/1 membership vector (from get_traindata);
                     must contain the first nine CLASS_NAMES as keys.

    Returns:
        p0Vect..p9Vect:   per-class log P(word|class) vectors (Laplace smoothed)
        pClass0..pClass9: per-class priors, both in CLASS_NAMES order.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    named = CLASS_NAMES[:9]            # classes keyed explicitly in sum_dic
    # Laplace smoothing: counts start at 1, denominators at the vocab size.
    nums = [ones(numWords) for _ in range(10)]
    denoms = [float(numWords)] * 10
    doc_counts = [0] * 10
    for i in range(numTrainDocs):
        k = 9                          # default: the tenth ("sexual") class
        for c, name in enumerate(named):
            if sum_dic[name][i] == 1:
                k = c
                break
        nums[k] = nums[k] + trainMatrix[i]
        denoms[k] += sum(trainMatrix[i])
        doc_counts[k] += 1
    vects = [log(nums[k] / denoms[k]) for k in range(10)]
    # Priors derived from the membership vectors themselves; the original
    # re-listed the training directory on disk for the same numbers.
    priors = [doc_counts[k] / float(numTrainDocs) for k in range(10)]
    return (*vects, *priors)


def classify(vec2Classify, p0Vec, p1Vec, p2Vec, p3Vec, p4Vec, p5Vec, p6Vec,
             p7Vec, p8Vec, p9Vec, pClass0, pClass1, pClass2, pClass3, pClass4,
             pClass5, pClass6, pClass7, pClass8, pClass9):
    """Return the CLASS_NAMES entry with the highest posterior log-score."""
    vecs = [p0Vec, p1Vec, p2Vec, p3Vec, p4Vec,
            p5Vec, p6Vec, p7Vec, p8Vec, p9Vec]
    priors = [pClass0, pClass1, pClass2, pClass3, pClass4,
              pClass5, pClass6, pClass7, pClass8, pClass9]
    scores = [sum(vec2Classify * vec) + log(prior)
              for vec, prior in zip(vecs, priors)]
    # max() keeps the first of equal scores, matching the original elif chain.
    best = max(range(10), key=lambda k: scores[k])
    return CLASS_NAMES[best]


def Result(str1, str2, r):
    """Record one outcome in confusion matrix r.

    Row = true class str1, column = predicted class str2; any name not in
    the first nine CLASS_NAMES maps to the last class, as in the original.
    """
    index = {name: k for k, name in enumerate(CLASS_NAMES[:9])}
    r[index.get(str1, 9)][index.get(str2, 9)] += 1
    return r


def Output_result(r):
    """Print per-class precision / recall / F-measure and overall accuracy.

    r is the 10x10 confusion matrix (rows = true class, cols = predicted).
    Fixes the original bug where precision always divided by sum(r[0]), and
    computes precision over predicted (column) totals and recall over true
    (row) totals, guarding empty classes against division by zero.
    """
    n = len(CLASS_NAMES)
    row_sums = [sum(r[i]) for i in range(n)]                      # true counts
    col_sums = [sum(r[j][i] for j in range(n)) for i in range(n)] # predicted counts
    sum_right = sum(r[i][i] for i in range(n))
    sum_all = sum(row_sums)
    p_rate = [0.0] * n
    r_rate = [0.0] * n
    f_meas = [0.0] * n
    for i in range(n):
        p_rate[i] = r[i][i] / col_sums[i] if col_sums[i] else 0.0
        r_rate[i] = r[i][i] / row_sums[i] if row_sums[i] else 0.0
        denom = p_rate[i] + r_rate[i]
        f_meas[i] = (2 * p_rate[i] * r_rate[i]) / denom if denom else 0.0
    for i, name in enumerate(CLASS_NAMES):
        print('%s类的分类正确率为:%.2f%%,召回率为:%.2f%%,F-Measure为:%.2f'
              % (name, p_rate[i] * 100, r_rate[i] * 100, f_meas[i] * 100))
    # sum/10 * 100 == sum * 10: the macro average as a percentage.
    print('平均准确率为:%.2f%%' % (sum(p_rate) * 10))
    print('平均召回率为:%.2f%%' % (sum(r_rate) * 10))
    if sum_all:
        print('总的分类正确率为:%d%%' % ((sum_right / sum_all) * 100))


def Training():
    """Train on F:/train_data and evaluate — on the training set itself."""
    sum_list, sum_dict, myVocabList = get_traindata()
    print('维度为:%d' % len(myVocabList))
    print(len(sum_list))
    trainMat = [setOfWords2Vec(myVocabList, doc) for doc in sum_list]
    params = trainNBO(array(trainMat), sum_dict)
    r = [[0] * 10 for _ in range(10)]
    folder_list = os.listdir(TRAIN_DATA_DIR)
    for folder in folder_list:
        print('正在对%s类文本进行分类......' % folder)
        class_dir = os.path.join(TRAIN_DATA_DIR, str(folder))
        for file in os.listdir(class_dir):
            tokens = _read_tokens(os.path.join(class_dir, file))
            test_vec = setOfWords2Vec(myVocabList, tokens)
            test_result = classify(array(test_vec), *params)
            r = Result(folder, test_result, r)
        print('对%s类文本分类完毕!' % folder)
    print(*r, sep='\n')
    Output_result(r)


if __name__ == '__main__':
    Training()
阅读全文
0 0
- 手写朴素贝叶斯文本分类
- 朴素贝叶斯文本分类
- 朴素贝叶斯文本分类过程
- 朴素贝叶斯文本分类
- 朴素贝叶斯文本分类
- Mahout朴素贝叶斯文本分类
- 朴素贝叶斯文本分类算法
- 朴素贝叶斯文本分类算法
- 朴素贝叶斯文本分类算法
- 朴素贝叶斯文本分类
- 朴素贝叶斯文本分类应用
- 朴素贝叶斯文本分类
- 朴素贝叶斯文本分类算法java实现
- 朴素贝叶斯文本分类算法java实现(二)
- 朴素贝叶斯文本分类算法源代码
- 朴素贝叶斯文本分类java实现
- 利用Python实现朴素贝叶斯文本分类
- 使用sklearn实现朴素贝叶斯文本分类
- BaseActivity类
- Coursera Machine Learning ex2第三周 week3编程全套满分题目+注释(包括选做optional)
- Unity3D
- USB通信开发资料搜集
- 【C++】两个字符串相加
- 手写朴素贝叶斯文本分类
- session钝化与活化
- AI GOD:1012: Attack
- 算法工程师~~修炼之道
- merge关键字
- iOS MRC 下 block 循环引用问题
- 关于cmake qmake make makeile之间的关系问题
- STM32F0 Slave I2C配置
- js加密