Text classification with jieba and Naive Bayes
# Borrowed my boyfriend's code from back in the day; it has all been handed over to me for teaching now. Grateful.
# The corpus is a set of folders, one per class: each folder is named after its class and contains that class's individual documents.
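For illustration, the expected on-disk layout would look something like this (the class names and file names here are hypothetical):

    ./文档/
        体育/            <- one folder per class; the folder name is the class label
            0001.txt
            0002.txt
        财经/
            0001.txt
            0002.txt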
# coding: utf-8
from __future__ import print_function, unicode_literals
import io
import os
import random
import time

import jieba
from sklearn.naive_bayes import MultinomialNB


# Mainly used to load the stop-word list
def MakeWordsSet(words_file):
    words_set = set()
    # io.open works on both Python 2 and 3 and decodes the Chinese text correctly
    with io.open(words_file, 'r', encoding='utf-8') as fp:
        for line in fp:
            word = line.strip()
            if len(word) > 0 and word not in words_set:  # de-duplicate
                words_set.add(word)
    return words_set


def TextProcessing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []
    # loop over classes (one folder per class)
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        # log the current path and timestamp
        print("path =", new_folder_path, time.asctime(time.localtime(time.time())))
        files = os.listdir(new_folder_path)
        # loop over the documents within one class
        for file in files:
            with io.open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as fp:
                raw = fp.read()
            word_cut = jieba.cut(raw, cut_all=False)  # accurate mode; returns an iterable generator
            word_list = list(word_cut)  # materialize the generator; each token is a unicode string
            data_list.append(word_list)
            class_list.append(folder)

    # split into training and test sets
    data_class_list = list(zip(data_list, class_list))
    # random.shuffle returns None: it permutes data_class_list in place
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1  # split position, roughly train:test = 4:1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)

    # count word frequencies into all_words_dict
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict:
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    # sort by frequency in descending order; sorted() takes the list of (word, count) items
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list = list(zip(*all_words_tuple_list))[0]

    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list


def words_dict(all_words_list, deleteN, stopwords_set=set()):
    # select feature words, skipping the deleteN most frequent ones
    feature_words = []
    n = 1
    for t in range(deleteN, len(all_words_list), 1):
        if n > 1500:  # cap feature_words at 1500 dimensions
            break
        # keep words that are not pure digits, not stop words, and 2-4 characters long
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
            n += 1
    return feature_words


def TextFeatures(train_data_list, test_data_list, feature_words):
    # note the nested function here; each document becomes a 0/1 vector over feature_words
    def text_features(text, feature_words):
        text_words = set(text)
        features = [1 if word in text_words else 0 for word in feature_words]
        return features
    train_feature_list = [text_features(text, feature_words) for text in train_data_list]
    test_feature_list = [text_features(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list


def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
    # sklearn classifier; once this is clear, other sklearn estimators can be swapped in here
    classifier = MultinomialNB().fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    return test_accuracy


if __name__ == '__main__':
    print('STARTING TIME :', time.asctime(time.localtime(time.time())))
    # text preprocessing
    folder_path = './文档'
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
    # build stopwords_set
    stopwords_file = './stopwords.txt'
    stopwords_set = MakeWordsSet(stopwords_file)
    # feature extraction and classification
    feature_words = words_dict(all_words_list, 20, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    print('accuracy :', test_accuracy * 100, '%')
    print('ENDING TIME :', time.asctime(time.localtime(time.time())))
    print("finished")
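Before pointing the script at a real corpus, the feature scheme is easy to sanity-check on toy data. Here is a minimal, self-contained sketch of the same 0/1 bag-of-words plus MultinomialNB pipeline; the documents, labels, and feature list below are made up for illustration and are not from the corpus:

    # coding: utf-8
    # Toy version of the pipeline above: binary bag-of-words + MultinomialNB.
    from sklearn.naive_bayes import MultinomialNB

    train_docs = [['足球', '比赛', '胜利'], ['股票', '市场', '上涨'],
                  ['篮球', '比赛', '冠军'], ['基金', '市场', '下跌']]
    train_labels = ['sports', 'finance', 'sports', 'finance']

    # an invented stand-in for feature_words
    feature_words = ['比赛', '市场', '足球', '股票', '篮球', '基金']

    def text_features(text, feature_words):
        # same scheme as TextFeatures above: 1 if the word occurs in the document, else 0
        text_words = set(text)
        return [1 if word in text_words else 0 for word in feature_words]

    X = [text_features(doc, feature_words) for doc in train_docs]
    clf = MultinomialNB().fit(X, train_labels)
    print(clf.predict([text_features(['比赛', '足球'], feature_words)]))  # -> ['sports']

The prediction comes back as ['sports'] because 比赛 and 足球 only co-occur in the sports documents, so their smoothed class-conditional probabilities favor that label.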
I then changed the keyword extraction to jieba's built-in analyse function, still 1500 dimensions. Accuracy improved by a bit over 1%, but the run took much longer, which suggests that jieba's internal TF-IDF keyword extraction has a fairly high time complexity:

feature_words = jieba.analyse.extract_tags(all_words, 1500)

The modified code follows:
# coding: utf-8
from __future__ import print_function, unicode_literals
import io
import os
import random
import time

import jieba
import jieba.analyse
from sklearn.naive_bayes import MultinomialNB


# Mainly used to load the stop-word list (unused in this version, kept for reference)
def MakeWordsSet(words_file):
    words_set = set()
    with io.open(words_file, 'r', encoding='utf-8') as fp:
        for line in fp:
            word = line.strip()
            if len(word) > 0 and word not in words_set:  # de-duplicate
                words_set.add(word)
    return words_set


def TextProcessing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []
    # loop over classes (one folder per class)
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        # log the current path and timestamp
        print("path =", new_folder_path, time.asctime(time.localtime(time.time())))
        files = os.listdir(new_folder_path)
        # loop over the documents within one class
        for file in files:
            with io.open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as fp:
                raw = fp.read()
            word_cut = jieba.cut(raw, cut_all=False)  # accurate mode; returns an iterable generator
            word_list = list(word_cut)  # materialize the generator; each token is a unicode string
            data_list.append(word_list)
            class_list.append(folder)

    # split into training and test sets
    data_class_list = list(zip(data_list, class_list))
    # random.shuffle returns None: it permutes data_class_list in place
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1  # split position, roughly train:test = 4:1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)

    # instead of counting word frequencies, concatenate the training tokens into all_words
    # (joined with spaces so that extract_tags re-segments on clean word boundaries)
    all_words = ''
    for word_list in train_data_list:
        for word in word_list:
            all_words += word + ' '

    return all_words, train_data_list, test_data_list, train_class_list, test_class_list


def TextFeatures(train_data_list, test_data_list, feature_words):
    # each document becomes a 0/1 vector over feature_words, exactly as in the first version
    def text_features(text, feature_words):
        text_words = set(text)
        features = [1 if word in text_words else 0 for word in feature_words]
        return features
    train_feature_list = [text_features(text, feature_words) for text in train_data_list]
    test_feature_list = [text_features(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list


def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
    # sklearn classifier, unchanged from the first version
    classifier = MultinomialNB().fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    return test_accuracy


if __name__ == '__main__':
    print('STARTING TIME :', time.asctime(time.localtime(time.time())))
    # text preprocessing
    folder_path = './文档'
    all_words, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
    # feature extraction and classification: jieba's TF-IDF extractor now picks the
    # top 1500 keywords directly, replacing the manual stop-word filtering and
    # frequency-based selection of the first version
    # previously: feature_words = words_dict(all_words_list, 20, stopwords_set)
    feature_words = jieba.analyse.extract_tags(all_words, topK=1500)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    print('accuracy :', test_accuracy * 100, '%')
    print('ENDING TIME :', time.asctime(time.localtime(time.time())))
    print("finished")
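One caveat of the modified version: the stop-word step is commented out, but jieba can apply a stop-word list inside the TF-IDF extraction itself. A minimal sketch, assuming the same ./stopwords.txt file from the first version is still present:

    import jieba.analyse

    # point jieba's TF-IDF extractor at a custom stop-word file (one word per line),
    # replacing the dropped MakeWordsSet / stopwords_set step from the first version
    jieba.analyse.set_stop_words('./stopwords.txt')
    feature_words = jieba.analyse.extract_tags(all_words, topK=1500)

extract_tags also accepts withWeight=True, which returns (word, weight) pairs and makes it easy to inspect which terms actually carry the TF-IDF signal.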