贝叶斯新闻分类demo(Python)
来源:互联网 发布:网络数据保险箱 编辑:程序博客网 时间:2024/06/15 03:51
# pip install jieba
import jieba
import pandas as pd
数据源:http://www.sogou.com/labs/resource/ca.php
# Load the tab-separated news sample. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
df_news = pd.read_table(
    './data/val.txt',
    names=['category', 'theme', 'URL', 'content'],
    encoding='utf-8',
).dropna()  # discard every row that has a missing field
df_news.head()
df_news.shape
(5000, 4)
分词:使用结巴(jieba)分词器
# Pull every article body out of the frame as a plain list of strings.
content = df_news.content.values.tolist()
print(content[1000])

# Tokenise each article with jieba; content_S holds one token list per article.
content_S = []
for line in content:
    current_segment = jieba.lcut(line)
    # Keep articles that produce more than one token and contain real text.
    # The original guard compared the token *list* with the string '\r\n',
    # which is never equal, so it filtered nothing; testing the raw text is
    # what actually rejects line-break-only entries.
    # NOTE(review): any article skipped here shifts content_S relative to
    # df_news, and later cells pair the two by position -- confirm that
    # nothing is dropped for the data set in use.
    if len(current_segment) > 1 and line.strip():
        content_S.append(current_segment)
content_S[1000]

df_content = pd.DataFrame({'content_S': content_S})
df_content.head()
# Stop-word list, read into a single column named 'stopword'.
stopwords = pd.read_csv(
    "stopwords.txt",
    sep="\t",
    names=['stopword'],
    index_col=False,
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are ordinary text
    encoding='utf-8',
)
stopwords.head(20)
def drop_stopwords(contents, stopwords):
    """Remove stop words from tokenised documents.

    Args:
        contents: Iterable of documents, each an iterable of tokens.
        stopwords: Iterable of tokens to discard.

    Returns:
        A ``(contents_clean, all_words)`` tuple. ``contents_clean`` holds one
        filtered token list per input document; ``all_words`` is a flat list
        of every kept token (converted with ``str``) in document order.
    """
    # Build the lookup once: testing membership against a list costs
    # O(len(list)) per token, against a set it is O(1).
    stopword_set = set(stopwords)
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = [word for word in line if word not in stopword_set]
        contents_clean.append(line_clean)
        all_words.extend(str(word) for word in line_clean)
    return contents_clean, all_words


contents = df_content.content_S.values.tolist()
# From here on ``stopwords`` is a plain list of words, not a DataFrame.
stopwords = stopwords.stopword.values.tolist()
contents_clean, all_words = drop_stopwords(contents, stopwords)

df_content = pd.DataFrame({'contents_clean': contents_clean})
df_content.head()
df_all_words = pd.DataFrame({'all_words': all_words})
df_all_words.head()

import numpy  # retained from the original cell

# Count how often each token occurs, most frequent first.
# Passing a dict to SeriesGroupBy.agg to name the output column was
# deprecated in pandas 0.20 and raises SpecificationError ("nested renamer
# is not supported") from pandas 1.0 on. size() followed by
# reset_index(name=...) yields the same two columns ('all_words', 'count')
# on every pandas version.
words_count = (
    df_all_words.groupby(by=['all_words'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
)
words_count.head()
#制作词云#pip install wordcloudfrom wordcloud import WordCloudimport matplotlib.pyplot as plt%matplotlib inlineimport matplotlibmatplotlib.rcParams['figure.figsize'] = (10.0, 5.0)wordcloud=WordCloud(font_path="./data/simhei.ttf",background_color="white",max_font_size=80)word_frequence = {x[0]:x[1] for x in words_count.head(100).values}wordcloud=wordcloud.fit_words(word_frequence)plt.imshow(wordcloud)
TF-IDF :提取关键词
import jieba.analyse

index = 2400
# Use a positional lookup. content_S is a plain list indexed by position,
# while df_news keeps the row labels left behind by dropna(); the label-based
# df_news['content'][index] could therefore pick a different article or raise
# KeyError if any row had been dropped.
print(df_news['content'].iloc[index])
content_S_str = "".join(content_S[index])
# Take the five highest-ranked terms as the article's keywords.
print(" ".join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))
这五个关键词基本可以组成这则新闻摘要了
LDA :主题模型
格式要求:list of list形式,即分词好的整个语料
# gensim documentation: http://radimrehurek.com/gensim/
import gensim
from gensim import corpora, models, similarities

# Build the token -> integer id mapping, then turn every document into its
# bag-of-words representation.
dictionary = corpora.Dictionary(contents_clean)
corpus = list(map(dictionary.doc2bow, contents_clean))
# Fit the LDA topic model. As with K in k-means, the number of topics is
# chosen by hand.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
)

# Show the five top-weighted words of topic number 1.
print(lda.print_topic(1, topn=5))
0.005*"孩子" + 0.004*"说" + 0.004*"女人" + 0.004*"中" + 0.003*"於"
# Print the five top-weighted words of each of the 20 topics; print_topics
# yields (topic id, formatted string) pairs and only the string is shown.
for _topic_id, topic_words in lda.print_topics(num_topics=20, num_words=5):
    print(topic_words)
# Pair every cleaned token list with the category of its article.
df_train = pd.DataFrame(
    {
        'contents_clean': contents_clean,
        'label': df_news.category,
    }
)
df_train.tail()
# Distinct category names present in the data.
df_train.label.unique()
array(['汽车', '财经', '科技', '健康', '体育', '教育', '文化', '军事', '娱乐', '时尚'], dtype=object)
# Replace each category name with an integer class id.
label_mapping = {
    "汽车": 1,
    "财经": 2,
    "科技": 3,
    "健康": 4,
    "体育": 5,
    "教育": 6,
    "文化": 7,
    "军事": 8,
    "娱乐": 9,
    "时尚": 0,
}
df_train['label'] = df_train.label.map(label_mapping)
df_train.head()
from sklearn.model_selection import train_test_split

# Split token lists and labels into train and test parts using the default
# proportions; random_state fixes the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df_train.contents_clean.values,
    df_train.label.values,
    random_state=1,
)
# Second token of the first training document.
x_train[0][1]
'上海'
# Join each training document's tokens into one space-separated string, the
# input format the scikit-learn text vectorizers expect.
# The original wrapped the join in a bare ``except`` whose handler printed an
# undefined ``word_index`` (itself a NameError). Skipping a document there
# would also leave ``words`` shorter than ``y_train``. Converting every token
# with str() removes the failure the handler was guarding against and keeps
# documents and labels aligned.
words = [' '.join(str(token) for token in tokens) for tokens in x_train]
words[0]
print(len(words))
3750
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus showing what CountVectorizer produces with default settings.
texts = ["dog cat fish", "dog cat cat", "fish bird", 'bird']
cv = CountVectorizer()
cv_fit = cv.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
try:
    feature_names = cv.get_feature_names_out().tolist()
except AttributeError:
    feature_names = cv.get_feature_names()
print(feature_names)

# Document-term count matrix, then the total count of every term.
print(cv_fit.toarray())
print(cv_fit.toarray().sum(axis=0))
from sklearn.feature_extraction.text import CountVectorizer

# Same toy corpus, now also counting word n-grams of length 1 to 4.
texts = ["dog cat fish", "dog cat cat", "fish bird", 'bird']
cv = CountVectorizer(ngram_range=(1, 4))
cv_fit = cv.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
try:
    feature_names = cv.get_feature_names_out().tolist()
except AttributeError:
    feature_names = cv.get_feature_names()
print(feature_names)

# Document-term count matrix, then the total count of every n-gram.
print(cv_fit.toarray())
print(cv_fit.toarray().sum(axis=0))
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words features restricted to a 4000-term vocabulary, learned from
# the training documents; lowercase=False leaves the tokens unchanged.
vec = CountVectorizer(analyzer='word', max_features=4000, lowercase=False).fit(words)

# Multinomial naive Bayes trained on the term counts of the training set.
classifier = MultinomialNB()
classifier.fit(vec.transform(words), y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
# Join each test document's tokens into one space-separated string, the same
# format used for the training documents.
# The original wrapped the join in a bare ``except`` whose handler printed an
# undefined ``word_index`` (itself a NameError). Skipping a document there
# would also leave ``test_words`` shorter than ``y_test``. Converting every
# token with str() removes that failure and keeps documents and labels aligned.
test_words = [' '.join(str(token) for token in tokens) for tokens in x_test]
test_words[0]

# Mean accuracy of the count-based model on the held-out documents.
classifier.score(vec.transform(test_words), y_test)
0.80400000000000005
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# TF-IDF weighted features over a 4000-term vocabulary, learned from the
# training documents; lowercase=False leaves the tokens unchanged.
vectorizer = TfidfVectorizer(analyzer='word', max_features=4000, lowercase=False).fit(words)

# Retrain multinomial naive Bayes, this time on the TF-IDF features.
classifier = MultinomialNB()
classifier.fit(vectorizer.transform(words), y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
# Mean accuracy of the TF-IDF based model on the held-out documents.
classifier.score(X=vectorizer.transform(test_words), y=y_test)
0.81520000000000004
阅读全文
0 0
- 贝叶斯新闻分类demo(Python)
- 基于朴素贝叶斯的关于互联网金融新闻分类(python实现)
- 贝叶斯案例3:文本关键词提取、新闻分类(python实现)
- 新闻分类系统(Python):爬虫(bs+rq)+数据处理(jieba分词)+分类器(贝叶斯)
- 贝叶斯分类方法学习三 python+jieba+mongodb实现朴素贝叶斯新闻文本自动分类
- 贝叶斯新闻分类器详解
- 朴素贝叶斯-新闻分类
- 新闻demo
- 【Kaggle笔记】新闻文本分类(朴素贝叶斯)
- 新闻分类
- caffe-python接口图片分类demo
- 朴素贝叶斯新闻分类器详解
- 朴素贝叶斯新闻分类器详解
- 朴素贝叶斯新闻分类器详解
- Nutch2.3.1 新闻分类爬虫(借鉴)
- 新闻推荐系统之朴素贝叶斯分类器文本分类
- 网易新闻首页demo
- CMS新闻DEMO
- CS231n (winter 2016) : Assignment2
- Java环境的搭建
- 1069. 微博转发抽奖
- Excel中不使用vba完成借款计算
- 哈夫曼树
- 贝叶斯新闻分类demo(Python)
- 最全Pycharm教程(22)——Pycharm编辑器功能之窗口选项卡管理
- 硬币面值方案
- MongoDB完全教程
- LeetCode.147 Insertion Sort List
- usaco垃圾陷阱解题报告
- 2017京东校招在线编程题——集合
- 二叉树
- C# Winform 跨线程更新UI控件常用方法汇总