Bayesian News Classification Demo (Python)

```python
# pip install jieba
import pandas as pd
import jieba
```

Data source: http://www.sogou.com/labs/resource/ca.php

```python
df_news = pd.read_table('./data/val.txt', names=['category', 'theme', 'URL', 'content'], encoding='utf-8')
df_news = df_news.dropna()
df_news.head()
```


```python
df_news.shape
```

(5000, 4)

Word segmentation: using the jieba tokenizer
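
As a quick illustration of what the tokenizer returns (a toy sentence, not from the dataset):

```python
import jieba

# Minimal example: jieba.lcut returns a plain Python list of tokens
print(jieba.lcut("我爱北京天安门"))  # expected output along the lines of ['我', '爱', '北京', '天安门']
```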

```python
content = df_news.content.values.tolist()
print(content[1000])

content_S = []
for line in content:
    current_segment = jieba.lcut(line)
    if len(current_segment) > 1 and current_segment != '\r\n':  # skip newline-only lines
        content_S.append(current_segment)

content_S[1000]
```


```python
df_content = pd.DataFrame({'content_S': content_S})
df_content.head()
```


```python
# Stopword list
stopwords = pd.read_csv("stopwords.txt", index_col=False, sep="\t", quoting=3, names=['stopword'], encoding='utf-8')
stopwords.head(20)
```


```python
# Remove stopwords
def drop_stopwords(contents, stopwords):
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
            all_words.append(str(word))
        contents_clean.append(line_clean)
    return contents_clean, all_words

contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()
contents_clean, all_words = drop_stopwords(contents, stopwords)

df_content = pd.DataFrame({'contents_clean': contents_clean})
df_content.head()
```


```python
df_all_words = pd.DataFrame({'all_words': all_words})
df_all_words.head()
```


```python
# Word frequency counts (the dict form of .agg used in older pandas has been removed, so use size())
words_count = df_all_words.groupby('all_words').size().reset_index(name='count')
words_count = words_count.sort_values(by='count', ascending=False)
words_count.head()
```


```python
# Build a word cloud
# pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

wordcloud = WordCloud(font_path="./data/simhei.ttf", background_color="white", max_font_size=80)
word_frequence = {x[0]: x[1] for x in words_count.head(100).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
```


TF-IDF: keyword extraction

```python
import jieba.analyse

index = 2400
print(df_news['content'][index])
content_S_str = "".join(content_S[index])
print("  ".join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))  # take the top five words as keywords
```

These five keywords are essentially enough to serve as a summary of the article.
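
For context, the score behind jieba.analyse.extract_tags is the usual TF-IDF weight: a word ranks high when it appears often in this article but rarely across the corpus. One common formulation is shown below (notation mine; jieba actually ships a precomputed IDF table rather than computing it from this corpus):

$$\mathrm{tfidf}(t, d) = \mathrm{tf}(t, d) \cdot \log\frac{N}{\mathrm{df}(t)}$$

where tf(t, d) is the frequency of term t in document d, df(t) is the number of documents containing t, and N is the total number of documents.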

LDA: topic model

Input format requirement: the whole segmented corpus as a list of lists of tokens.
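
A tiny made-up example of that structure, together with the bag-of-words conversion gensim expects:

```python
from gensim import corpora

# Toy corpus in the required list-of-lists-of-tokens format (made-up documents)
toy_docs = [
    ["汽车", "发动机", "测试"],
    ["股市", "上涨", "汽车", "销量"],
]
toy_dictionary = corpora.Dictionary(toy_docs)
print(toy_dictionary.doc2bow(toy_docs[0]))  # [(token_id, count), ...]
```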

```python
from gensim import corpora, models, similarities
import gensim
# http://radimrehurek.com/gensim/

# Build the token-id mapping; essentially a bag-of-words representation
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]

lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)  # like K-means, the number of topics is chosen manually

# Result for topic #1
print(lda.print_topic(1, topn=5))
```

0.005*"孩子" + 0.004*"说" + 0.004*"女人" + 0.004*"中" + 0.003*"於"

```python
for topic in lda.print_topics(num_topics=20, num_words=5):
    print(topic[1])
```
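
Besides listing each topic's top words, the model can also report which topics a single document falls under; a minimal sketch reusing the lda model and corpus built above:

```python
# Topic distribution of the first document: a list of (topic_id, probability) pairs
doc_topics = lda.get_document_topics(corpus[0])
print(sorted(doc_topics, key=lambda pair: -pair[1])[:3])  # three most likely topics
```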


```python
df_train = pd.DataFrame({'contents_clean': contents_clean, 'label': df_news['category']})
df_train.tail()
```


```python
df_train.label.unique()
```

array(['汽车', '财经', '科技', '健康', '体育', '教育', '文化', '军事', '娱乐', '时尚'], dtype=object)

```python
label_mapping = {"汽车": 1, "财经": 2, "科技": 3, "健康": 4, "体育": 5, "教育": 6, "文化": 7, "军事": 8, "娱乐": 9, "时尚": 0}
df_train['label'] = df_train['label'].map(label_mapping)
df_train.head()
```


```python
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df_train['contents_clean'].values, df_train['label'].values, random_state=1)
x_train[0][1]
```

'上海'

```python
# Join each tokenized training document back into one space-separated string
words = []
for line_index in range(len(x_train)):
    try:
        words.append(' '.join(x_train[line_index]))
    except Exception:
        print(line_index)
words[0]
```


```python
print(len(words))
```

3750

```python
from sklearn.feature_extraction.text import CountVectorizer

texts = ["dog cat fish", "dog cat cat", "fish bird", "bird"]
cv = CountVectorizer()
cv_fit = cv.fit_transform(texts)
print(cv.get_feature_names_out())  # get_feature_names() was removed in newer scikit-learn
print(cv_fit.toarray())
print(cv_fit.toarray().sum(axis=0))
```


```python
from sklearn.feature_extraction.text import CountVectorizer

texts = ["dog cat fish", "dog cat cat", "fish bird", "bird"]
cv = CountVectorizer(ngram_range=(1, 4))
cv_fit = cv.fit_transform(texts)
print(cv.get_feature_names_out())
print(cv_fit.toarray())
print(cv_fit.toarray().sum(axis=0))
```


```python
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(analyzer='word', max_features=4000, lowercase=False)
vec.fit(words)

from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(vec.transform(words), y_train)
```

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

```python
# Apply the same joining step to the test documents
test_words = []
for line_index in range(len(x_test)):
    try:
        test_words.append(' '.join(x_test[line_index]))
    except Exception:
        print(line_index)
test_words[0]
```


```python
classifier.score(vec.transform(test_words), y_test)
```

0.804
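
Accuracy alone hides per-class behavior. A hedged sketch of a more detailed evaluation for the count-vector model, using scikit-learn's classification_report on the same test split:

```python
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the CountVectorizer + MultinomialNB model
y_pred = classifier.predict(vec.transform(test_words))
print(classification_report(y_test, y_pred))
```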

```python
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='word', max_features=4000, lowercase=False)
vectorizer.fit(words)
```


```python
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(vectorizer.transform(words), y_train)
```

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

```python
classifier.score(vectorizer.transform(test_words), y_test)
```

0.8152
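
To classify a previously unseen article with the TF-IDF model, the same preprocessing as for the training data has to be applied first (segment, drop stopwords, join with spaces). A minimal sketch under that assumption; the sample headline is made up:

```python
# Hypothetical new article (made-up text)
new_text = "国足在世界杯预选赛中以二比零战胜对手"

# Same preprocessing chain as the training data: segment, remove stopwords, join with spaces
seg = [w for w in jieba.lcut(new_text) if w not in stopwords]
features = vectorizer.transform([' '.join(seg)])

# Predict the numeric label and map it back to the category name
pred = classifier.predict(features)[0]
inverse_mapping = {v: k for k, v in label_mapping.items()}
print(inverse_mapping[pred])
```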
