wordcloud用来制作中文词云

来源:互联网 发布:jre 7u9 windows xp32 编辑:程序博客网 时间:2024/05/16 00:42
# 1. Read the data, drop rows containing NaN, and tokenize with jieba.
df = pd.read_csv("./data/entertainment_news.csv", encoding='utf-8')
df = df.dropna()
content = df.content.values.tolist()

# Optional: load a custom dictionary before segmenting.
# jieba.load_userdict(u"data/user_dic.txt")

# Flat list of every kept token across all rows of the `content` column.
segment = []
for line in content:
    try:
        segs = jieba.lcut(line)
    except Exception:
        # Best-effort: echo the row that could not be segmented and move on.
        # `Exception` (not a bare except) so Ctrl-C still aborts the loop.
        print(line)
        continue
    for seg in segs:
        # Keep only tokens longer than one character; the explicit '\r\n'
        # test drops the two-character line-break token.
        if len(seg) > 1 and seg != '\r\n':
            segment.append(seg)
# 2. Remove stop words.
# The stop-word file is read as a single tab-separated column named
# 'stopword'; quoting=3 (csv.QUOTE_NONE) turns off quote handling so quote
# characters in the file are read literally.
stopwords = pd.read_csv(
    "data/stopwords.txt",
    sep="\t",
    names=['stopword'],
    index_col=False,
    quoting=3,
    encoding='utf-8',
)

# One row per token, then keep only the rows whose token is not a stop word.
words_df = pd.DataFrame({'segment': segment})
is_stopword = words_df.segment.isin(stopwords.stopword)
words_df = words_df[~is_stopword]
words_df
3. 统计计数
# Count how often each token occurs. `size()` + `reset_index(name=...)`
# produces the columns ['segment', '计数'] directly; the older
# `.agg({"计数": numpy.size})` renaming form was removed in pandas 1.0 and
# raises SpecificationError there.
words_stat = words_df.groupby('segment').size().reset_index(name="计数")
# Most frequent tokens first.
words_stat = words_stat.sort_values(by=["计数"], ascending=False)
words_stat.head()
4. 绘图
# Map token -> count for the first 1000 rows of words_stat; each row is read
# positionally (column 0 is the key, column 1 is the value).
word_frequence = {}
for row in words_stat.head(1000).values:
    word_frequence[row[0]] = row[1]

# Configure the cloud with the font file at data/simhei.ttf, then fit it to
# the frequency mapping.
wordcloud = WordCloud(
    font_path="data/simhei.ttf",
    background_color="white",
    max_font_size=80,
)
wordcloud = wordcloud.fit_words(word_frequence)

# Hand the fitted cloud to imshow for display.
plt.imshow(wordcloud)


原创粉丝点击