LDA gensim 主题分析

来源:互联网 发布:济南行知小学位置 编辑:程序博客网 时间:2024/05/18 09:15
# coding: utf-8
"""Train gensim LDA topic models on a pre-tokenised corpus and print topics.

Pipeline, executed top to bottom as a script:

1. Read ``yuliao.txt`` with pandas (UTF-8, no header row); the first column
   of every row is treated as one document.
2. Split each document on single spaces to obtain its token list.
3. Build a gensim ``Dictionary`` and the bag-of-words corpus.
4. Fit a TF-IDF model and train a 34-topic LDA model on the TF-IDF vectors.
5. Train a 20-topic LDA model on the raw bag-of-words corpus and print each
   of its topics as ``<topic id> <topic terms>``.
"""
import sys

import pandas as pd
from gensim import models
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# NOTE(review): read_csv splits rows on commas, so only the text before the
# first comma of a line ends up in column 0 — confirm the corpus file never
# contains commas.
Data = pd.read_csv('yuliao.txt', encoding='utf-8', header=None)

# One token list per document; tokens are separated by a single space.
Listdata = list(Data[0])
train_set = [listi.split(' ') for listi in Listdata]

# Legacy Python 2 workaround: make implicit str/unicode conversions use UTF-8.
# Neither reload() nor sys.setdefaultencoding() exists on Python 3, where the
# default encoding is already UTF-8, so the hack is applied on Python 2 only.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding("utf-8")

print('你好')
print('你好')

# Build the training corpus: token -> id mapping, then one sparse
# bag-of-words vector per document.
dictionary = Dictionary(train_set)
corpus = [dictionary.doc2bow(text) for text in train_set]

# TF-IDF statistics over the corpus and the lazily transformed
# TF-IDF view of every document.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# 34-topic model trained on the TF-IDF vectors (not used further below).
lda2 = LdaModel(corpus_tfidf, id2word=dictionary, num_topics=34)

print(corpus)

# 20-topic model trained on the raw bag-of-words counts.
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)

# print_topics yields (topic id, formatted term string) pairs.
test = lda.print_topics(20)
for i in test:
    # Format into a single string so the output is the same
    # "<id> <terms>" line whether print is a statement (Python 2)
    # or a function (Python 3).
    print('%s %s' % (i[0], i[1]))