Using the LDA topic model

The script below pulls news text from MySQL, segments it with jieba and removes stop words, builds a TF-IDF matrix, fits an LDA model with scikit-learn, and prints the top keywords of each topic.

import pymysql
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import jieba
import lda
import numpy as np


def mysql_run_sql(sql):
    db = pymysql.connect(
        host='***.18.***.51',
        port=3306,
        user='root',
        password='****',
        database='****',
        charset='utf8',
    )
    cursor = db.cursor()
    cursor.execute(sql)
    data = cursor.fetchall()
    # close the database connection
    db.close()
    return data


# load the stop-word list
stopwords = open('D:\****\stop_words.txt', 'r', encoding='utf-8').readlines()
stops = [stopword.strip() for stopword in stopwords]

content = mysql_run_sql("SELECT c_content FROM math_compute.news_result_02")
# atl_list = json_to_list(content)


def text_to_words(text):
    # segment with jieba and drop stop words
    words = jieba.lcut(str(text).strip())
    meaningful_words = [w for w in words if w not in stops]
    return ' '.join(meaningful_words)


# fetchall() returns one tuple per row; take the c_content column of each row
clean_content = [text_to_words(row[0]) for row in content]

# extract the 1000 most important feature keywords from the text
# vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None,
#                              token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 1), max_features=None)
# tf = vectorizer.fit_transform(clean_content)
n_features = 1000
tf_vectorizer = TfidfVectorizer(strip_accents='unicode', max_features=n_features, stop_words='english')
tf = tf_vectorizer.fit_transform(clean_content)

# define the number of topics
# (note: this variable name shadows the imported lda package,
#  and scikit-learn >= 0.21 renames n_topics to n_components)
n_topics = 1
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=50, learning_method='online',
                                learning_offset=50, random_state=0)
lda.fit(tf)

# n_topics = 5
# model = lda.LDA(n_topics=n_topics, n_iter=500, random_state=1)
# model.fit(tf)

'''
# topic-word distribution
topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))
print(clean_content[:3])
print(topic_word[:, :3])
for n in range(5):
    sum_pr = sum(topic_word[n, :])
    print("topic: {} sum: {}".format(n, sum_pr))

# compute the Top-N words of each topic
n = 5
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(tf)[np.argsort(topic_dist)][:-(n + 1):-1]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))
'''


# print the top keywords of each topic
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


n_top_words = 20
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)
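
For reference, here is a minimal self-contained sketch of the same pipeline using the names in current scikit-learn (n_components instead of n_topics, get_feature_names_out() instead of get_feature_names()). The sample documents and the topic count are placeholders for illustration, not the data or settings of the script above.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# placeholder documents; in the script above these come from MySQL and jieba segmentation
docs = [
    "machine learning topic model text mining",
    "stock market finance news report earnings",
    "football match score team player league",
]

tf_vectorizer = TfidfVectorizer(max_features=1000)
tf = tf_vectorizer.fit_transform(docs)

# scikit-learn >= 0.21 uses n_components for the number of topics
model = LatentDirichletAllocation(n_components=2, max_iter=50,
                                  learning_method='online', random_state=0)
doc_topic = model.fit_transform(tf)  # document-topic distribution, shape (n_docs, n_topics)

# top words per topic, analogous to print_top_words above
feature_names = tf_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(model.components_):
    top = [feature_names[i] for i in topic.argsort()[:-6:-1]]
    print("Topic #%d: %s" % (topic_idx, " ".join(top)))

print(np.round(doc_topic, 3))  # per-document topic weights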
