python使用gensim训练搜狗语料的LDA
来源:互联网 发布:c语言 二维数组编程题 编辑:程序博客网 时间:2024/04/28 13:25
# -*- coding: utf-8 -*-import jieba, osimport codecsfrom gensim import corpora, models, similaritiesfrom pprint import pprintfrom collections import defaultdictimport sysreload(sys)sys.setdefaultencoding('utf-8')def load_data(): walk = os.walk('D:/dev_data/sogou') documents = [] for root, dirs, files in walk: for name in files: raw = codecs.open(os.path.join(root, name), 'r', 'utf-8','ignore').read() documents.append(raw) return documentsdef preprocess(documents): stoplist = codecs.open('tmp/stopword.txt','r',encoding='utf8').readlines() stoplist = set(w.strip() for w in stoplist) #分词,去停用词 texts = [[word for word in list(jieba.cut(document.replace('\t','').replace('\n',''), cut_all = True)) if word not in stoplist] for document in documents] #去除低频词 frequency = defaultdict(int) for text in texts: for token in text: frequency[token] += 1 texts = [[token for token in text if frequency[token] > 2] for text in texts] dictionary = corpora.Dictionary(texts) dictionary.save('tmp/sogou.dict') print(dictionary) corpus = [dictionary.doc2bow(text) for text in texts] corpora.MmCorpus.serialize('tmp/sogou.mm', corpus) return corpus,dictionarydef train_lda(corpus,dictionary): tfidf = models.TfidfModel(corpus) corpus_tfidf = tfidf[corpus] # 模型训练 lda = models.LdaModel(corpus_tfidf, id2word = dictionary, num_topics = 9) #模型的保存/ 加载 lda.save('tmp/sogou_lda.model')def load_lda(): lda = models.ldamodel.LdaModel.load('tmp/sogou_lda.model') for i in range(4): print lda.print_topic(i)def test_lda(): lda_model = models.ldamodel.LdaModel.load('tmp/sogou_lda.model') dictionary = corpora.Dictionary.load('tmp/sogou.dict') corpus = corpora.MmCorpus('tmp/sogou.mm') stoplist = codecs.open('tmp/stopword.txt', 'r', encoding='utf8').readlines() unseen_document = """ 在本赛季的这三场比赛中,骑士三战皆胜。值得一提的是,全场比赛骑士三分线外46投25中,打破NBA常规赛单场比赛单支球队三分球命中数纪录。 """ d = "".join(unseen_document.split()) print "The unseen document is composed by the following text:", unseen_document print text = [word for word in list(jieba.cut(d, cut_all=True)) if word not in stoplist] bow_vector = dictionary.doc2bow(text) for i in range(0, 9): print lda_model.print_topic(i) print lda_model[bow_vector] for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1 * tup[1]): print "Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 3))def print_lda(): lda_model = models.ldamodel.LdaModel.load('tmp/sogou_lda.model') for i in range(0, 9): print i,lda_model.print_topic(i,10) print 0, lda_model.print_topic(0, 10)def train(): documents = load_data() corpus,dictionary = preprocess(documents) train_lda(corpus,dictionary)def test(): #load_lda() #test_lda() dictionary = corpora.Dictionary.load('tmp/sogou.dict') print dictionary[10] print len(dictionary) print dictionarydef test1(): lda = models.ldamodel.LdaModel.load('tmp/sogou_lda.model') test_doc = """ 中华网总经理陈晓薇表示,该公司将在今年首季推出生活频道及重建英语频道,并着手发展与其他国家及知名企业合作的资讯网页,此外在5月份,中华网推出针对内地专业人士的娱乐内容,作为将来3G手机内容的供应来源。(英宁) """ test_doc = list(jieba.cut(test_doc)) # 新文档进行分词 doc_bow = dictionary.doc2bow(test_doc) # 文档转换成bow doc_lda = lda[doc_bow] # 得到新文档的主题分布 # 输出新文档的主题分布 print doc_lda for topic in doc_lda: print "%s\t%f\n" % (lda.print_topic(topic[0]), topic[1])#train()print_lda()
0 0
- python使用gensim训练搜狗语料的LDA
- gensim 中文语料训练 word2vec
- 【python gensim使用】word2vec词向量处理英文语料
- 【python gensim使用】word2vec词向量处理中文语料
- 【python gensim使用】word2vec词向量处理中文语料
- 【python gensim使用】word2vec词向量处理英文语料
- 使用gensim中的lda模型训练主题分布
- LDA的使用记录--gensim库
- Gensim-维基百科中文语料LDA,LSI实验记录
- 初试主题模型LDA-基于python的gensim包
- gensim-lda
- gensim实现python对word2vec的训练和计算
- 基于python的gensim word2vec训练词向量
- gensim实现python对word2vec的训练和计算
- Word2Vec的Python版Gensim的使用
- pyLDAvis基于gensim的LDA模型可视化
- GENSIM 使用笔记1 --- 语料和向量空间
- Python中gensim库word2vec的使用
- 【脚本】Compiler for Script
- 《HBase基本常识及与JAVA交互》
- android:clipChildren妙用:底部的radioGroup中间的button突出
- hdu 1176-免费馅饼
- APP怎么判断自己是处于前台还是后台
- python使用gensim训练搜狗语料的LDA
- 灰度图像覆盖率计算
- SpringMVC Controller的配置方式
- 另外关于tp中用到的$this->遇到的几个标签
- 解决 eclipse logcat不显示信息
- Android Studio 乱码问题
- 【腾讯TMQ】WIFI安全测试,其实不难
- Android Studio 错误: 非法字符: '\ufeff' 解决方案|错误: 需要class, interface或enum
- Xcode_7 iOS_9 ActiveIndicatorView_ProgressView_警告窗_操作表 控件 Objective-C (3)