pythonNLP-文本相似度计算-Demo
来源:互联网 发布:华为游戏防御矩阵 编辑:程序博客网 时间:2024/05/17 03:13
参照博客[我爱自然语言处理]中的“如何计算两个文本的相似度”系列,把代码自己实现了一遍,对整个流程有了整体的了解。本文纯属个人记录,新手想学习可以直接去上面的博客,讲得非常好。
代码:在三篇示例文档上用 gensim 计算基于 LDA 的文本相似度
# -*- coding: utf-8 -*-
"""Toy gensim demo: score three sample sentences against a query with LDA.

Pipeline: whitespace tokenization -> bag-of-words -> (tf-idf) -> LDA ->
cosine similarity of the query against every document in topic space.

The code uses single-argument ``print(...)`` calls and ``except ... :``
forms that are valid on both Python 2 and Python 3.
"""
import traceback

import gensim
from gensim import corpora, models, similarities

documents = [
    "Shipment of gold damaged in a fire",
    "Delivery of silver arrived in a silver truck",
    "Shipment of gold arrived in a truck",
]


def pre_process(documents):
    """Lower-case every document and split it on whitespace.

    Args:
        documents: iterable of raw text strings.

    Returns:
        A list with one token list per document. Any other tokenizer can be
        substituted as long as it produces the same list-of-token-lists shape.

    Raises:
        Whatever the string operations raise (e.g. AttributeError for a
        non-string document). Errors are no longer swallowed: returning None
        only made the caller fail later with an unrelated error.
    """
    documents_token_list = [document.lower().split() for document in documents]
    print("[INFO]: pre_process is finished!")
    return documents_token_list


def tf_idf_trainning(documents_token_list):
    """Build the vocabulary and the tf / tf-idf representations of a corpus.

    Args:
        documents_token_list: list of token lists, one per document.

    Returns:
        A 3-tuple ``(dictionary, corpus_tf, corpus_tfidf)``:

        * ``dictionary`` - the ``corpora.Dictionary`` built from the tokens;
          it records the integer id (position in the vector space) of every
          word, which is needed to project new text into the same space.
        * ``corpus_tf`` - one ``dictionary.doc2bow(...)`` result per document
          (term counts).
        * ``corpus_tfidf`` - ``corpus_tf`` passed through the trained
          ``TfidfModel``.

        Both corpora carry the word ids alongside the values (the original
        author observed that only non-zero entries are emitted), and can be
        fed directly to the LSI / LDA trainers below.

    Raises:
        Any exception raised by gensim propagates to the caller.
    """
    # Map every token of every document onto the shared vector space.
    dictionary = corpora.Dictionary(documents_token_list)
    # Term-frequency representation of each document.
    corpus_tf = [dictionary.doc2bow(token_list) for token_list in documents_token_list]
    # Train tf-idf on the term frequencies, then re-express the corpus with it.
    tf_idf_model = models.TfidfModel(corpus_tf)
    corpus_tfidf = tf_idf_model[corpus_tf]
    print("[INFO]: tf_idf_trainning is finished!")
    return dictionary, corpus_tf, corpus_tfidf


def lsi_trainning(dictionary, corpus_tfidf, K):
    """Train an LSI model with ``K`` topics and project the corpus onto it.

    Args:
        dictionary: the ``corpora.Dictionary`` of the corpus (used as id2word).
        corpus_tfidf: the corpus used as training features.
        K: number of topics.

    Returns:
        ``(lsi_model, corpus_lsi)`` where ``corpus_lsi`` is the training
        corpus expressed in the K-dimensional topic space.

    Raises:
        Any exception raised by gensim propagates to the caller.
    """
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=K)
    corpus_lsi = lsi_model[corpus_tfidf]
    print("[INFO]: lsi_trainning is finished!")
    return lsi_model, corpus_lsi


def lda_trainning(dictionary, corpus_tfidf, K):
    """Train an LDA model with ``K`` topics and project the corpus onto it.

    Args:
        dictionary: the ``corpora.Dictionary`` of the corpus (used as id2word).
        corpus_tfidf: the training corpus. The parameter name is historical;
            the driver below passes the raw term-count corpus so that the
            model is trained in the same feature space ``similarity`` later
            queries it in.
        K: number of topics.

    Returns:
        ``(lda_model, corpus_lda)`` where ``corpus_lda`` is the training
        corpus expressed in topic space.

    Raises:
        Any exception raised by gensim propagates to the caller.
    """
    lda_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=K)
    corpus_lda = lda_model[corpus_tfidf]
    # Show the topic mixture of every training document.
    for doc_topics in corpus_lda:
        print(doc_topics)
    print("[INFO]: lda_trainning is finished!")
    return lda_model, corpus_lda


def similarity(query, dictionary, corpus_tf, lda_model):
    """Print the similarity of ``query`` to every document, in corpus order.

    Args:
        query: raw query text; tokenized the same way as ``pre_process``.
        dictionary: the ``corpora.Dictionary`` of the corpus.
        corpus_tf: term-count corpus to index.
        lda_model: trained LDA model used to project corpus and query.

    Returns:
        None. The scores are printed. This is a best-effort reporting step:
        on failure the traceback is printed and the function returns normally.
    """
    try:
        # Index the corpus in topic space. num_features is given explicitly
        # because the LDA output is sparse, so the width inferred from the
        # data alone could come out smaller than the number of topics.
        index = similarities.MatrixSimilarity(
            lda_model[corpus_tf], num_features=lda_model.num_topics
        )
        # Term-count representation of the query in the corpus vocabulary.
        query_bow = dictionary.doc2bow(query.lower().split())
        # Query expressed in topic space.
        query_lda = lda_model[query_bow]
        # One similarity score per indexed document.
        simi = index[query_lda]
        query_simi_list = list(simi)
        print(query_simi_list)
    except Exception:
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed.
        traceback.print_exc()


documents_token_list = pre_process(documents)
# Named `dictionary` rather than `dict` so the builtin is not shadowed.
dictionary, corpus_tf, corpus_tfidf = tf_idf_trainning(documents_token_list)
# To try LSI instead:
#   lsi_model, corpus_lsi = lsi_trainning(dictionary, corpus_tfidf, 2)
# LDA is trained on term counts: that is what similarity() projects below.
lda_model, corpus_lda = lda_trainning(dictionary, corpus_tf, 2)
similarity("Shipment of gold arrived in a truck", dictionary, corpus_tf, lda_model)
代码:在 Coursera 课程语料上用 gensim 和 NLTK 计算基于 LDA 的文本相似度
# -*- coding: utf-8 -*-
"""Coursera demo: find the courses most similar to one course using LDA.

Pipeline: NLTK tokenization, stop-word / punctuation removal and Lancaster
stemming -> bag-of-words -> LDA -> cosine similarity in topic space.

The code uses single-argument ``print(...)`` calls, ``io.open`` and
``except ... :`` forms that are valid on both Python 2 and Python 3.
"""
import io
import traceback
from collections import Counter

from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize

# ------------------------------------------------------------ constants
PATH = "../../data/coursera/coursera_corpus"
number_of_topics = 10
# Position (in the corpus file) of the course used as the demo query.
QUERY_COURSE_INDEX = 210
# Tokens dropped as punctuation after tokenization.
ENGLISH_PUNCTUATIONS = frozenset([
    ',', '.', ':', ';', '?', '(', ')', '[', ']',
    '&', '!', '*', '@', '#', '$', '%',
])


# ------------------------------------------------------------ functions
def _make_normalizer():
    """Return a function that turns raw text into a list of normalized stems.

    The returned function tokenizes with ``word_tokenize``, lower-cases,
    drops English stop words and ``ENGLISH_PUNCTUATIONS``, then applies the
    Lancaster stemmer. It is shared by the corpus and the query so that both
    end up in the same vocabulary.
    """
    stop_words = frozenset(stopwords.words('english'))
    stemmer = LancasterStemmer()

    def normalize(document):
        lowered = (word.lower() for word in word_tokenize(document))
        return [
            stemmer.stem(word)
            for word in lowered
            if word not in stop_words and word not in ENGLISH_PUNCTUATIONS
        ]

    return normalize


def pre_process(PATH):
    """Load the course corpus and turn every course into a list of stems.

    Args:
        PATH: path of a UTF-8 text file with one course per line; the text
            before the first tab of a line is taken as the course name.

    Returns:
        A 3-tuple ``(texts, courses, courses_name)``:

        * ``texts`` - one list of stems per course, with stems that occur
          only once in the whole corpus removed.
        * ``courses`` - the stripped raw lines, decoded as text.
        * ``courses_name`` - the course name of every line.

    Raises:
        IOError / OSError if the file cannot be read, UnicodeDecodeError if
        it is not valid UTF-8, LookupError if the NLTK data is missing.
        Errors are no longer swallowed: returning None only made the caller
        fail later with an unrelated unpacking error.
    """
    # Decode once at the I/O boundary and close the file deterministically.
    with io.open(PATH, encoding="utf-8") as corpus_file:
        courses = [line.strip() for line in corpus_file]
    courses_name = [course.split('\t')[0] for course in courses]

    normalize = _make_normalizer()
    texts_stemmed = [normalize(course) for course in courses]

    # Drop stems seen exactly once in the corpus. A single counting pass
    # replaces the quadratic sum(lists, []) + list.count() per stem.
    stem_counts = Counter(stem for text in texts_stemmed for stem in text)
    texts = [
        [stem for stem in text if stem_counts[stem] > 1]
        for text in texts_stemmed
    ]
    print("[INFO]: pre_process is finished!")
    return texts, courses, courses_name


def tf_idf_trainning(documents_token_list):
    """Build the vocabulary and the tf / tf-idf representations of a corpus.

    Args:
        documents_token_list: list of token lists, one per document.

    Returns:
        ``(dictionary, corpus_tf, corpus_tfidf)``: the ``corpora.Dictionary``
        built from the tokens, one ``doc2bow`` term-count vector per
        document, and the same corpus passed through the trained
        ``TfidfModel``.

    Raises:
        Any exception raised by gensim propagates to the caller.
    """
    # Map every token of every document onto the shared vector space.
    dictionary = corpora.Dictionary(documents_token_list)
    # Term-frequency representation of each document.
    corpus_tf = [dictionary.doc2bow(token_list) for token_list in documents_token_list]
    # Train tf-idf on the term frequencies, then re-express the corpus with it.
    tf_idf_model = models.TfidfModel(corpus_tf)
    corpus_tfidf = tf_idf_model[corpus_tf]
    print("[INFO]: tf_idf_trainning is finished!")
    return dictionary, corpus_tf, corpus_tfidf


def lda_trainning(dictionary, corpus_tfidf, K):
    """Train an LDA model with ``K`` topics and project the corpus onto it.

    Args:
        dictionary: the ``corpora.Dictionary`` of the corpus (used as id2word).
        corpus_tfidf: the training corpus. The parameter name is historical;
            the driver below passes the raw term-count corpus.
        K: number of topics.

    Returns:
        ``(lda_model, corpus_lda)`` where ``corpus_lda`` is the training
        corpus expressed in topic space.

    Raises:
        Any exception raised by gensim propagates to the caller.
    """
    lda_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=K)
    corpus_lda = lda_model[corpus_tfidf]
    print("[INFO]: lda_trainning is finished!")
    return lda_model, corpus_lda


def similarity(query, dictionary, corpus_tf, lda_model):
    """Print the ten documents most similar to ``query`` under the LDA model.

    Args:
        query: raw query text. It is normalized with the same tokenizer,
            stop-word filter and stemmer as the corpus; a plain
            ``lower().split()`` would produce unstemmed words that mostly do
            not exist in the stemmed dictionary.
        dictionary: the ``corpora.Dictionary`` of the corpus.
        corpus_tf: term-count corpus to index.
        lda_model: trained LDA model used to project corpus and query.

    Returns:
        None. Prints a list of up to ten ``(document_index, score)`` pairs,
        best match first. This is a best-effort reporting step: on failure
        the traceback is printed and the function returns normally.
    """
    try:
        # Index the corpus in topic space. num_features is given explicitly
        # because the LDA output is sparse, so the width inferred from the
        # data alone could come out smaller than the number of topics.
        index = similarities.MatrixSimilarity(
            lda_model[corpus_tf], num_features=lda_model.num_topics
        )
        # Term-count representation of the query in the corpus vocabulary.
        normalize = _make_normalizer()
        query_bow = dictionary.doc2bow(normalize(query))
        # Query expressed in topic space.
        query_lda = lda_model[query_bow]
        # One similarity score per indexed document, highest first.
        simi = index[query_lda]
        sort_simi = sorted(enumerate(simi), key=lambda item: -item[1])
        print(sort_simi[0:10])
    except Exception:
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed.
        traceback.print_exc()


# ------------------------------------------------------------ driver
texts, courses, courses_name = pre_process(PATH)
# Named `dictionary` rather than `dict` so the builtin is not shadowed.
dictionary, corpus_tf, corpus_tfidf = tf_idf_trainning(texts)
lda_model, corpus_lda = lda_trainning(dictionary, corpus_tf, number_of_topics)
similarity(courses[QUERY_COURSE_INDEX], dictionary, corpus_tf, lda_model)
1 0
- pythonNLP-文本相似度计算-Demo
- pythonNLP-文本相似度计算实验汇总
- 文本相似度计算
- 计算文本相似度
- 文本相似度计算
- 计算文本相似度
- java文本相似度计算
- simhash文本相似度计算
- simhash计算文本相似度
- python文本相似度计算
- python文本相似度计算
- simhash 文本相似度计算
- tfidf算法+余弦相似度算法计算文本相似度
- 利用余弦相似度计算文本相似度
- lucene计算文本相似度算法
- lucene计算文本相似度算法
- 文本相似度计算基本方法小结
- 文本挖掘2相似度计算
- ansible安装
- 后知后觉:论后悔药有多贵
- gone:visible:invisible的区别
- 重载
- 用于大数据分类的KNN算法研究
- pythonNLP-文本相似度计算-Demo
- HttpServletResponse 前端响应压缩文件And在压缩包中新建文件夹
- Java EnumSet 代替位域
- libevent学习贴总结
- 欢迎使用CSDN-markdown编辑器
- objective-c 中数据类型之五 数值类(NSNumber及NSNumberFormatter)
- UIAlertController message信息左对齐
- NGUI下alpha通道的分离与合并
- React-Native傻瓜式学习笔记(二):封装Navigator工具类