python自然语言处理-----计算文本相似度

来源:互联网 发布:火火软件站 编辑:程序博客网 时间:2024/04/28 10:17
"""Compute text similarity between Chinese documents with gensim TF-IDF.

Segments three local HTML documents with jieba, builds a TF-IDF model over
the first two, and prints the cosine similarity of the third document
against each of them.
"""
from collections import defaultdict
import urllib.request

import jieba
from gensim import corpora, models, similarities

# Environment-specific paths from the original script; adjust before running.
USER_DICT_PATH = "C:/Users/yyq/Desktop/毕业论文/词典.txt"
REFERENCE_URLS = [
    "file:///C:/php/WWW/%E6%96%87%E6%A1%A31.html",
    "file:///C:/php/WWW/%E6%96%87%E6%A1%A3%202.html",
]
QUERY_URL = "file:///C:/php/WWW/%E6%96%87%E6%A1%A33.html"
CORPUS_OUT_PATH = "C:/Users/yyq/Desktop/毕业论文/corpus.txt"


def read_document(url):
    """Fetch *url* and decode the body as GBK, ignoring undecodable bytes."""
    return urllib.request.urlopen(url).read().decode("gbk", "ignore")


def segment(text):
    """Return jieba word tokens for *text*, dropping whitespace-only tokens.

    The original joined tokens with spaces and re-split the result, which
    implicitly discarded whitespace tokens; filtering here preserves that.
    """
    return [token for token in jieba.cut(text) if token.strip()]


def main():
    jieba.load_userdict(USER_DICT_PATH)

    # Tokenize the two reference documents.
    texts = [segment(read_document(url)) for url in REFERENCE_URLS]

    # Token frequencies across the reference corpus (kept for the optional
    # low-frequency filter below).
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    # Optional rare-token filter. NOTE: the original (commented-out) version
    # had a bug — it tested `frequency[token]` instead of `frequency[word]`.
    # texts = [[word for word in text if frequency[word] > 2] for text in texts]

    dictionary = corpora.Dictionary(texts)

    # Map the query document into the dictionary's bag-of-words space.
    query_bow = dictionary.doc2bow(segment(read_document(QUERY_URL)))

    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(CORPUS_OUT_PATH, corpus)

    # TF-IDF weighting + sparse cosine-similarity index over the references.
    tfidf = models.TfidfModel(corpus)
    feature_count = len(dictionary.token2id)
    index = similarities.SparseMatrixSimilarity(
        tfidf[corpus], num_features=feature_count
    )

    # Similarity of the query document against each reference document.
    sim = index[tfidf[query_bow]]
    print(sim)


if __name__ == "__main__":
    main()

(Image placeholder — the original post included a screenshot of the similarity output here, which was lost during extraction.)
结论:第三个文本和第一个第二个文本的相似度为:0.007和0.03