Computing the cosine similarity of two documents (tf-idf)

# -*- coding:utf-8 -*-
"""
@author: Linlifang
"""
import os
import jieba
import sys
import re
import string
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Python 2: make utf-8 the default encoding so the segmented text can be
# written back to disk without explicit encode() calls.
reload(sys)
sys.setdefaultencoding('utf-8')

'''
First read the documents in a folder and segment them with jieba, saving the
results to files. Then use sklearn to compute the tf-idf weight of every term
in each document and save the weights, one file per document. Finally, pick
any two of those txt files and compute the cosine similarity between them.
'''


def getFileList(path):
    # List every file in the folder, skipping hidden files.
    filelist = []
    for f in os.listdir(path):
        if f[0] != '.':
            filelist.append(f)
    return filelist, path


def segment(filename, path, segPath):
    # Read the raw document.
    f = open(path + "/" + filename, 'r+')
    file_list = f.read()
    f.close()
    if not os.path.exists(segPath):
        os.mkdir(segPath)
    # Segment the document, dropping whitespace, brackets and ASCII tokens.
    seg_list = jieba.cut(file_list, cut_all=False)
    result = []
    for seg in seg_list:
        seg = ''.join(seg.split())
        r = re.search(r'\w+', seg)  # the original pattern 'w+' was presumably a typo for \w+
        if seg != '' and seg != ' = ' and seg != '[' and seg != ']' and seg != '(' and seg != ')' and not r:
            result.append(seg)
    # Remove stop words.
    finalresult = []
    stopword = open('stopworda.txt').read()
    for word in result:
        if word in stopword:
            continue
        if u'\u4e00' <= word <= u'\u9fa5':  # crude check that the token is Chinese
            finalresult.append(word)
    # Join the segmented words with spaces and save the result locally.
    f = open(segPath + "/" + filename + "-seg.txt", "w+")
    f.write(' '.join(finalresult))
    f.close()


def Tfidf(filelist, sFilePath, path):
    # Read the segmented documents and compute the tf-idf weights.
    corpus = []
    for ff in filelist:
        fname = path + "/" + ff
        f = open(fname + "-seg.txt", 'r+')
        corpus.append(f.read())
        f.close()
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()  # every term in the corpus
    weight = tfidf.toarray()               # weight[i][j]: tf-idf of term j in document i
    if not os.path.exists(sFilePath):
        os.mkdir(sFilePath)
    for i in range(len(weight)):
        print u'-writing all the tf-idf in the', i, u'file into', sFilePath + '/' + string.zfill(i, 2) + ".txt"
        f = open(sFilePath + "/" + string.zfill(i, 2) + ".txt", 'w+')
        for j in range(len(word)):
            f.write(word[j] + "  " + str(weight[i][j]) + " " + "\n")
        f.close()


def coutcos(file1, file2):
    # Each line reads "term  weight ", so after split(' ') the weight is field 2.
    cipin1 = open(file1).readlines()
    cipin2 = open(file2).readlines()
    list1 = [x.split(' ')[2] for x in cipin1]
    list2 = [x.split(' ')[2] for x in cipin2]
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(list1, list2):
        a = float(a)
        b = float(b)
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        return None
    return dot_product / ((normA * normB) ** 0.5)


if __name__ == "__main__":
    # Folder for the tf-idf results
    sFilePath = "C:/Users/llfang1/PycharmProjects/untitled2/corpus/tfidffile"
    # Folder for the segmented documents
    segPath = 'C:/Users/llfang1/PycharmProjects/untitled2/corpus/segfile'
    (allfile, path) = getFileList('C:/Users/llfang1/PycharmProjects/untitled2/corpus/allkeyword')
    for ff in allfile:
        print "Using jieba on " + ff
        segment(ff, path, segPath)
    Tfidf(allfile, sFilePath, segPath)
    file1 = sFilePath + "/" + "04.txt"
    file2 = sFilePath + "/" + "05.txt"
    similar = coutcos(file1, file2)
    print similar
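
For reference, coutcos() above is just the textbook cosine formula, cos θ = (A · B) / (||A|| · ||B||), applied to the two weight columns read back from the tf-idf files. As a sanity check you can skip the intermediate files entirely and let sklearn do the vector math. The sketch below is illustrative rather than part of the original program: the two file names are placeholders for any two segmented files produced by segment(), and TfidfVectorizer (which combines CountVectorizer and TfidfTransformer in one step) stands in for the two-stage pipeline used above.

# Minimal cross-check of coutcos(); file names are hypothetical placeholders.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = [open(f).read() for f in ["a-seg.txt", "b-seg.txt"]]  # two segmented docs
tfidf = TfidfVectorizer().fit_transform(docs)  # one L2-normalized row per document
sim = cosine_similarity(tfidf[0], tfidf[1])    # returns a (1, 1) array
print sim[0][0]

Because TfidfTransformer L2-normalizes each row by default, the dot product of two rows already equals their cosine similarity; the explicit re-normalization in coutcos() is harmless but redundant when the weights come from this pipeline.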
Note: this program was written with reference to a fellow developer's program, with changes and some additions of my own.