重写gensim.word2vec的文本相似度匹配函数（wmdistance）

来源：互联网发布：网上订餐系统源码下载编辑：程序博客网时间：2024/05/22 17:03

1.为什么要重写

因为在Django上莫名其妙地不能import gensim。
从而不能from gensim.models import Word2Vec。
不能load_model.word2vec_model.wmdistance（sentence1,sentence2）。
因此根据源码更改了引入gensim包的部分内容。

2.改写后的代码

import pyemd
# NOTE(review): Dictionary is not used anywhere below (its only call site was
# replaced by a plain dict). The import is kept because it was in the original
# file; it still pulls in the gensim package when this module is loaded.
from gensim.corpora.dictionary import Dictionary
from numpy import (
    exp, log, dot, zeros, outer, random, dtype, float32 as REAL,
    double, uint32, seterr, array, uint8, vstack, fromstring, sqrt, newaxis,
    ndarray, empty, sum as np_sum, prod, ones, ascontiguousarray,
)
from keras.models import load_model
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3
import logging
import sys
from collections import Counter

logger = logging.getLogger(__name__)

# Python 2 only: force UTF-8 as the implicit str<->unicode codec.
# ``reload`` is not a builtin on Python 3, hence the version guard.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')


def sentence_distance(document1, document2):
    """Return the Word Mover's Distance between two documents.

    This is a gensim-free adaptation of ``wmdistance``. It relies on two
    module-level globals that this script only assigns in its ``__main__``
    section:

    * ``word_index`` -- mapping from token to an integer id;
    * ``embedding``  -- container indexed by that id, yielding the word vector.

    Args:
        document1: Iterable of tokens (must support ``len``).
        document2: Iterable of tokens (must support ``len``).

    Returns:
        The earth mover's distance computed by ``pyemd.emd``; ``0.0`` when
        both documents reduce to the same single token; ``float('inf')``
        when either document has no in-vocabulary token or when the
        distance matrix is all zeros.
    """
    len_pre_oov1 = len(document1)
    len_pre_oov2 = len(document2)

    # Drop out-of-vocabulary tokens and map the remaining ones to their ids.
    document1 = [word_index[token] for token in document1 if token in word_index]
    document2 = [word_index[token] for token in document2 if token in word_index]

    diff1 = len_pre_oov1 - len(document1)
    diff2 = len_pre_oov2 - len(document2)
    if diff1 > 0 or diff2 > 0:
        print('Removed %d and %d OOV words from document 1 and 2 (respectively).'
              % (diff1, diff2))

    if not document1 or not document2:
        print('At least one of the documents had no words that were '
              'in the vocabulary. Aborting (returning inf).')
        return float('inf')

    # Local vocabulary: slot index -> token id, over the union of both documents.
    dictionary = dict(enumerate(set(document1 + document2)))
    vocab_len = len(dictionary)

    if vocab_len == 1:
        # Both documents are composed by a single unique token
        return 0.0

    docset1 = set(document1)
    docset2 = set(document2)

    # Euclidean distance between word vectors; only (doc1 token, doc2 token)
    # pairs are filled in, every other cell stays 0.
    distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
    for i, t1 in dictionary.items():
        if t1 not in docset1:
            continue
        for j, t2 in dictionary.items():
            if t2 not in docset2:
                continue
            distance_matrix[i, j] = sqrt(np_sum((embedding[t1] - embedding[t2]) ** 2))

    if np_sum(distance_matrix) == 0.0:
        print('The distance matrix is all zeros. Aborting (returning inf).')
        return float('inf')

    # Inverse of ``dictionary`` so a token id resolves to its slot in O(1).
    slot_of = {token: slot for slot, token in dictionary.items()}

    def nbow(document):
        """Return the normalized bag-of-words vector of ``document``."""
        d = zeros(vocab_len, dtype=double)
        doc_len = float(len(document))
        for token, freq in Counter(document).items():
            d[slot_of[token]] = freq / doc_len  # Normalized word frequencies.
        return d

    d1 = nbow(document1)
    d2 = nbow(document2)

    # Solve the transport problem once and reuse the value for print + return.
    distance = pyemd.emd(d1, d2, distance_matrix)
    print(distance)
    return distance


if __name__ == "__main__":
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    with open('/home/.../word2vec_save.pkl', 'rb') as f:
        embedding = pickle.load(f)
    # NOTE(review): this path is truncated in the source; point it at the
    # pickled token -> id mapping.
    with open('/home/...', 'rb') as f:
        word_index = pickle.load(f)
    # NOTE(review): the call name is truncated in the source ("...distance(");
    # sentence_distance is the only matching function. The arguments are raw
    # strings, so they are iterated element by element rather than as
    # pre-tokenized word lists -- confirm word_index is keyed accordingly.
    sentence_distance('我的密码忘记了', '我忘了密码')

0 0