读取 GloVe 词向量文件并用 gensim 加载的代码

来源:互联网 发布:sci灌水 知乎 编辑:程序博客网 时间:2024/06/05 09:22
# Convert a Stanford GloVe text file into the word2vec text format that gensim
# can read (the same vectors, preceded by a "<num_lines> <dims>" header line),
# then load the result and run a few example queries.

import hashlib
import os


def prepend_slow(infile, outfile, line):
    """Write ``line`` followed by the full contents of ``infile`` to ``outfile``.

    This is the "slow" way to prepend a line: the input file is re-created
    rather than modified in place.

    Args:
        infile: Path of the UTF-8 text file to copy from.
        outfile: Path of the UTF-8 text file to create (overwritten if present).
        line: Header text to write first; a newline is appended to it.
    """
    with open(infile, 'r', encoding='utf-8') as fin:
        with open(outfile, 'w', encoding='utf-8') as fout:
            fout.write(line + "\n")
            # Stream the rest line by line so the whole input is never held in
            # memory at once.
            fout.writelines(fin)


def checksum(filename):
    """Return the hexadecimal MD5 digest of the file at ``filename``.

    Intended for verifying that a GloVe file is the same one that was used to
    pre-compute the line counts in ``pretrain_num_lines``.
    """
    BLOCKSIZE = 65536
    hasher = hashlib.md5()
    with open(filename, 'rb') as afile:
        # Read fixed-size chunks until read() returns the empty-bytes sentinel.
        for buf in iter(lambda: afile.read(BLOCKSIZE), b''):
            hasher.update(buf)
    return hasher.hexdigest()


# Pre-computed number of lines for known GloVe files, keyed by file name.
pretrain_num_lines = {"glove.840B.300d.txt": 2196017}


def check_num_lines_in_glove(filename, check_checksum=False):
    """Return the number of lines (one word vector per line) in a GloVe file.

    Known files are answered from ``pretrain_num_lines`` (matched on the base
    name, so a path to the file works too) without touching the disk. Any
    other file is opened and its lines are counted.

    Args:
        filename: Name of, or path to, the GloVe text file.
        check_checksum: Accepted for backward compatibility. It currently has
            no effect because no reference digests are defined here to compare
            ``checksum(filename)`` against.

    Returns:
        The line count as an int.
    """
    known = pretrain_num_lines.get(os.path.basename(filename))
    if known is not None:
        return known
    # Unknown file: count "\n"-terminated lines in binary mode so no decoding
    # is needed.
    with open(filename, 'rb') as fin:
        return sum(1 for _ in fin)


def _glove_dimensions(filename):
    """Parse the vector size out of a name like ``glove.840B.300d.txt``."""
    parts = os.path.basename(filename).split('.')
    # The dimension token is the component just before the extension,
    # e.g. "300d"; strip the trailing "d" to get the number.
    token = parts[-2] if len(parts) >= 3 else ''
    if not token.endswith('d') or not token[:-1].isdigit():
        raise ValueError(
            "cannot infer vector size from {!r}; expected a name like "
            "'glove.840B.300d.txt'".format(filename)
        )
    return int(token[:-1])


# Input: GloVe Model File
# More models can be downloaded from http://nlp.stanford.edu/projects/glove/
glove_file = "glove.840B.300d.txt"

# Output: Gensim Model text format.
gensim_file = 'glove_model.txt'


def main(glove_path=glove_file, model_path=gensim_file):
    """Convert ``glove_path`` to ``model_path``, load it and print sample queries."""
    # Imported here rather than at module level so the file-preparation
    # helpers above remain usable when gensim is not installed.
    import gensim

    num_lines = check_num_lines_in_glove(glove_path)
    dims = _glove_dimensions(glove_path)

    # Prepends the "<num_lines> <dims>" header line.
    gensim_first_line = "{} {}".format(num_lines, dims)
    prepend_slow(glove_path, model_path, gensim_first_line)

    # Load model
    model = gensim.models.KeyedVectors.load_word2vec_format(model_path)
    # NOTE(review): `syn0`, `syn0norm` and `word_vec` look like the pre-4.0
    # gensim attribute/method names -- confirm against the installed version.
    # NOTE(review): reusing the raw vectors as the "normed" ones skips
    # normalisation; presumably acceptable for this demo -- confirm.
    model.syn0norm = model.syn0  # prevent recalc of normed vectors
    model.word_vec('computer')  # obtain word vector
    print(model.most_similar(positive=['australia'], topn=10))
    print(model.similarity('woman', 'man'))


if __name__ == "__main__":
    main()