【word2vec Example 1】

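This example trains a gensim Word2Vec model on a Chinese question corpus. The script below reads questions from question.csv (GB18030-encoded), keeps the lines that contain Chinese, splits them into sentences on sentence-ending punctuation, tokenizes each sentence with the project's own preprocessing modules, and trains and saves a Word2Vec model. Note that wordcut, create_dict, vectorize, classify, parameters, and processHandler are in-house modules of the original project and are not included in the post; the commented-out blocks show alternative data sources (answers, concept descriptions, a pickled corpus) that can be swapped in.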
# coding: utf-8
import wordcut          # project-internal tokenizer module
import create_dict      # this and the next four imports are only used by
import vectorize        # the commented-out code paths below
import classify
import pickle
import psutil
import parameters       # project-internal config reader
import os
from collections import deque
import gensim
import numpy as np
import csv
import processHandler   # project-internal preprocessing module


def cos_sim(arrA, arrB):
    # Cosine similarity between two 1-D numpy arrays.
    return arrA.dot(arrB) / (np.linalg.norm(arrA) * np.linalg.norm(arrB))


# def compare(word_cutter, vectorizer, sentenceA, sentenceB):
#     tokenA = word_cutter.word_cut(sentenceA)  # word_cutter returns a list
#     tokenB = word_cutter.word_cut(sentenceB)
#     vectorA = np.array(vectorizer.vectorize(tokenA))
#     vectorB = np.array(vectorizer.vectorize(tokenB))
#     return cos_sim(vectorA, vectorB)


def main():
    current_dir = os.path.abspath('.')
    stopword_set = set()
    parameter = parameters.Parameters(os.path.join(current_dir, 'config.ini'), stopword_set)
    # w2v_training = os.path.join(current_dir, 'training_set20170810.csv')
    # pickle_path = os.path.join(current_dir, 'corpus')
    sentence_list = deque()
    # answer_path = os.path.join(current_dir, 'answer.csv')
    question_path = os.path.join(current_dir, 'question.csv')
    # concept_des_path = os.path.join(current_dir, 'concept_description.csv')
    w2v_file = os.path.join(current_dir, 'w2v_file_2017012.bin')
    word_cutter = wordcut.WordCutter(overlapping=parameter.overlapping)
    # try using stop-words
    preprocessor = processHandler.Prerocessor(False, False, False)

    # Alternative corpus: a pickled list of rows whose fields 1 and 2 hold text.
    # file = open(pickle_path, 'rb')
    # pklist = pickle.load(file)
    # file.close()
    trainingset = []
    # i = 0
    # for row in pklist:
    #     temp = row[1].replace('\r', '')
    #     temp = temp.replace('\n', '')
    #     trainingset.append(temp)
    #     temp = row[2].replace('\r', '')
    #     temp = temp.replace('\n', '')
    #     trainingset.append(temp)
    #     i += 1
    #     if i > 100000:
    #         break
    # del pklist

    # Alternative corpus: answer texts.
    # file = open(answer_path, encoding='gb18030')
    # cache = file.readlines()
    # file.close()
    # for item in cache:
    #     temp = item.replace('\n', '')
    #     temp = temp.replace('\r', '')
    #     trainingset.append(temp)

    # Load the question corpus, one question per line.
    file = open(question_path, encoding='gb18030')
    cache = file.readlines()
    file.close()
    for item in cache:
        temp = item.replace('\n', '')
        temp = temp.replace('\r', '')
        trainingset.append(temp)

    # Alternative corpus: concept descriptions.
    # file = open(concept_des_path, encoding='gb18030')
    # cache = file.readlines()
    # file.close()
    # for item in cache:
    #     temp = item.replace('\n', '')
    #     temp = temp.replace('\r', '')
    #     trainingset.append(temp)
    # del cache

    # Keep only lines that contain Chinese, then split each line into
    # sentences on sentence-ending punctuation.
    while len(trainingset) > 0:
        contain_chinese = False
        last = trainingset.pop()
        for item in last:
            if word_cutter.is_chinese(item):
                contain_chinese = True
                break
        if contain_chinese:
            temp = last
            # The punctuation literals were lost in the source's transcoding;
            # full-width 。！？ are assumed here.
            for symbol in (u'。', u'！', u'？', '!', '?'):
                temp = temp.replace(symbol, ' ')
            temp = temp.split()
            for sentence in temp:
                sentence_list.append(sentence)
    del trainingset

    # Tokenize each sentence; report progress every 10000 sentences.
    sentence_token = deque()
    total = len(sentence_list)
    i = 0
    while len(sentence_list) > 0:
        i += 1
        temp = preprocessor.process_main(sentence_list.pop())[-1]
        if temp is not None:
            sentence_token.append(temp)
        if i >= 10000:
            print([len(sentence_list), total])
            i = 0

    # gensim < 4.0 API (in gensim 4.x: size -> vector_size, iter -> epochs).
    # sg=1 would select skip-gram; the default sg=0 selects CBOW.
    # earlier variant: Word2Vec(sentence_token, size=parameter.n_neuron, workers=4, seed=1024, iter=20, sg=0)
    dic = gensim.models.Word2Vec(sentence_token, size=parameter.n_neuron,
                                 workers=3, seed=1024, iter=20)
    # dic = gensim.models.Doc2Vec(sentence_token, size=parameter.n_neuron, workers=3, seed=1024, iter=20)
    sentence_token = deque()
    dic.save(w2v_file)
    # dic.save_word2vec_format(w2v_file, binary=True)


if __name__ == '__main__':
    main()
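For reference, here is a minimal sketch of how the saved model could be used for the sentence-similarity comparison that the commented-out compare() hints at. It is not part of the original post: it assumes the same gensim < 4.0 API as the training script, uses averaged word vectors as the sentence vector (the project's own vectorize module is not shown, so this stands in for it), and the token lists are placeholders for the output of wordcut.WordCutter.

# A usage sketch (not from the original post); assumes gensim < 4.0.
import gensim
import numpy as np

def cos_sim(arrA, arrB):
    return arrA.dot(arrB) / (np.linalg.norm(arrA) * np.linalg.norm(arrB))

def sentence_vector(model, tokens):
    # Average the vectors of in-vocabulary tokens; None if no token is known.
    vectors = [model.wv[t] for t in tokens if t in model.wv]
    if not vectors:
        return None
    return np.mean(vectors, axis=0)

model = gensim.models.Word2Vec.load('w2v_file_2017012.bin')
tokens_a = ['如何', '重置', '密码']   # placeholder token lists; in the project
tokens_b = ['怎么', '修改', '密码']   # these would come from WordCutter
vec_a = sentence_vector(model, tokens_a)
vec_b = sentence_vector(model, tokens_b)
if vec_a is not None and vec_b is not None:
    print(cos_sim(vec_a, vec_b))

Averaging in-vocabulary word vectors is the simplest sentence representation; it ignores word order, which is presumably why the script also carries a commented-out Doc2Vec alternative.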