【word2vec实例1】
来源:互联网 发布:java gc机制 编辑:程序博客网 时间:2024/06/05 02:43
# coding = utf8
"""Train a gensim word2vec model on Chinese question sentences.

Reads ``question.csv`` (gb18030-encoded, one record per line) from the
working directory, keeps only lines containing at least one Chinese
character, splits them into sentences on end-of-sentence punctuation,
tokenizes each sentence, then trains a Word2Vec model and saves it.
"""
import csv
import os
import pickle
from collections import deque

import gensim
import numpy as np
import psutil

import classify
import create_dict
import parameters
import processHandler
import vectorize
import wordcut

# End-of-sentence punctuation (full-width and half-width) used to split
# a raw line into individual sentences.
_SENTENCE_DELIMITERS = (u'。', u'?', u'!', '!', '?')


def cos_sim(arrA, arrB):
    """Return the cosine similarity of two 1-D numpy vectors.

    NOTE(review): returns NaN/inf if either vector has zero norm —
    callers are assumed to pass non-zero vectors.
    """
    return arrA.dot(arrB) / (np.linalg.norm(arrA) * np.linalg.norm(arrB))


def _read_lines(path):
    """Read *path* as gb18030 text; return lines with CR/LF removed."""
    # Context manager guarantees the handle is closed even on a decode
    # error (the original open/readlines/close was not exception-safe).
    with open(path, encoding='gb18030') as fh:
        return [line.replace('\n', '').replace('\r', '') for line in fh]


def _split_sentences(lines, word_cutter):
    """Yield sentences from *lines* that contain Chinese text.

    Lines without a single Chinese character (headers, pure-ASCII rows)
    are skipped entirely, matching the original filter.
    """
    for line in lines:
        if not any(word_cutter.is_chinese(ch) for ch in line):
            continue
        for delimiter in _SENTENCE_DELIMITERS:
            line = line.replace(delimiter, ' ')
        # After delimiter substitution, whitespace split yields sentences.
        for sentence in line.split():
            yield sentence


def _tokenize(sentences, preprocessor, report_every=10000):
    """Tokenize each sentence; return a deque of token lists.

    ``preprocessor.process_main`` returns a sequence whose last element
    is the token list (or None for sentences that produce no tokens).
    Prints a progress line every *report_every* sentences.
    """
    tokens = deque()
    for count, sentence in enumerate(sentences, 1):
        token = preprocessor.process_main(sentence)[-1]
        if token is not None:
            tokens.append(token)
        if count % report_every == 0:
            print('processed %d sentences' % count)
    return tokens


def main():
    """Build the training corpus and train/save the word2vec model."""
    current_dir = os.path.abspath('.')
    stopword_set = set()  # stop-word filtering currently disabled
    parameter = parameters.Parameters(
        os.path.join(current_dir, 'config.ini'), stopword_set)
    question_path = os.path.join(current_dir, 'question.csv')
    w2v_file = os.path.join(current_dir, 'w2v_file_2017012.bin')

    word_cutter = wordcut.WordCutter(overlapping=parameter.overlapping)
    # NOTE(review): the misspelled class name "Prerocessor" is defined
    # that way in processHandler — do not "fix" it here.
    preprocessor = processHandler.Prerocessor(False, False, False)

    lines = _read_lines(question_path)
    sentence_token = _tokenize(
        _split_sentences(lines, word_cutter), preprocessor)

    # gensim < 4.0 API: `size`/`iter` were renamed `vector_size`/`epochs`
    # in gensim 4.0 — keep the old names to match the pinned dependency.
    model = gensim.models.Word2Vec(
        sentence_token,
        size=parameter.n_neuron,
        workers=3,
        seed=1024,
        iter=20,
    )
    model.save(w2v_file)


if __name__ == '__main__':
    main()
阅读全文
0 0
- 【word2vec实例1】
- gensim word2vec 实例
- word2vec【1】
- 【word2vec实例2】加载模型
- word2vec学习(1)
- 表示学习1-word2vec
- Word2Vec训练1
- word2vec
- word2vec
- word2vec
- Word2Vec
- word2vec
- Word2Vec
- Word2Vec
- word2vec
- word2vec
- word2vec
- word2vec
- 天天学Linux命令21--find命令之exec
- 【Java线程】锁机制:synchronized、Lock、Condition
- 将数据库读取的平级数据转换成父子孙结构
- H5移动端调用相机/相册
- mysql操作相关合集
- 【word2vec实例1】
- 金融科技:为何商业银行必须追随零售银行
- MySQL 常见的报错及解决方式
- DFINITY区块链连载(一) 密码学技术介绍
- DFINITY区块链连载(二)阈值组创建与区块生成
- 百练+链表相加+主要就是链表的操作
- probimage问题
- 将pdf按照页数转化为图片
- oracle工作整理 存储过程封装sql脚本实例,完整用到oracle好几个用法