word2vec Word Vector Training

# -*- coding: utf-8 -*-
"""
Created on Sat Oct  7 09:06:18 2017

@author: su
"""
import jieba
import re

# Output file for the segmented corpus and input file for the raw novel text
fw = open("xiyoujiyuliao.txt", "w", encoding="utf-8")
fr = open("西游记.txt", "r", encoding="gbk")
# Build the stopword list from a file (one stopword per line)
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords
# Segment a sentence with jieba and drop stopwords; returns a space-separated string
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('stopwords.txt')  # path to the stopword file
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr
    
  
lines = fr.readlines()
for line in lines:
    # Keep only the Chinese characters of each line, then segment and remove stopwords
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, line)
    filtered = " ".join(filterdata)
    line_seg = seg_sentence(filtered)  # returns a space-separated string
    fw.write(line_seg + '\n')
   
fw.close()  
fr.close()  
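
With the corpus written out, it is worth a quick sanity check that segmentation and stopword removal really produced space-separated tokens before training on it. A minimal sketch, assuming the xiyoujiyuliao.txt file generated above sits in the working directory:

# Print the first few non-empty lines of the generated corpus as a sanity check.
with open("xiyoujiyuliao.txt", "r", encoding="utf-8") as f:
    shown = 0
    for line in f:
        if line.strip():
            print(line.strip())
            shown += 1
        if shown >= 5:
            break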
from gensim.models import word2vec
import logging


# Main program
logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(u"xiyoujiyuliao.txt")  # load the segmented corpus
model = word2vec.Word2Vec(sentences, size=128)  # train the model (CBOW by default; pass sg=1 for skip-gram), window defaults to 5
   
print(model)
# Compute the similarity / relatedness of two words
try:
    y1 = model.similarity(u"孙悟空", u"牛魔王")
except KeyError:
    y1 = 0
print(u"Similarity between 【孙悟空】 and 【牛魔王】:", y1)
print("-----\n")
y2 = model.most_similar(u"八戒", topn=20)  # the 20 most related words
print(u"Words most related to 【八戒】:\n")
for item in y2:
    print(item[0], item[1])
print("-----\n")
          