word2vec Word Vector Training

# -*- coding: utf-8 -*-
"""
Created on Sat Oct  7 09:06:18 2017

@author: su
"""
import jieba
import re

# Output file for the segmented corpus and input file for the raw novel text
fw = open("xiyoujiyuliao.txt", "w", encoding="utf-8")
fr = open("西游记.txt", "r", encoding="gbk")
# Build the stopword list from a file (one stopword per line)
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords
# Segment a sentence with jieba and drop stopwords; returns a space-separated string
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('stopwords.txt')  # path to the stopword file
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr
    
  
lines = fr.readlines()
for line in lines:
    # Keep only the Chinese characters of each line, then segment and remove stopwords
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, line)
    filtered = " ".join(filterdata)
    line_seg = seg_sentence(filtered)  # returns a space-separated string
    fw.write(line_seg + '\n')
   
fw.close()  
fr.close()  
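
With the corpus written out, it is worth a quick sanity check that segmentation and stopword removal really produced space-separated tokens before training on it. A minimal sketch, assuming the xiyoujiyuliao.txt file generated above sits in the working directory:

# Print the first few non-empty lines of the generated corpus as a sanity check.
with open("xiyoujiyuliao.txt", "r", encoding="utf-8") as f:
    shown = 0
    for line in f:
        if line.strip():
            print(line.strip())
            shown += 1
        if shown >= 5:
            break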
from gensim.models import word2vec
import logging


# Main program
logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(u"xiyoujiyuliao.txt")  # load the segmented corpus
model = word2vec.Word2Vec(sentences, size=128)  # train the model (CBOW by default; pass sg=1 for skip-gram), window defaults to 5
   
print(model)
# Compute the similarity / relatedness of two words
try:
    y1 = model.similarity(u"孙悟空", u"牛魔王")
except KeyError:
    y1 = 0
print(u"Similarity between 【孙悟空】 and 【牛魔王】:", y1)
print("-----\n")
y2 = model.most_similar(u"八戒", topn=20)  # the 20 most related words
print(u"Words most related to 【八戒】:\n")
for item in y2:
    print(item[0], item[1])
print("-----\n")
          