word2vec 词向量训练
来源:互联网 发布:局域网屏幕监控软件 编辑:程序博客网 时间:2024/05/17 04:17
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 7 09:06:18 2017
@author: su
"""
import jieba
import re
import pandas as pd
# Output corpus file (UTF-8, write mode) and the raw novel text (GBK, read
# mode) consumed by the preprocessing step.
fw = open("xiyoujiyuliao.txt", mode="w", encoding="utf-8")
fr = open("西游记.txt", mode="r", encoding="gbk")
# Build the stop-word list.
def stopwordslist(filepath):
    """Load a stop-word list from a UTF-8 text file.

    Args:
        filepath: Path to a text file holding one stop word per line.

    Returns:
        A list with one entry per line of the file, in file order, with
        leading and trailing whitespace stripped from each entry.
    """
    # The context manager closes the handle even if reading fails.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]
# Default stop words are read on first use and reused afterwards, because
# seg_sentence is called once per corpus line.
_STOPWORD_CACHE = {}


def seg_sentence(sentence, stopwords=None):
    """Segment a sentence with jieba and drop stop words.

    Args:
        sentence: Text to segment; surrounding whitespace is stripped first.
        stopwords: Optional iterable of words to drop. When omitted, the
            words are loaded from 'stopwords.txt' via stopwordslist() on the
            first call and cached for later calls.

    Returns:
        A string in which every kept token is followed by a single space,
        so a non-empty result ends with a trailing space.
    """
    if stopwords is None:
        path = 'stopwords.txt'  # path of the default stop-word file
        if path not in _STOPWORD_CACHE:
            _STOPWORD_CACHE[path] = frozenset(stopwordslist(path))
        stopwords = _STOPWORD_CACHE[path]
    else:
        stopwords = frozenset(stopwords)

    # Tab tokens are dropped in addition to stop words.
    return ''.join(
        word + ' '
        for word in jieba.cut(sentence.strip())
        if word not in stopwords and word != '\t'
    )
# Segment the novel line by line and write one space-separated line per
# input line to the corpus file.
# NOTE(review): each raw line is segmented as-is. A Chinese-only filter
# (regex r'[\u4e00-\u9fa5]+') used to be computed here, but its result never
# reached the output, so the dead computation was removed. Confirm whether
# non-Chinese characters should be stripped before segmentation.
try:
    # Iterating the handle streams the file instead of loading it whole.
    for line in fr:
        fw.write(seg_sentence(line) + '\n')  # seg_sentence returns a str
finally:
    # Close both handles even if segmentation or writing fails.
    try:
        fw.close()
    finally:
        fr.close()
from gensim.models import word2vec
import logging
# Main program: train word vectors on the segmented corpus and query them.
logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(u"xiyoujiyuliao.txt")  # load the corpus
# With gensim's default sg=0 this trains a CBOW model (pass sg=1 for
# skip-gram); the context window defaults to 5.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; update this
# keyword if the installed gensim is upgraded.
model = word2vec.Word2Vec(sentences, size=128)
print(model)

# Similarity between two words. Lookups go through model.wv, the keyed-vector
# store, rather than the deprecated methods on the model itself.
try:
    y1 = model.wv.similarity(u"孙悟空", u"牛魔王")
except KeyError:
    # At least one of the words is missing from the trained vocabulary.
    y1 = 0
print(u"【孙悟空】和【牛魔王】的相似度为:", y1)
print("-----\n")

# The 20 words most similar to the query word.
try:
    y2 = model.wv.most_similar(u"八戒", topn=20)
except KeyError:
    # Query word missing from the vocabulary: report no neighbours, matching
    # the fallback used for the similarity lookup.
    y2 = []
print(u"和【八戒】最相关的词有:\n")
for word, score in y2:
    print(word, score)
print("-----\n")
"""
Created on Sat Oct 7 09:06:18 2017
@author: su
"""
import jieba
import re
import pandas as pd
# Output corpus file (UTF-8, write mode) and the raw novel text (GBK, read
# mode) consumed by the preprocessing step.
fw = open("xiyoujiyuliao.txt", mode="w", encoding="utf-8")
fr = open("西游记.txt", mode="r", encoding="gbk")
# Build the stop-word list.
def stopwordslist(filepath):
    """Load a stop-word list from a UTF-8 text file.

    Args:
        filepath: Path to a text file holding one stop word per line.

    Returns:
        A list with one entry per line of the file, in file order, with
        leading and trailing whitespace stripped from each entry.
    """
    # The context manager closes the handle even if reading fails.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]
# Default stop words are read on first use and reused afterwards, because
# seg_sentence is called once per corpus line.
_STOPWORD_CACHE = {}


def seg_sentence(sentence, stopwords=None):
    """Segment a sentence with jieba and drop stop words.

    Args:
        sentence: Text to segment; surrounding whitespace is stripped first.
        stopwords: Optional iterable of words to drop. When omitted, the
            words are loaded from 'stopwords.txt' via stopwordslist() on the
            first call and cached for later calls.

    Returns:
        A string in which every kept token is followed by a single space,
        so a non-empty result ends with a trailing space.
    """
    if stopwords is None:
        path = 'stopwords.txt'  # path of the default stop-word file
        if path not in _STOPWORD_CACHE:
            _STOPWORD_CACHE[path] = frozenset(stopwordslist(path))
        stopwords = _STOPWORD_CACHE[path]
    else:
        stopwords = frozenset(stopwords)

    # Tab tokens are dropped in addition to stop words.
    return ''.join(
        word + ' '
        for word in jieba.cut(sentence.strip())
        if word not in stopwords and word != '\t'
    )
# Segment the novel line by line and write one space-separated line per
# input line to the corpus file.
# NOTE(review): each raw line is segmented as-is. A Chinese-only filter
# (regex r'[\u4e00-\u9fa5]+') used to be computed here, but its result never
# reached the output, so the dead computation was removed. Confirm whether
# non-Chinese characters should be stripped before segmentation.
try:
    # Iterating the handle streams the file instead of loading it whole.
    for line in fr:
        fw.write(seg_sentence(line) + '\n')  # seg_sentence returns a str
finally:
    # Close both handles even if segmentation or writing fails.
    try:
        fw.close()
    finally:
        fr.close()
from gensim.models import word2vec
import logging
# Main program: train word vectors on the segmented corpus and query them.
logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(u"xiyoujiyuliao.txt")  # load the corpus
# With gensim's default sg=0 this trains a CBOW model (pass sg=1 for
# skip-gram); the context window defaults to 5.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; update this
# keyword if the installed gensim is upgraded.
model = word2vec.Word2Vec(sentences, size=128)
print(model)

# Similarity between two words. Lookups go through model.wv, the keyed-vector
# store, rather than the deprecated methods on the model itself.
try:
    y1 = model.wv.similarity(u"孙悟空", u"牛魔王")
except KeyError:
    # At least one of the words is missing from the trained vocabulary.
    y1 = 0
print(u"【孙悟空】和【牛魔王】的相似度为:", y1)
print("-----\n")

# The 20 words most similar to the query word.
try:
    y2 = model.wv.most_similar(u"八戒", topn=20)
except KeyError:
    # Query word missing from the vocabulary: report no neighbours, matching
    # the fallback used for the similarity lookup.
    y2 = []
print(u"和【八戒】最相关的词有:\n")
for word, score in y2:
    print(word, score)
print("-----\n")
阅读全文
0 0
- word2vec 词向量训练
- 利用Word2Vec训练词向量过程
- 使用预训练的word2vec词向量
- 使用预训练的word2vec词向量
- Windows下使用Word2vec继续词向量训练
- Windows下使用Word2vec继续词向量训练
- word2vec词向量训练及中文文本相似度计算
- Windows下使用Word2vec继续词向量训练
- Windows下使用Word2vec继续词向量训练
- word2vec (四) 动手训练一个词向量空间
- word2vec词向量训练及中文文本相似度计算
- 用Word2vec训练中文wiki,构造词向量并做词聚类
- 基于python的gensim word2vec训练词向量
- word2vec词向量训练及gensim的使用
- 用word2vec训练文本摘要的词向量模型
- Windows下使用Word2vec继续词向量训练
- Windows下运行C语言版Word2Vec训练词向量
- windows下使用Word2vec进行词向量训练
- 如何在VMware虚拟机中安装Linux SUSE 11系统
- Windows7下安装cpu版的Tensorflow
- pip安装python包出现Cannot fetch index base URL http://pypi.python.org/simple/
- Python format函数详解
- 向码云提交项目
- word2vec 词向量训练
- 对称加密算法/非对称加密算法/不可逆加密算法
- 解决运行scrapy时报错No module named cryptography,解决cryptography的安装问题,解决libffi的安装问题
- 安装Intel HAXM为Android 模拟器加速,30秒内启动完成
- 7-56 找鞍点(20 分)
- 解决JS浮点数(小数)计算加减乘除的BUG
- Java正则表达式中量词贪婪型,勉强型,占有型简单描述
- GIT JSON MAVEN DOCKER
- 个人学习记录-无用处