CSIC2010学习Word2vec表示及可视化

来源:互联网 发布:阿里云ecs推荐码 编辑:程序博客网 时间:2024/06/07 04:42
sudo apt-get install liblapack-dev
sudo apt-get install gfortran
sudo apt-get install python-pandas
sudo pip install --upgrade gensim
sudo pip install jieba
sudo pip install theano  # (0.7)

根据给定词生成word2vec词向量

# -*- coding: utf-8 -*-
"""
Created on Thu Jun 15 16:24:01 2017
@author: Jiabao Wang
@description: Generate word2vec model based on given words
"""
import gensim.models.word2vec as w2v


def train_model(input_file_name, model_file_name):
    """Train a word2vec model on a corpus file and save it to disk.

    Args:
        input_file_name: Path of the corpus file, read through
            ``LineSentence``.
        model_file_name: Path the trained model is written to.
    """
    # Train the model and generate the word vectors.
    sentences = w2v.LineSentence(input_file_name)
    # NOTE(review): written against the 2017-era gensim API. In gensim >= 4.0
    # ``size`` is reportedly renamed ``vector_size`` -- confirm against the
    # installed version before upgrading.
    model = w2v.Word2Vec(sentences, size=20, window=5, min_count=5, workers=4)
    model.save(model_file_name)


input_file_name = 'wPred_word.txt'  # Input Words
model_file_name = 'wPred_model.txt'  # Output Model
train_model(input_file_name, model_file_name)

# Compute and evaluate similarity and probability
# NOTE(review): in gensim >= 4.0 these query methods reportedly live on
# ``model.wv`` (e.g. ``model.wv.similarity``) -- confirm before upgrading.
model = w2v.Word2Vec.load(model_file_name)
print(model.similarity('eval', '@'))
for k in model.similar_by_word('eval'):
    # Each item is indexed as k[0] (the word) and k[1] (its score).
    word = k[0]
    # Decode only byte strings: Python 3 ``str`` has no ``decode`` method.
    if isinstance(word, bytes):
        word = word.decode('utf-8')
    print(str(k[1]) + "\t# " + word)

可视化词向量

# -*- coding: utf-8 -*-
"""
# This is the visualization for the embedding word vectors:
# Input: the words for visualization, the words labels, and the word2vec model
# Output: the visualization of the given words
Created on Thu Jun 22 01:55:37 2017
@author: Jiabao Wang
"""
import numpy as np
from gensim.models.word2vec import Word2Vec
import matplotlib.pyplot as plt
#import sklearn.manifold.TSNE as tsne

modelpath = 'pub_data/wPred_model.txt'  # word-vector model
# NOTE(review): ``model[word]`` below is the 2017-era gensim lookup; in
# gensim >= 4.0 it is reportedly ``model.wv[word]`` -- confirm before upgrading.
model = Word2Vec.load(modelpath)

sentenceFilePath = 'pub_data/wordList.txt'  # list of the words to visualize
labelFilePath = 'pub_data/wordName.txt'  # display name for each visualized word

# Look up one embedding vector per line of the word list.
visualizeVecs = []
with open(sentenceFilePath, 'r') as f:
    for line in f:
        word = line.strip()
        vec = model[word]
        visualizeVecs.append(vec)

# Read the display labels; label i is drawn at the position of vector i.
visualizeWords = []
with open(labelFilePath, 'r') as f:
    for line in f:
        word = line.strip()
        visualizeWords.append(word)

visualizeVecs = np.array(visualizeVecs).astype(np.float64)

# Alternative projection with t-SNE (disabled):
#Y = tsne(visualizeVecs, 2, 200, 20.0);
#
# Plot.scatter(Y[:,0], Y[:,1], 20,labels);
#
# ChineseFont1 = FontProperties('SimHei')
#for i in xrange(len(visualizeWords)):
#    # if i<len(visualizeWords)/2:
#    #     color='green'
#    # else:
#    #     color='red'
#    color = 'red'
#    plt.text(Y[i, 0], Y[i, 1], visualizeWords[i],bbox=dict(facecolor=color, alpha=0.1))
#plt.xlim((np.min(Y[:, 0]), np.max(Y[:, 0])))
#plt.ylim((np.min(Y[:, 1]), np.max(Y[:, 1])))
#plt.show()

# Optional per-vector normalization (disabled):
# vis_norm = np.sqrt(np.sum(temp**2, axis=1, keepdims=True))
# temp = temp / vis_norm

# PCA by hand: center the vectors, take the SVD of their covariance matrix,
# and project onto the first two columns of U.
temp = visualizeVecs - np.mean(visualizeVecs, axis=0)
covariance = 1.0 / visualizeVecs.shape[0] * temp.T.dot(temp)
U, S, V = np.linalg.svd(covariance)
coord = temp.dot(U[:, 0:2])

for i, label in enumerate(visualizeWords):
    # Echo the index and the 2-D coordinates of every plotted word.
    print(i)
    print(coord[i, 0])
    print(coord[i, 1])
    color = 'red'
    plt.text(coord[i, 0], coord[i, 1], label,
             bbox=dict(facecolor=color, alpha=0.1),
             fontsize=12)  # fontproperties = ChineseFont1

# Pad the axis limits by 5 units on every side of the projected points.
plt.xlim((np.min(coord[:, 0]) - 5, np.max(coord[:, 0]) + 5))
plt.ylim((np.min(coord[:, 1]) - 5, np.max(coord[:, 1]) + 5))
plt.savefig('pub_data/distrubution.png', format='png', dpi=1000,
            bbox_inches='tight')
plt.show()

可视化效果如下:

这里写图片描述
图中间部分的词为SQL攻击的关键词,相对其他词更加聚集。

原创粉丝点击