
来源:互联网 发布:网络代理打鱼怎么判 编辑:程序博客网 时间:2024/05/21 06:23

Word2Vec也叫word embeddings,中文名“词向量”,作用就是将自然语言中的字词转为计算机可以理解的稠密向量(Dense Vector)。Word2Vec可以将One-Hot编码(One-Hot Encoding)得到的高维稀疏向量转化为低维度的连续值,也就是稠密向量,并且其中意思相近的词将被映射到向量空间中相近的位置。Word2Vec模型其实就是一个简化的神经网络。

# coding: utf-8
import collections

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# Train a tiny skip-gram word2vec model on a toy corpus with the TensorFlow
# graph API (tf.placeholder / tf.Session) and, when the embedding is 2-D,
# save a scatter plot of the learned word vectors to "word2vec.png".

batch_size = 20
embedding_size = 2
# Number of negative samples drawn by the NCE loss.
num_samples = 15

sentences = [
    "the quick brown fox jumped over the lazy dog",
    "I love cats and dogs",
    "we all love cats and dogs",
    "cats and dogs are great",
    "sung likes cats",
    "she loves dogs",
    "cats can be very independent",
    "cats are great companions when they want to be",
    "cats are playful",
    "cats are natural hunters",
    "It's raining cats and dogs",
    "dogs and cats love sung",
]

# Split the whole corpus into words on whitespace. The sentences are joined
# first, so context windows below may span sentence boundaries.
words = " ".join(sentences).split()
# (word, occurrences) pairs, most frequent first.
count = collections.Counter(words).most_common()
# The distinct words, in the same order as `count`.
keywords = [word for word, _ in count]
# Maps each word to its position in `keywords`.
word_to_index = {w: i for i, w in enumerate(keywords)}
# Vocabulary size.
voc_size = len(word_to_index)
# The corpus rewritten as a sequence of vocabulary indices.
words_index = [word_to_index[word] for word in words]

# CBOW pairs map the two neighbours of a word to the word itself, e.g. for
# "he is a handsome boy": [he, a] --> is, [is, handsome] --> a.
#   word indices: [3, 4, 5, 6, 7, 3, 1, 3, 2, 1, ...]
#   cbow_pairs:   [[[3, 5], 4], [[4, 6], 5], [[5, 7], 6], ...]
cbow_pairs = [
    [[words_index[i - 1], words_index[i + 1]], words_index[i]]
    for i in range(1, len(words_index) - 1)
]

# Skip-gram pairs invert that mapping: each target word is paired with each
# of its two neighbours, e.g. [[4, 3], [4, 5], [5, 4], [5, 6], [6, 5], ...].
skip_gram_pairs = []
for context, target in cbow_pairs:
    skip_gram_pairs.append([target, context[0]])
    skip_gram_pairs.append([target, context[1]])


def generate_batch(size):
    """Return `size` distinct, randomly chosen skip-gram pairs.

    Args:
        size: Number of pairs to draw (sampling is without replacement).

    Returns:
        A tuple ``(x_data, y_data)``. ``x_data`` is a list of target word
        indices; ``y_data`` is a list of one-element lists holding the
        matching context word index. For example, drawing the pairs
        [4, 3], [5, 4], [6, 7] yields ``x_data == [4, 5, 6]`` and
        ``y_data == [[3], [4], [7]]``.

    Raises:
        ValueError: If `size` exceeds the number of available pairs.
    """
    if size > len(skip_gram_pairs):
        raise ValueError(
            "size must not exceed the number of skip-gram pairs "
            "(%d), got %d" % (len(skip_gram_pairs), size)
        )
    # replace=False guarantees that no pair is picked twice in one batch.
    chosen = np.random.choice(len(skip_gram_pairs), size, replace=False)
    x_data = [skip_gram_pairs[i][0] for i in chosen]
    y_data = [[skip_gram_pairs[i][1]] for i in chosen]
    return x_data, y_data


train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

# Embedding layer: one row of `embedding_size` values per vocabulary word,
# initialised uniformly in [-1.0, 1.0) by tf.random_uniform.
embeddings = tf.Variable(
    tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# Noise-contrastive estimation (NCE) loss.
nce_weights = tf.Variable(
    tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
nce_biases = tf.Variable(tf.zeros([voc_size]))
# Every argument is passed by keyword: the original call placed a positional
# argument after a keyword argument, which is a syntax error.
loss = tf.reduce_mean(
    tf.nn.nce_loss(
        weights=nce_weights,
        biases=nce_biases,
        labels=train_labels,
        inputs=embed,
        num_sampled=num_samples,
        num_classes=voc_size,
    )
)
train_op = tf.train.AdamOptimizer(1e-1).minimize(loss)

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    for step in range(100):
        batch_inputs, batch_labels = generate_batch(batch_size)
        _, loss_val = sess.run(
            [train_op, loss],
            feed_dict={train_inputs: batch_inputs,
                       train_labels: batch_labels})
        if step % 10 == 0:
            print("Loss at ", step, loss_val)  # Report the loss
    # Must be evaluated while the session is still open.
    trained_embeddings = embeddings.eval()

# Show the word vectors on a 2-D plane (only possible for 2-D embeddings).
if trained_embeddings.shape[1] == 2:
    labels = keywords[:10]
    for i, label in enumerate(labels):
        x, y = trained_embeddings[i, :]
        plt.scatter(x, y)
        # xy is the position of the annotated point; xytext is the position
        # of the label text, given here as an offset from that point.
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.savefig("word2vec.png")




tf.nn.embedding_lookup(params, ids, partition_strategy='mod', name=None, validate_indices=True, max_norm=None)
# Demonstrates tf.nn.embedding_lookup: rows of `a` are gathered according to
# the indices in `idx1` (1-D) and `idx2` (2-D).
a = [[0.1, 0.2, 0.3], [1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3], [4.1, 4.2, 4.3]]
a = np.asarray(a)
# dtype is passed by keyword: the second positional parameter of tf.Variable
# is `trainable`, not the dtype.
idx1 = tf.Variable([0, 2, 3, 1], dtype=tf.int32)
idx2 = tf.Variable([[0, 2, 3, 1], [4, 0, 2, 2]], dtype=tf.int32)
out1 = tf.nn.embedding_lookup(a, idx1)
out2 = tf.nn.embedding_lookup(a, idx2)
init = tf.global_variables_initializer()

with tf.Session() as sess:
    # idx1 and idx2 are variables, so they must be initialised before any
    # tensor that depends on them is evaluated.
    sess.run(init)
    print(sess.run(out1))  # the looked-up values
    print(out1)            # the symbolic tensor (name, shape, dtype)
    print('==================')
    print(sess.run(out2))
    print(out2)

[[ 0.1 0.2 0.3]
[ 2.1 2.2 2.3]
[ 3.1 3.2 3.3]
[ 1.1 1.2 1.3]]

Tensor("embedding_lookup:0", shape=(4, 3), dtype=float64)
==================
[[[ 0.1 0.2 0.3]
[ 2.1 2.2 2.3]
[ 3.1 3.2 3.3]
[ 1.1 1.2 1.3]]

[[ 4.1 4.2 4.3]
[ 0.1 0.2 0.3]
[ 2.1 2.2 2.3]
[ 2.1 2.2 2.3]]]
Tensor("embedding_lookup_1:0", shape=(2, 4, 3), dtype=float64)
