学习笔记TF018:词向量、维基百科语料库训练词向量模型

来源：互联网发布：网络暴力乔任梁议论文编辑：程序博客网时间：2024/06/13 11:10

词向量嵌入需要高效率处理大规模文本语料库。word2vec。简单方式，词送入独热编码(one-hot encoding)学习系统，长度为词汇表长度的向量，词语对应位置元素为1,其余元素为0。向量维数很高，无法刻画不同词语的语义关联。共生关系(co-occurrence)表示单词，解决语义关联，遍历大规模文本语料库，统计每个单词一定距离范围内的周围词汇，用附近词汇规范化数量表示每个词语。类似语境中词语语义相似。用PCA或类似方法降维出现向量(occurrence vector)，得到更稠密表示。性能好，但需要追踪所有词汇共生矩阵，宽度、高度为词汇表长度。2013年,Tomas Mikolov等提出上下文计算词表示方法，《Efficient estimation of word representations in vector space》(arXiv preprint arXiv:1301.3781(2013))。skip-gram模型，从随机表示开始，依据当前词语预测上下文词语简单分类器，误差通过分类器权值和词表示传播，对两者调整减少预测误差。大规模语料库训练模型表示向量逼近压缩后共生向量。

数据集, 英文维基百科转储文件包含所有页面完整修订历史，当前页面版本100GB，https://dumps.wikimedia.org/backup-index.html。

下载转储文件，提取页面词语。统计词语出现次数，构建常见词汇表。用词汇表对提取页面编码。逐行读取文件，结果立即写入磁盘。在不同步骤间保存检查点，避免程序崩溃重来。

__iter__遍历词语索引列表页面。encode获取字符串词语词汇索引。decode依据词汇索引返回字符串词语。_read_pages从维基百科转储文件(压缩XML)提取单词，保存到页面文件，每个页面一行空格分隔的单词。bz2模块open函数读取文件。中间结果压缩处理。正则表达式捕捉任意连续字母序列或单独特殊字符。_build_vocabulary统计页面文件单词数，出现频率高词语写入文件。独热编码需要词汇表。词汇表索引编码。移除拼写错误、极不常见词语，词汇表只包含vocabulary_size - 1个最常见词语。所有不在词汇表的词语用<unk>标记，对应未出现单词的词向量。

动态形成训练样本，组织到大批数据，分类器不占大量内存。skip-gram模型预测当前词语的上下文词语。遍历文本，当前词语数据，周围词语目标，创建训练样本。上下文尺寸R，每个单词生成2R样本，当前词左右各R个词。语义上下文，距离近重要，尽量少创建远上下文词语训练样本，范围[1,D=10]随机选择词上下文尺寸。依据skip-gram模型形成训练对。Numpy数组生成数值流批数据。

初始，单词表示为随机向量。分类器根据中层表示预测上下文单词当前表示。传播误差，微调权值、输入单词表示。MomentumOptimizer 模型优化，智能不足，效率高。

分类器是模型核心。噪声对比估计损失(noise-contrastive estimation loss)性能优异。softmax分类器建模。tf.nn.nce_loss 新随机向量负样本(对比样本)，近似softmax分类器。

训练模型结束，最终词向量写入文件。维基百科语料库子集，普通CPU训练5小时，得到NumPy数组嵌入表示。完整语料库: https://dumps.wikimedia.org/enwiki/20160501/enwiki-20160501-pages-meta-current.xml.bz2 。AttrDict类等价Python dict，键可属性访问。

import bz2
import collections
import os
import re

from lxml import etree

from helpers import download


class Wikipedia:
    """Tokenised, vocabulary-encoded view of a Wikipedia XML dump.

    On construction the dump is processed in two cached steps, each written
    to a bz2-compressed text file inside ``cache_dir``:

    * ``pages.bz2`` -- one line per non-redirect page, holding its
      lower-cased tokens separated by single spaces.
    * ``vocabulary.bz2`` -- one word per line; line 0 is the unknown-word
      placeholder, followed by the most frequent tokens.

    A step is skipped when its file already exists, so an interrupted run
    resumes from the last completed step.

    Iterating over an instance yields each page as a list of vocabulary
    indices.
    """

    # A token is a run of ASCII letters or one of a few punctuation marks.
    TOKEN_REGEX = re.compile(r'[A-Za-z]+|[!?.:,()]')

    # Placeholder stored at vocabulary index 0 for out-of-vocabulary words.
    UNKNOWN_TOKEN = '<unk>'

    def __init__(self, url, cache_dir, vocabulary_size=10000):
        """Prepare the cached page and vocabulary files and load the vocabulary.

        Args:
            url: Location of the bz2-compressed XML dump; handed to
                ``helpers.download``.
            cache_dir: Directory for the intermediate files (``~`` is
                expanded).
            vocabulary_size: Total number of vocabulary entries, including
                the unknown-word placeholder.
        """
        self._cache_dir = os.path.expanduser(cache_dir)
        self._pages_path = os.path.join(self._cache_dir, 'pages.bz2')
        self._vocabulary_path = os.path.join(
            self._cache_dir, 'vocabulary.bz2')
        if not os.path.isfile(self._pages_path):
            print('Read pages')
            self._read_pages(url)
        if not os.path.isfile(self._vocabulary_path):
            print('Build vocabulary')
            self._build_vocabulary(vocabulary_size)
        with bz2.open(self._vocabulary_path, 'rt') as vocabulary:
            print('Read vocabulary')
            self._vocabulary = [x.strip() for x in vocabulary]
        self._indices = {x: i for i, x in enumerate(self._vocabulary)}

    def __iter__(self):
        """Yield every cached page as a list of vocabulary indices."""
        with bz2.open(self._pages_path, 'rt') as pages:
            for page in pages:
                yield [self.encode(word) for word in page.strip().split()]

    @property
    def vocabulary_size(self):
        """Number of entries actually loaded from the vocabulary file."""
        return len(self._vocabulary)

    def encode(self, word):
        """Return the index of ``word``; unknown words map to index 0."""
        return self._indices.get(word, 0)

    def decode(self, index):
        """Return the word stored at vocabulary position ``index``."""
        return self._vocabulary[index]

    def _read_pages(self, url):
        """Extract the tokens of every non-redirect page into the pages file.

        NOTE(review): ``download`` is assumed to fetch ``url`` into the
        cache directory and return the local file path -- confirm against
        ``helpers``.
        """
        wikipedia_path = download(url, self._cache_dir)
        # Write to a temporary name and rename at the end, so that a crash
        # half-way through never leaves a truncated file that the next run
        # would mistake for a finished checkpoint.
        partial_path = self._pages_path + '.part'
        with bz2.open(wikipedia_path) as wikipedia, \
                bz2.open(partial_path, 'wt') as pages:
            for _, element in etree.iterparse(wikipedia, tag='{*}page'):
                if element.find('./{*}redirect') is None:
                    # findtext returns None when a page has no text node.
                    page = element.findtext('./{*}revision/{*}text') or ''
                    words = self._tokenize(page)
                    pages.write(' '.join(words) + '\n')
                # Release the parsed subtree (redirects included) to keep
                # memory bounded while streaming the dump.
                element.clear()
        os.replace(partial_path, self._pages_path)

    def _build_vocabulary(self, vocabulary_size):
        """Count tokens in the pages file and store the most common ones.

        The file receives ``UNKNOWN_TOKEN`` first and then the
        ``vocabulary_size - 1`` most frequent tokens, one per line.
        """
        counter = collections.Counter()
        with bz2.open(self._pages_path, 'rt') as pages:
            for page in pages:
                counter.update(page.strip().split())
        # most_common yields (word, count) pairs; the placeholder is a plain
        # string, so it is prepended only after the pairs are unpacked.
        common = [self.UNKNOWN_TOKEN]
        common.extend(
            word for word, _ in counter.most_common(vocabulary_size - 1))
        partial_path = self._vocabulary_path + '.part'
        with bz2.open(partial_path, 'wt') as vocabulary:
            vocabulary.writelines(word + '\n' for word in common)
        os.replace(partial_path, self._vocabulary_path)

    @classmethod
    def _tokenize(cls, page):
        """Split ``page`` into lower-cased tokens matched by TOKEN_REGEX."""
        return [token.lower() for token in cls.TOKEN_REGEX.findall(page)]

import tensorflow as tf
import numpy as np

from helpers import lazy_property


class EmbeddingModel:
    """Skip-gram word-embedding model trained with an NCE loss.

    NOTE(review): ``lazy_property`` comes from ``helpers``; it is assumed to
    build each graph node on first access and cache it -- confirm there.
    """

    def __init__(self, data, target, params):
        """Store the inputs and build the graph.

        Args:
            data: Tensor of current-word indices fed to the embedding lookup.
            target: Tensor of context-word indices used as NCE labels.
            params: Object exposing ``vocabulary_size``, ``embedding_size``,
                ``contrastive_examples``, ``learning_rate`` and ``momentum``
                as attributes.
        """
        self.data = data
        self.target = target
        self.params = params
        # Touch each property once so its graph nodes are created here.
        self.embeddings
        self.cost
        self.optimize

    @lazy_property
    def embeddings(self):
        """Embedding matrix, initialised uniformly in [-1, 1)."""
        initial = tf.random_uniform(
            [self.params.vocabulary_size, self.params.embedding_size],
            -1.0, 1.0)
        return tf.Variable(initial)

    @lazy_property
    def optimize(self):
        """Training op: one momentum-optimizer step minimising ``cost``."""
        optimizer = tf.train.MomentumOptimizer(
            self.params.learning_rate, self.params.momentum)
        return optimizer.minimize(self.cost)

    @lazy_property
    def cost(self):
        """Mean noise-contrastive estimation loss over the batch."""
        embedded = tf.nn.embedding_lookup(self.embeddings, self.data)
        weight = tf.Variable(tf.truncated_normal(
            [self.params.vocabulary_size, self.params.embedding_size],
            stddev=1.0 / self.params.embedding_size ** 0.5))
        bias = tf.Variable(tf.zeros([self.params.vocabulary_size]))
        # nce_loss is given labels with an explicit second dimension.
        target = tf.expand_dims(self.target, 1)
        # Keyword arguments: the positional order of ``inputs`` and
        # ``labels`` differs between TensorFlow releases, so passing them
        # by name keeps the call correct on either side of that change.
        return tf.reduce_mean(tf.nn.nce_loss(
            weights=weight,
            biases=bias,
            labels=target,
            inputs=embedded,
            num_sampled=self.params.contrastive_examples,
            num_classes=self.params.vocabulary_size))

import collections

import tensorflow as tf
import numpy as np

from batched import batched
from EmbeddingModel import EmbeddingModel
from skipgrams import skipgrams
from Wikipedia import Wikipedia
from helpers import AttrDict

# Training script: builds skip-gram batches from a Wikipedia dump subset,
# trains the embedding model and saves the learned embedding matrix.

WIKI_DOWNLOAD_DIR = './wikipedia'

params = AttrDict(
    vocabulary_size=10000,
    max_context=10,
    embedding_size=200,
    contrastive_examples=100,
    learning_rate=0.5,
    momentum=0.5,
    batch_size=1000,
)

data = tf.placeholder(tf.int32, [None])
target = tf.placeholder(tf.int32, [None])
model = EmbeddingModel(data, target, params)

corpus = Wikipedia(
    'https://dumps.wikimedia.org/enwiki/20160501/'
    'enwiki-20160501-pages-meta-current1.xml-p000000010p000030303.bz2',
    WIKI_DOWNLOAD_DIR,
    params.vocabulary_size)
examples = skipgrams(corpus, params.max_context)
batches = batched(examples, params.batch_size)

sess = tf.Session()
sess.run(tf.initialize_all_variables())

# Running window over the most recent 100 batch costs, used for logging.
average = collections.deque(maxlen=100)
for index, batch in enumerate(batches):
    feed_dict = {data: batch[0], target: batch[1]}
    cost, _ = sess.run([model.cost, model.optimize], feed_dict)
    average.append(cost)
    print('{}: {:5.1f}'.format(index + 1, sum(average) / len(average)))
    # Stop once the batch index passes 100000, even if the corpus has more.
    if index > 100000:
        break

embeddings = sess.run(model.embeddings)
np.save(WIKI_DOWNLOAD_DIR + '/embeddings.npy', embeddings)

参考资料：
《面向机器智能的TensorFlow实践》

欢迎加我微信交流：qingxingfengzi
我的微信公众号:qingxingfengzigz
我老婆张幸清的微信公众号：qingqingfeifangz

阅读全文

0 0