词向量源码解析：（4.5）hyperwords源码解析之representations

来源：互联网 | 发布：plc仿真软件 | 编辑：程序博客网 | 时间：2024/06/07 18:19

representations包的主要任务是进一步处理和包装之前模型生成的词向量。这个包中包括了四个文件，下面依次介绍这四个文件。首先介绍matrix_serializer文件。这个文件的主要功能是负责内存和磁盘之间的数据交换。比如想把内存中的词典或共现矩阵写入磁盘，或是把它们从磁盘读入内存，就调用这个文件里面的函数。

def save_matrix(f, m):
    """
    Write the co-occurrence matrix to disk as a compressed ``.npz`` archive.

    The archive holds four arrays taken from ``m``: ``data``, ``indices``,
    ``indptr`` and ``shape``.

    :param f: destination handed straight to ``np.savez_compressed``.
    :param m: matrix object exposing ``data``, ``indices``, ``indptr`` and
        ``shape`` attributes.
    """
    np.savez_compressed(f, data=m.data, indices=m.indices, indptr=m.indptr, shape=m.shape)

def load_matrix(f):
    """
    Read the co-occurrence matrix back from a ``.npz`` archive on disk.

    :param f: path string; ``'.npz'`` is appended when it is missing.
    :return: a ``csr_matrix`` rebuilt from the ``data``, ``indices``,
        ``indptr`` and ``shape`` arrays stored in the archive.
    """
    if not f.endswith('.npz'):
        f += '.npz'
    # The object returned by np.load keeps the archive open; the with-block
    # closes it as soon as the four arrays have been read.
    with np.load(f) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # Hand scipy a plain tuple of ints rather than an ndarray.
        shape = tuple(int(dim) for dim in loader['shape'])
    return csr_matrix((data, indices, indptr), shape=shape)

def save_vocabulary(path, vocab):
    """
    Write the vocabulary to disk, one entry per line.

    :param path: file to create or overwrite.
    :param vocab: iterable of words.
    """
    with open(path, 'w') as f:
        for w in vocab:
            # f.write instead of the Python 2-only ``print >>f, w`` so the
            # function behaves the same on Python 2 and Python 3.
            f.write('%s\n' % (w,))

def load_vocabulary(path):
    """
    Read a vocabulary file (one word per line) from disk into memory.

    Lines that are empty after stripping whitespace are skipped; a line read
    from a file always carries its newline, so the length has to be checked
    after stripping for the filter to have any effect.

    :param path: vocabulary file to read.
    :return: ``(word_to_index, words)`` where ``words`` is the list of words
        in file order and ``word_to_index`` maps each word to its position
        in that list.
    """
    with open(path) as f:
        stripped = (line.strip() for line in f)
        vocab = [w for w in stripped if w]
    return {w: i for i, w in enumerate(vocab)}, vocab

def save_count_vocabulary(path, vocab):
    """
    Write a vocabulary with counts to disk, one ``word count`` pair per line.

    Vocabularies come in two flavours: word-only, and word together with its
    frequency. This function writes the second kind.

    :param path: file to create or overwrite.
    :param vocab: iterable of ``(word, count)`` pairs.
    """
    with open(path, 'w') as f:
        for w, c in vocab:
            # f.write instead of the Python 2-only ``print >>f, w, c``; the
            # single-space separator matches what that statement emitted.
            f.write('%s %s\n' % (w, c))

def load_count_vocabulary(path):
    """
    Read a ``word count`` vocabulary file into a dict.

    Blank lines are skipped; without that, a blank line would split into an
    empty list and make ``dict()`` raise ``ValueError``.

    :param path: vocabulary file to read.
    :return: dict mapping each word to its count. The count is kept as the
        string read from the file; no numeric conversion is done here.
    """
    with open(path) as f:
        # split() with no argument already ignores surrounding whitespace,
        # so a blank line yields an empty (falsy) list and is dropped.
        fields = (line.split() for line in f)
        # noinspection PyTypeChecker
        vocab = dict(pair for pair in fields if pair)
    return vocab

下面看一下representation_factory文件的内容，这个文件和embedding文件以及explicit文件一起，对生成的词向量进行处理和包装。在我们需要用到词向量的时候，我们会调用这个文件的内容，而不是直接去从文件中把词向量读取出来。

def create_representation(args):
    """
    Build the representation object selected by the parsed arguments.

    ``args`` is a mapping that carries the path of the stored vectors, the
    type of representation (which model produced the vectors) and the
    parameters used to post-process them. The keys read are
    ``'<representation>'``, ``'<representation_path>'``, ``'--neg'``,
    ``'--w+c'`` and ``'--eig'``.

    :return: a ``PositiveExplicit`` for ``'PPMI'``; an ``SVDEmbedding`` (or an
        ``EnsembleEmbedding`` of two of them when ``--w+c`` is set) for
        ``'SVD'``; otherwise an ``Embedding`` (or an ``EnsembleEmbedding`` of
        the ``.words`` and ``.contexts`` embeddings when ``--w+c`` is set).
    :raises Exception: when ``--w+c`` is requested together with ``'PPMI'``.
    """
    rep_type = args['<representation>']  # representation type
    path = args['<representation_path>']  # where the vectors are stored
    neg = int(args['--neg'])
    w_c = args['--w+c']
    eig = float(args['--eig'])

    # PPMI is the explicit (sparse) representation. Each branch returns a
    # wrapper class that bundles the vectors with their vocabulary.
    if rep_type == 'PPMI':
        if w_c:
            raise Exception('w+c is not implemented for PPMI.')
        return PositiveExplicit(path, True, neg)

    if rep_type == 'SVD':
        if w_c:
            # Word and context embeddings are loaded un-normalized and handed
            # to the ensemble (EnsembleEmbedding is defined outside this
            # excerpt; its third argument is presumably a normalize flag).
            return EnsembleEmbedding(SVDEmbedding(path, False, eig, False), SVDEmbedding(path, False, eig, True), True)
        return SVDEmbedding(path, True, eig)

    # Any other type is treated as a dense embedding stored under
    # path + '.words' / path + '.contexts'.
    if w_c:
        return EnsembleEmbedding(Embedding(path + '.words', False), Embedding(path + '.contexts', False), True)
    return Embedding(path + '.words', True)

下面我们看看对于词向量包装的类是什么样子的。先看看Explicit类，这个类对PPMI生成的词向量进行了包装。

class Explicit:
    """
    Base class for explicit representations. Assumes that the serialized input is e^PMI.
    """

    def __init__(self, path, normalize=True):
        """
        Load the word and context vocabularies and the sparse matrix stored
        under ``path``, take the log of the stored entries and optionally
        normalize the rows.

        :param path: common prefix of the ``.words.vocab``,
            ``.contexts.vocab`` and matrix files.
        :param normalize: when true, rows are scaled to unit L2 norm.
        """
        self.wi, self.iw = load_vocabulary(path + '.words.vocab')
        self.ci, self.ic = load_vocabulary(path + '.contexts.vocab')
        self.m = load_matrix(path)  # sparse matrix in CSR format
        # Part of the matrix processing lives here: the file holds e^PMI, so
        # the log of the stored entries is what gives the PMI matrix.
        self.m.data = np.log(self.m.data)
        self.normal = normalize
        if normalize:
            self.normalize()

    def normalize(self):
        """
        Scale every row of the sparse matrix to unit L2 norm.

        Rows whose norm is zero are left as zeros instead of being multiplied
        by ``inf``, which would turn explicitly stored zeros into NaN.
        """
        squared = self.m.copy()
        squared.data **= 2
        row_norms = np.sqrt(np.array(squared.sum(axis=1))[:, 0])
        # Reciprocal only where the norm is non-zero; zero rows get scale 0.
        scale = np.zeros_like(row_norms)
        nonzero = row_norms > 0
        scale[nonzero] = 1.0 / row_norms[nonzero]
        # Left-multiplying by a diagonal matrix scales each row.
        normalizer = dok_matrix((len(scale), len(scale)))
        normalizer.setdiag(scale)
        self.m = normalizer.tocsr().dot(self.m)

    def represent(self, w):
        """
        Look up the vector of word ``w``.

        :return: the matrix row for ``w``, or an empty ``1 x len(self.ic)``
            sparse row when ``w`` is not in the word vocabulary.
        """
        if w in self.wi:
            return self.m[self.wi[w], :]
        else:
            return csr_matrix((1, len(self.ic)))

    def similarity_first_order(self, w, c):
        """
        Return the matrix cell for word ``w`` and context ``c``.

        :raises KeyError: when ``w`` or ``c`` is not in its vocabulary.
        """
        return self.m[self.wi[w], self.ci[c]]

    def similarity(self, w1, w2):
        """
        Dot product of the vectors of ``w1`` and ``w2``; this is the entry
        point for word-similarity queries.

        Assumes the vectors have been normalized.
        """
        return self.represent(w1).dot(self.represent(w2).T)[0, 0]

    def closest_contexts(self, w, n=10):
        """
        Return the ``n`` highest-scoring ``(score, context)`` pairs among the
        stored entries of the vector of ``w``.

        Assumes the vectors have been normalized.
        """
        scores = self.represent(w)
        return heapq.nlargest(n, zip(scores.data, [self.ic[i] for i in scores.indices]))

    def closest(self, w, n=10):
        """
        Return the ``n`` highest-scoring ``(score, word)`` pairs, scoring
        every row by its dot product with the vector of ``w``.

        Assumes the vectors have been normalized.
        """
        scores = self.m.dot(self.represent(w).T).T.tocsr()
        return heapq.nlargest(n, zip(scores.data, [self.iw[i] for i in scores.indices]))

PositiveExplicit类继承了Explicit类，把PMI矩阵变成PPMI矩阵。其实也可以算作变成SPPMI，S是指shifted，我们看到代码里还会减掉log(neg)

class PositiveExplicit(Explicit):
    """
    Positive PMI (PPMI) with negative sampling (neg).
    Negative samples shift the PMI matrix before truncation.
    """

    def __init__(self, path, normalize=True, neg=1):
        """
        Load the PMI matrix, subtract ``log(neg)`` from every stored entry,
        clip negative values to zero and optionally normalize the rows.

        :param path: prefix handed to ``Explicit.__init__``.
        :param normalize: when true, rows are normalized after truncation.
        :param neg: shift parameter; ``log(neg)`` is subtracted from the
            stored entries (``neg=1`` means no shift).
        """
        # Load without normalizing: the shift and truncation have to be
        # applied to the raw PMI values first.
        Explicit.__init__(self, path, False)
        self.m.data -= np.log(neg)
        self.m.data[self.m.data < 0] = 0
        self.m.eliminate_zeros()
        # The base constructor was told False, so it recorded normal=False;
        # store the caller's actual choice.
        self.normal = normalize
        if normalize:
            self.normalize()

然后看看Embedding，这个类处理稠密的词向量，包括word2vec和SVD生成的词向量。

class Embedding:
    """
    Base class for all embeddings. SGNS can be directly instantiated with it.

    Same logic as the explicit (sparse) representation, except that the dense
    vectors are kept in a numpy array.
    """

    def __init__(self, path, normalize=True):
        """
        Load the array stored at ``path + '.npy'`` and the vocabulary stored
        at ``path + '.vocab'``.

        :param path: common prefix of the two files.
        :param normalize: when true, rows are scaled to unit L2 norm.
        """
        self.m = np.load(path + '.npy')
        if normalize:
            self.normalize()
        self.dim = self.m.shape[1]
        self.wi, self.iw = load_vocabulary(path + '.vocab')

    def normalize(self):
        """
        Scale every row of ``self.m`` to unit L2 norm.

        All-zero rows are left as zeros instead of being divided by zero,
        which would fill them with NaN.
        """
        norm = np.sqrt(np.sum(self.m * self.m, axis=1))
        # Dividing a zero row by 1 keeps it zero and avoids 0/0.
        norm[norm == 0] = 1.0
        self.m = self.m / norm[:, np.newaxis]

    def represent(self, w):
        """
        Look up the vector of word ``w``.

        :return: the row of ``self.m`` for ``w``, or a zero vector of length
            ``self.dim`` when ``w`` is not in the vocabulary.
        """
        if w in self.wi:
            return self.m[self.wi[w], :]
        else:
            return np.zeros(self.dim)

    def similarity(self, w1, w2):
        """
        Dot product of the vectors of ``w1`` and ``w2``.

        Assumes the vectors have been normalized.
        """
        return self.represent(w1).dot(self.represent(w2))

    def closest(self, w, n=10):
        """
        Return the ``n`` highest-scoring ``(score, word)`` pairs, scoring
        every row by its dot product with the vector of ``w``.

        Assumes the vectors have been normalized.
        """
        scores = self.m.dot(self.represent(w))
        return heapq.nlargest(n, zip(scores, self.iw))

读入SVD产生的词向量和读入word2vecf产生的词向量的方式有点不一样，所以单独写了一个SVDEmbedding类，它继承自Embedding。

class SVDEmbedding(Embedding):
    """
    SVD embeddings.
    Enables controlling the weighted exponent of the eigenvalue matrix (eig).
    Context embeddings can be created with "transpose".

    SVD has several hyperparameters of its own; they surface as arguments of
    the constructor.
    """

    def __init__(self, path, normalize=True, eig=0.0, transpose=False):
        """
        Load the SVD factors stored under ``path`` and build ``self.m``.

        The base-class constructor is not called; ``self.m``, ``self.dim``,
        ``self.wi`` and ``self.iw`` are all set here.

        :param path: common prefix of the ``.ut.npy`` / ``.vt.npy``,
            ``.s.npy`` and vocabulary files.
        :param normalize: when true, rows are normalized at the end.
        :param eig: exponent applied to the values loaded from ``.s.npy``
            before they scale the vectors.
        :param transpose: when true, load ``.vt.npy`` with the contexts
            vocabulary instead of ``.ut.npy`` with the words vocabulary.
        """
        if transpose:
            ut = np.load(path + '.vt.npy')
            self.wi, self.iw = load_vocabulary(path + '.contexts.vocab')
        else:
            ut = np.load(path + '.ut.npy')
            self.wi, self.iw = load_vocabulary(path + '.words.vocab')
        s = np.load(path + '.s.npy')

        # eig == 0 and eig == 1 are special-cased so the power is not
        # computed when the exponent makes it trivial.
        if eig == 0.0:
            self.m = ut.T
        elif eig == 1.0:
            self.m = s * ut.T
        else:
            self.m = np.power(s, eig) * ut.T

        self.dim = self.m.shape[1]

        if normalize:
            self.normalize()

还有两个类也继承Embedding，这里就不详细介绍了，后面用的也不多，大家有兴趣可以自己去看看。

阅读全文

0 0