词向量源码解析：（2.2）word2vec源码解析之word2phrase

来源：互联网发布：淘宝店铺开店资料编辑：程序博客网时间：2024/06/05 12:50

我们首先过一遍源码word2phrase.c。这个源码中的很多内容和word2vec.c中的源码是共用的，可以方便我们之后理解word2vec.c。这个代码的思想很简单，就是希望找出语料中的短语。比如New York这个短语，如果我们把它当做两个单词分开处理显然不合适。这里从语料中找短语的基本思想和互信息的思想很相似，需要考虑两方面的信息。第一个方面是两个单词（比如New和York）在这个语料中共同出现的次数。如果两个单词基本就没有在语料中以New York这样的形式出现过，那么显然New York就不是常用短语了。另外还需要考虑两个单词在语料中单独出现的次数。像of the在语料中肯定大量地出现，但是它们并不是短语：这两个单词本身单独就会在语料中大量出现，这也导致它们碰巧一块出现的次数也会非常多。

单词用结构体表示，记录了单词以及其在语料中出现的次数

// One vocabulary entry: a token (single word or "a_b" bigram) and its count.
struct vocab_word {
long long cn; // number of occurrences counted so far
char *word; // NUL-terminated token text, allocated by AddWordToVocab
};

一些全局变量，有的后面会说它们的含义

// Input corpus path and output path; the program rewrites the input corpus
// with detected phrases joined by '_'.
char train_file[MAX_STRING];
char output_file[MAX_STRING];
// Vocabulary table: every token together with its occurrence count.
struct vocab_word *vocab;
// Verbosity of progress output (values > 0 and > 1 enable more printing).
int debug_mode = 2;
// Tokens seen fewer than min_count times are discarded / never merged.
int min_count = 5;
// Open-addressing hash table: slot -> index into vocab, or -1 when empty.
int *vocab_hash;
// Current pruning level used by ReduceVocab (entries with cn <= min_reduce go).
int min_reduce = 1;
// Allocated capacity of vocab, in entries.
long long vocab_max_size = 10000;
// Number of entries currently stored in vocab.
long long vocab_size = 0;
// Number of tokens read from the training file so far.
long long train_words = 0;
// A bigram is merged into a phrase only when its score exceeds this value.
real threshold = 100;

首先从main函数看起

vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); // allocate the initial vocabulary array

vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); // hand-rolled hash table: each slot holds the index of a word inside vocab

TrainModel(); // build the vocabulary, then rewrite the corpus with phrases

TrainModel函数中的代码：

// Excerpt: the opening statements of TrainModel.
// pa / pb / pab hold the counts of the previous word, the current word and
// their bigram; li is the vocabulary index of the previous word.
long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
real score;
FILE *fo, *fin;
printf("Starting training using file %s\n", train_file);
LearnVocabFromTrainFile(); // first pass over the corpus: build the vocabulary
fin = fopen(train_file, "rb"); // reopen the corpus for the second pass
fo = fopen(output_file, "wb");
word[0] = 0; // no current word yet

上面这些代码声明了一些变量，打开了输入和输出文件。最重要的代码就是LearnVocabFromTrainFile();它的功能是建立字典。一般做自然语言问题第一件事都是要过一遍语料建立字典。这部分代码和word2vec.c中的对应代码基本一致。下面仔细讲解一下如何建立字典。

字典的数据结构有单词结构体数组vocab以及int数组vocab_hash。C语言中没有python中的字典数据结构，所以这里是自己实现的。当我们要找一个单词，想知道字典中有没有这个单词以及它的id的时候，首先计算这个单词的哈希值。比如是15，vocab_hash[15]中就记录了这个单词在vocab中的位置。比如vocab_hash[15]=100，那么我们就直接知道了这个单词的id是100，vocab[100]中就存了这个单词以及它的频数。如果vocab_hash[15]=-1，那么这个单词不在字典中，我们会进行插入等操作。还有可能几个单词有一样的哈希值，后面介绍代码的时候会介绍如何处理这种情况。

ReadWord函数从文件中读取一个单词。一个一个字符的读，遇到空格，\t，\n，或是文件结束就代表单词已经读完了，word[a] = 0;C语言中字符串最后一位为0，表示字符串结束

// Reads one whitespace-delimited token from fin into word.
//
// - Space, tab and newline end a token; carriage returns (13) are skipped.
// - A newline met before any character was collected yields the token "</s>".
//   A newline that ends a token is pushed back, so the next call returns "</s>".
// - Tokens longer than MAX_STRING - 2 characters are truncated.
// - At end of input the characters collected so far (possibly none) are
//   returned; callers detect the end with feof(fin).
//
// word must have room for at least MAX_STRING bytes.
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  // Stop on EOF itself: the original `while (!feof(fin))` loop stored the EOF
  // marker returned by fgetc as the last character of the word.
  while ((ch = fgetc(fin)) != EOF) {
    if (ch == 13) continue;
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {
        // Keep the line break for the next call so it is reported as "</s>".
        if (ch == '\n') ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, (char *)"</s>");
        return;
      }
      continue;  // leading blank or tab: nothing collected yet
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1) a--;  // Truncate too long words
  }
  word[a] = 0;
}

GetWordHash函数输入一个单词，计算这个单词的哈希值并且返回

// Returns the hash-table slot for word, in the range [0, vocab_hash_size).
// The hash is a polynomial (base 257, seed 1) over the characters of word,
// reduced modulo vocab_hash_size.
int GetWordHash(char *word) {
  unsigned long long hash = 1;
  const char *p;
  // Single pass over the string; the original called strlen() on every
  // iteration of the loop condition.
  for (p = word; *p != 0; p++) hash = hash * 257 + *p;
  hash = hash % vocab_hash_size;
  return hash;
}

SearchVocab函数输入一个单词如果这单词在字典中就返回这个单词的id，如果不在就返回-1。由于可能有多个单词的哈希值是一样的。比如a,b,c三个单词的哈希值都是15，那么一种可能性是vocab_hash[15]存的是a的id，vocab_hash[16]存的是b的id，vocab_hash[17]存的是c的id。后面建立字典的过程中就能看得很清楚了。所以要找到word的真正的id需要去字典中对比一下字符串strcmp(word, vocab[vocab_hash[hash]].word)

// Looks word up in the vocabulary.
// Returns its index in vocab, or -1 when the word is not present.
// Collisions are resolved by linear probing: starting at the word's hash slot,
// consecutive slots are inspected until the word or an empty slot is found.
int SearchVocab(char *word) {
  unsigned int slot = GetWordHash(word);
  int id;
  for (;;) {
    id = vocab_hash[slot];
    if (id == -1) break;  // empty slot: the word was never inserted
    if (strcmp(word, vocab[id].word) == 0) return id;
    slot = (slot + 1) % vocab_hash_size;  // occupied by another word, probe on
  }
  return -1;
}

ReadWordIndex函数的输入就是文件指针，从文件中读一个单词返回其id，把之前几个函数串了起来。

// Reads the next token from fin and returns its vocabulary index.
// Returns -1 when the end of the file was reached or the token is unknown.
int ReadWordIndex(FILE *fin) {
  char token[MAX_STRING];
  ReadWord(token, fin);
  return feof(fin) ? -1 : SearchVocab(token);
}

之前说的是从字典中查找单词，返回单词的id，下面的函数AddWordToVocab是要往字典中插入单词。插入的时候有两个地方要考虑，一个是字典大小不够了需要realloc重新分配内存，另一个是解决哈希冲突。

// Appends word to the vocabulary with a count of 0 (the caller sets the real
// count) and registers it in the hash table.
// Returns the index (id) of the new entry in vocab.
// At most MAX_STRING - 1 characters of word are stored.
// Terminates the program if memory cannot be allocated.
int AddWordToVocab(char *word) {
  unsigned int hash, length = strlen(word) + 1;
  struct vocab_word *grown;
  if (length > MAX_STRING) length = MAX_STRING;  // over-long words are cut
  // Entries 0 .. vocab_size-1 are in use; the new word goes to vocab[vocab_size].
  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
  if (vocab[vocab_size].word == NULL) {
    printf("ERROR: out of memory while adding a word to the vocabulary!\n");
    exit(1);
  }
  // Copy only what fits. The original strcpy copied the whole word even after
  // length had been capped, overflowing the buffer for over-long words.
  // calloc zero-filled the buffer, so the terminator is already in place.
  memcpy(vocab[vocab_size].word, word, length - 1);
  vocab[vocab_size].cn = 0;
  vocab_size++;
  // Reallocate memory if needed
  if (vocab_size + 2 >= vocab_max_size) {
    // Grow through a temporary so the table is not lost if realloc fails.
    grown = (struct vocab_word *)realloc(vocab, (vocab_max_size + 10000) * sizeof(struct vocab_word));
    if (grown == NULL) {
      printf("ERROR: out of memory while growing the vocabulary!\n");
      exit(1);
    }
    vocab = grown;
    vocab_max_size += 10000;
  }
  // Register the new id in the hash table, probing linearly past used slots.
  hash = GetWordHash(word);
  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
  vocab_hash[hash] = vocab_size - 1;
  return vocab_size - 1;  // id of the word that was just inserted
}

VocabCompare函数是传给qsort的比较函数，用来按频数从大到小对单词进行排序

int VocabCompare(const void *a, const void *b) {
return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}

SortVocab对vocab进行排序，id小的单词频数大，即vocab[i].cn>=vocab[i+1].cn（vocab[0]固定为</s>，不参与排序）

// Sorts the vocabulary by descending count, drops every entry seen fewer than
// min_count times, and rebuilds the hash table for the new indices.
// Entry 0 ("</s>") is neither sorted nor dropped.
// On return vocab_size is the number of surviving entries (including entry 0).
// vocab_max_size is not updated, exactly as before, so words must not be added
// after the table has been shrunk here.
void SortVocab() {
  long long a, kept = 0, size = vocab_size;
  unsigned int hash;
  struct vocab_word *shrunk;
  // Sort the vocabulary and keep </s> at the first position
  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  // Iterate over the size captured before pruning. The original loop compared
  // against vocab_size while decrementing it, so it stopped half-way through
  // the low-frequency tail, and it freed vocab[vocab_size] instead of the
  // entry actually being examined.
  for (a = 0; a < size; a++) {
    // Words occuring less than min_count times will be discarded from the vocab
    if ((a != 0) && (vocab[a].cn < min_count)) {
      free(vocab[a].word);
      continue;
    }
    // Compact survivors to the front so indices 0 .. kept-1 are always valid.
    vocab[kept] = vocab[a];
    // Hash will be re-computed, as after the sorting it is not actual
    hash = GetWordHash(vocab[kept].word);
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = kept;
    kept++;
  }
  vocab_size = kept;
  // Give back the unused tail; if shrinking fails the old block is still valid.
  shrunk = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
  if (shrunk != NULL) vocab = shrunk;
}

ReduceVocab会缩减单词的数量。在构建词典的过程中很容易造成内存不足。word2vec使用的机制是当字典中单词个数超过一定的阈值，就把当前已有的低频词扔掉。每一次触发阈值，min_reduce+=1。也就是第一次触发阈值时过滤掉频数不超过1的单词，第二次过滤掉频数不超过2的单词，以此类推。这种机制属于贪心算法，可能会错误地漏掉一些单词，但是一般在实际中没有什么大问题。

// Prunes the vocabulary in place: every entry whose count is not greater than
// min_reduce is freed, the survivors are compacted to the front of vocab, the
// hash table is rebuilt, and min_reduce is raised by one for the next call.
void ReduceVocab() {
  int src, dst = 0;
  unsigned int slot;
  for (src = 0; src < vocab_size; src++) {
    if (vocab[src].cn <= min_reduce) {
      free(vocab[src].word);  // too rare at the current pruning level
      continue;
    }
    vocab[dst].cn = vocab[src].cn;
    vocab[dst].word = vocab[src].word;
    dst++;
  }
  vocab_size = dst;
  // Indices changed, so the hash table has to be rebuilt from scratch.
  for (src = 0; src < vocab_hash_size; src++) vocab_hash[src] = -1;
  for (src = 0; src < vocab_size; src++) {
    // Hash will be re-computed, as it is not actual
    slot = GetWordHash(vocab[src].word);
    while (vocab_hash[slot] != -1) slot = (slot + 1) % vocab_hash_size;
    vocab_hash[slot] = src;
  }
  fflush(stdout);
  min_reduce++;
}

LearnVocabFromTrainFile函数扫一遍语料生成字典，字典既包括单个单词，也包括相邻两个单词组成的bigram，但是bigram是不是真正的phrase还需要后面去判断。最后字典是排好序的。注意字典中的候选短语只有bigram。实际中phrase可能包含三个或者更多的单词，所以需要我们多次运行这个程序才能得到包含多个单词的phrase

// First pass over train_file: counts every word and every pair of adjacent
// words on the same line (stored as "prev_cur") into the vocabulary, pruning
// with ReduceVocab when the hash table gets too full, then sorts the result.
// Exits the program if the training file cannot be opened.
void LearnVocabFromTrainFile() {
  char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
  FILE *fin;
  long long a, i, start = 1;  // start != 0: next word is the first of a line
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  last_word[0] = 0;
  AddWordToVocab((char *)"</s>");
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    if (!strcmp(word, "</s>")) {
      // Line break: the next word must not be paired with the previous line.
      start = 1;
      continue;
    }
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("Words processed: %lldK Vocab size: %lldK %c", train_words / 1000, vocab_size / 1000, 13);
      fflush(stdout);
    }
    // Count the single word, inserting it on first sight.
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    // The original reset `start` to 0 before this point, so the first word of
    // the file was paired with an uninitialized last_word and bigrams were
    // counted across line breaks. Only pair words that share a line.
    if (!start) {
      // Bounded formatting; keeps at most MAX_STRING - 1 characters.
      snprintf(bigram_word, MAX_STRING, "%s_%s", last_word, word);
      i = SearchVocab(bigram_word);
      if (i == -1) {
        a = AddWordToVocab(bigram_word);
        vocab[a].cn = 1;
      } else vocab[i].cn++;
    }
    strcpy(last_word, word);
    start = 0;
    // Keep the open-addressing table below 70% load.
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  fclose(fin);
}

最后就是一开始main函数中的TrainModel函数，在建立好字典后，会再过一遍语料，把符合条件的bigram变成phrase

// Builds the vocabulary from train_file, then reads the corpus a second time
// and writes it to output_file, joining a word to its predecessor with '_'
// when the pair scores above `threshold`, and with a space otherwise.
//
// score = (count(prev_cur) - min_count) / count(prev) / count(cur) * train_words
//
// A pair is never joined when either word or the bigram is missing from the
// vocabulary, or when either word's count is below min_count.
// Exits the program if the input or output file cannot be opened.
void TrainModel() {
  // pa / pb / pab: counts of the previous word, current word and their bigram.
  // li: vocabulary index of the previous word (-1 when unknown or none yet).
  long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
  char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
  real score;
  FILE *fo, *fin;
  printf("Starting training using file %s\n", train_file);
  LearnVocabFromTrainFile();
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fo = fopen(output_file, "wb");
  if (fo == NULL) {
    // Without this check the fprintf calls below would dereference NULL.
    printf("ERROR: cannot open output file %s\n", output_file);
    fclose(fin);
    exit(1);
  }
  word[0] = 0;
  while (1) {
    strcpy(last_word, word);
    ReadWord(word, fin);
    if (feof(fin)) break;
    if (!strcmp(word, "</s>")) {
      fprintf(fo, "\n");
      continue;
    }
    cn++;
    if ((debug_mode > 1) && (cn % 100000 == 0)) {
      printf("Words written: %lldK%c", cn / 1000, 13);
      fflush(stdout);
    }
    oov = 0;
    // Both the current and the previous word must be in the vocabulary.
    i = SearchVocab(word);
    if (i == -1) oov = 1; else pb = vocab[i].cn;
    if (li == -1) oov = 1;
    li = i;
    // The bigram itself must be known too; keeps at most MAX_STRING - 1 chars.
    snprintf(bigram_word, MAX_STRING, "%s_%s", last_word, word);
    i = SearchVocab(bigram_word);
    if (i == -1) oov = 1; else pab = vocab[i].cn;
    if (pa < min_count) oov = 1;
    if (pb < min_count) oov = 1;
    if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words;
    if (score > threshold) {
      fprintf(fo, "_%s", word);
      // Zeroing pb makes pa 0 for the next word, so a word that was just
      // merged cannot start another phrase in this pass.
      pb = 0;
    } else fprintf(fo, " %s", word);
    pa = pb;
  }
  fclose(fo);
  fclose(fin);
}

这个文件的代码就全都分析完毕了。最后看一下这个代码怎么调用。打开demo-phrases.sh，可以看到word2phrase被调用了两次，第一次word2phrase的输出是第二次word2phrase的输入。由于第二次运行时参与合并的两个token本身都可能已经是两个单词组成的phrase，所以我们最多能得到包含四个单词的phrase。第二次输出的文件news.2012.en.shuffled-norm0-phrase1经过tr命令转换为小写得到news.2012.en.shuffled-norm1-phrase1，作为word2vec的输入

# Build the binaries used below.
make
# Download and unpack the corpus only if it is not already present.
if [ ! -e news.2012.en.shuffled ]; then
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
gzip -d news.2012.en.shuffled.gz -f
fi
# Normalize: map typographic apostrophes to ', turn '' into a space, and
# replace every character outside letters, ', _, space and newline by a space.
sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
# First phrase pass (threshold 200): joins word pairs into phrases.
time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
# Second phrase pass (threshold 100) over the output of the first pass.
time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
# Lower-case the corpus.
tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
# Train word vectors on the phrase corpus.
time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
# Run the distance tool on the trained vectors.
./distance vectors-phrase.bin

阅读全文

1 0