Word Vector Source Code Analysis (4.4): word2vecf in hyperwords


The word2vec code only handles the case where the context is a neighboring word. A large body of papers has since proposed different ways of enriching the context in order to obtain higher-quality, or qualitatively different, word vectors. word2vecf is one such extension of word2vec: where word2vec takes a raw corpus as input, word2vecf takes word-context pairs, so it can support arbitrary kinds of center words and contexts. For example, to bring in dependency information we only need to supply the corresponding word-context pairs; word2vecf itself requires no modification at all. Since word2vecf differs very little from word2vec, we will just walk through the code briefly here.
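
For concreteness, the training file is just one pair per line, a word and its context separated by whitespace. With dependency contexts the context token usually encodes the relation; the lines below are purely illustrative, not taken from a real corpus:

discovers scientist/nsubj
discovers star/dobj
scientist australian/amod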

First, look at install_word2vecf.sh in the scripts folder. It downloads word2vecf and compiles the C files; all of the source code and the compiled executables end up in the word2vecf folder.

#!/bin/sh
mkdir word2vecf
wget https://bitbucket.org/yoavgo/word2vecf/get/1b94252a58d4.zip
unzip 1b94252a58d4.zip
rm 1b94252a58d4.zip
mv yoavgo-word2vecf-1b94252a58d4/*.c word2vecf/.
mv yoavgo-word2vecf-1b94252a58d4/*.h word2vecf/.
mv yoavgo-word2vecf-1b94252a58d4/makefile word2vecf/.
rm -r yoavgo-word2vecf-1b94252a58d4
make -C word2vecf

Next, the makefile. In the end we only need the word2vecf executable, and we can see that it is built from three files: word2vecf.c, vocab.c, and io.c. These are the three files we will go through.

CC = gcc
#The -Ofast might not work with older versions of gcc; in that case, use -O2
#CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result
CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result

all: word2vec word2phrase distance word-analogy compute-accuracy word2vecf count_and_filter

count_and_filter: count_and_filter.c vocab.c io.c
	$(CC) vocab.c count_and_filter.c io.c -o count_and_filter $(CFLAGS)
word2vec : word2vec.c
	$(CC) word2vec.c -o word2vec $(CFLAGS)
word2vecf : word2vecf.c vocab.c io.c
	$(CC) word2vecf.c vocab.c io.c -o word2vecf $(CFLAGS)
word2phrase : word2phrase.c
	$(CC) word2phrase.c -o word2phrase $(CFLAGS)
distance : distance.c
	$(CC) distance.c -o distance $(CFLAGS)
word-analogy : word-analogy.c
	$(CC) word-analogy.c -o word-analogy $(CFLAGS)
compute-accuracy : compute-accuracy.c
	$(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS)
	chmod +x *.sh

clean:
	rm -rf word2vec word2phrase distance word-analogy compute-accuracy count_and_filter word2vecf

Let's start with io.c. It simply reads one word at a time from a file; word2vec and GloVe use essentially the same code.

void ReadWord(char *word, FILE *fin, int MAX_STRING) {
  int a = 0, ch;
  while (!feof(fin)) {
    ch = fgetc(fin);
    if (ch == 13) continue;
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) break;
      else continue; 
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
  }
  word[a] = 0;
}
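
As a quick sanity check of how ReadWord behaves, a minimal driver like the following (not part of the repository; the MAX_STRING value here is an assumption for the sketch) would print a file one token per line when compiled and linked together with io.c:

#include <stdio.h>

#define MAX_STRING 100                      /* assumed buffer size for this sketch */

void ReadWord(char *word, FILE *fin, int max_len);   /* defined in io.c above */

int main(int argc, char **argv) {
  char word[MAX_STRING];
  FILE *fin;
  if (argc < 2 || (fin = fopen(argv[1], "rb")) == NULL) return 1;
  while (!feof(fin)) {
    ReadWord(word, fin, MAX_STRING);        /* read one whitespace-delimited token */
    if (word[0] == 0) continue;             /* an empty read can happen right at EOF */
    printf("%s\n", word);
  }
  fclose(fin);
  return 0;
}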

Next comes the vocabulary code; the header file first.

struct vocab_word { // we only use the word count cn and the word string itself
  long long cn;
  int *point;
  char *word, *code, codelen;
};

struct vocabulary { // everything about the vocabulary is wrapped in this struct; the mechanics are exactly the same as in word2vec
   struct vocab_word *vocab;   // array of words
   int *vocab_hash;            // hash table for fast word lookup
   long long vocab_max_size;   // capacity of the vocab array (initially 1000); when it fills up, more memory is allocated in chunks rather than one word at a time, for efficiency
   long vocab_size;            // number of words currently in the vocabulary
   long long word_count;       // corpus size, i.e. the sum of all word counts
};

Below are the declarations of the vocabulary-building functions. Building a dictionary in C is clearly much more involved than it would be in Python, but word2vecf organizes this code more cleanly than word2vec does.

int ReadWordIndex(struct vocabulary *v, FILE *fin);
inline int GetWordHash(struct vocabulary *v, char *word);
int SearchVocab(struct vocabulary *v, char *word);
int AddWordToVocab(struct vocabulary *v, char *word);
void SortAndReduceVocab(struct vocabulary *v, int min_count);
struct vocabulary *CreateVocabulary();
void SaveVocab(struct vocabulary *v, char *vocab_file);
struct vocabulary *ReadVocab(char *vocab_file);
void EnsureVocabSize(struct vocabulary *v);

Now let's look at the implementations of these functions in vocab.c. GetWordHash returns the hash value of a word, using the simplest possible hash function.

// Returns hash value of a word
inline int GetWordHash(struct vocabulary *v, char *word) {
  unsigned long long hash = 0;
  char *b = word;
  //for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
  //hash = FastHash(word, strlen(word)) % vocab_hash_size;
  while (*b != 0) hash = hash * 257 + *(b++);
  hash = hash % vocab_hash_size;
  return hash;
}
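
For example, for the word "ab" the loop computes hash = ((0 * 257 + 'a') * 257 + 'b') mod vocab_hash_size = (97 * 257 + 98) mod vocab_hash_size = 25027 mod vocab_hash_size; different words that land on the same slot are disambiguated later by linear probing in AddWordToVocab and SearchVocab.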

CreateVocabulary allocates and initializes an empty vocabulary.

struct vocabulary *CreateVocabulary() {
   struct vocabulary *v = malloc(sizeof(struct vocabulary));
   long long a;
   v->vocab_max_size = 1000;   // initial capacity of the vocab array
   v->vocab_size = 0;          // no words yet

   v->vocab = (struct vocab_word *)calloc(v->vocab_max_size, sizeof(struct vocab_word));   // allocate the vocab array

   v->vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));   // allocate the hash table, with every slot empty
   for (a = 0; a < vocab_hash_size; a++) v->vocab_hash[a] = -1;
   return v;
}

AddWordToVocab inserts a word into the vocabulary.

// Adds a word to the vocabulary
int AddWordToVocab(struct vocabulary *v, char *word) {
  //static long collide = 0;
  //static long nocollide = 0;
  unsigned int hash, length = strlen(word) + 1;
  if (length > MAX_STRING) length = MAX_STRING;
  v->vocab[v->vocab_size].word = (char *)calloc(length, sizeof(char));   // store the word string
  strcpy(v->vocab[v->vocab_size].word, word);
  v->vocab[v->vocab_size].cn = 0;   // initialize its count
  v->vocab_size++;                  // one more word in the vocabulary
  // Reallocate memory if needed
  if (v->vocab_size + 2 >= v->vocab_max_size) {   // the array is about to run out of room: grow it
    v->vocab_max_size += 1000;
    v->vocab = (struct vocab_word *)realloc(v->vocab, v->vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(v, word);   // finally, update the hash table
  //if (v->vocab_hash[hash] != -1) { collide += 1; } else { nocollide += 1; }
  //if ((collide + nocollide) % 100000 == 0) printf("%d %d %f collisions\n\n",collide, nocollide, (float)collide/(collide+nocollide));
  while (v->vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;   // linear probing on collision
  v->vocab_hash[hash] = v->vocab_size - 1;
  return v->vocab_size - 1;   // return the word's index in vocab
}

SearchVocab looks up a word's position in vocab, that is, its ID.

int SearchVocab(struct vocabulary *v, char *word) {
  unsigned int hash = GetWordHash(v, word);   // start from the word's hash slot
  while (1) {
    if ((v->vocab_hash)[hash] == -1) return -1;
    if (!strcmp(word, v->vocab[v->vocab_hash[hash]].word)) return v->vocab_hash[hash];
    hash = (hash + 1) % vocab_hash_size;
  }
  return -1;
}
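
Together, SearchVocab and AddWordToVocab give the usual counting pattern used when a vocabulary is built from raw text (count_and_filter presumably does essentially this when scanning the pairs file). A minimal sketch, where GetOrAddWord is a hypothetical helper rather than a function from the source:

/* Hypothetical helper, not in vocab.c: look a word up, insert it if missing,
   and count one more occurrence of it. */
int GetOrAddWord(struct vocabulary *v, char *word) {
  int i = SearchVocab(v, word);               /* hash lookup with linear probing */
  if (i == -1) i = AddWordToVocab(v, word);   /* first time we see this word */
  v->vocab[i].cn++;                           /* bump its frequency */
  v->word_count++;                            /* the corpus size grows too */
  return i;
}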

ReduceVocab shrinks the vocabulary by removing infrequent words, with the same mechanism as word2vec: the threshold min_reduce starts at 1 and is raised by one on every call.

// Reduces the vocabulary by removing infrequent tokens
void ReduceVocab(struct vocabulary *v) {
   static int min_reduce = 1;   // frequency threshold for dropping words
   printf("reducevocab\n");
  int a, b = 0;
  unsigned int hash;
  for (a = 0; a < v->vocab_size; a++) if (v->vocab[a].cn > min_reduce) {   // walk the vocabulary, keeping only the sufficiently frequent words
    v->vocab[b].cn = v->vocab[a].cn;
    v->vocab[b].word = v->vocab[a].word;
    b++;
  } else free(v->vocab[a].word);
  v->vocab_size = b;
  for (a = 0; a < vocab_hash_size; a++) v->vocab_hash[a] = -1;   // rebuild the hash table
  for (a = 0; a < v->vocab_size; a++) {
    // Hash will be re-computed, as it is not actual
    hash = GetWordHash(v, v->vocab[a].word);
    while (v->vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    v->vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;
}

Finally, here is the function that both sorts the vocabulary and filters out low-frequency words.

void SortAndReduceVocab(struct vocabulary *v, int min_count) {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary by frequency and keep </s> at the first position
  qsort(&(v->vocab[1]), v->vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  for (a = 0; a < vocab_hash_size; a++) v->vocab_hash[a] = -1;
  size = v->vocab_size;
  v->word_count = 0;
  for (a = 0; a < size; a++) {   // walk the vocabulary: drop low-frequency words and re-hash the rest
    // Words occurring less than min_count times will be discarded from the vocab
    if (v->vocab[a].cn < min_count) {
      v->vocab_size--;
      free(v->vocab[v->vocab_size].word);
    } else {
      // Hash will be re-computed, as after the sorting it is not actual
      hash=GetWordHash(v, v->vocab[a].word);
      while (v->vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      v->vocab_hash[hash] = a;
      v->word_count += v->vocab[a].cn;
    }
  }
  v->vocab = (struct vocab_word *)realloc(v->vocab, (v->vocab_size + 1) * sizeof(struct vocab_word));
}
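
One thing not shown in this post is the comparator VocabCompare passed to qsort. In word2vec, from which this vocabulary code is adapted, it simply orders entries by descending frequency, so the most frequent words end up at the front of the array:

// Used by qsort above: entries with higher counts sort earlier
static int VocabCompare(const void *a, const void *b) {
  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}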

word2vecf builds its vocabularies by reading vocabulary files, rather than scanning the corpus the way word2vec does.
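
The vocabulary file is plain text, one word and its frequency per line, something like the following (the counts here are made up):

the 1061396
of 593677
and 416629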

struct vocabulary *ReadVocab(char *vocabfile) {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(vocabfile, "rb");   // open the vocabulary file: one word per line, followed by its count
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  struct vocabulary *v = CreateVocabulary();   // start from an empty vocabulary
  while (1) {   // read every word in the file; ReduceVocab is never needed here
    ReadWord(word, fin, MAX_STRING);   // read one word
    if (feof(fin)) break;
    a = AddWordToVocab(v, word);
    fscanf(fin, "%lld%c", &v->vocab[a].cn, &c);   // read the word's count; c swallows the trailing newline
    i++;
  }
  SortAndReduceVocab(v, 0);   // finally, sort the words by frequency (min_count 0 keeps all of them)
  printf("Vocab size: %d\n", v->vocab_size);
  printf("Word count: %lld\n", v->word_count);
  return v;
}

Finally, word2vecf.c itself. Below is the body of TrainModel; the flow is the same as in word2vec: build the vocabularies, then start the training threads. The difference is that there are two vocabularies here, one for the center words and one for the contexts.
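
The two vocabulary files are built ahead of time from the same pairs file (the count_and_filter tool from the makefile above exists for this purpose), and a typical run looks roughly like ./word2vecf -train pairs -wvocab wv -cvocab cv -output vecs -size 200 -negative 15 -threads 10; these flag names are quoted from memory, so check the word2vecf README for the exact options.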

  long a, b, c, d;
  FILE *fo;
  FILE *fo2;
  file_size = GetFileSize(train_file);
  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
  printf("Starting training using file %s\n", train_file);
  starting_alpha = alpha;
  wv = ReadVocab(wvocab_file);   // vocabulary of center words
  cv = ReadVocab(cvocab_file);   // vocabulary of contexts
  InitNet(wv, cv);
  InitUnigramTable(cv);
  start = clock();
  for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);

Last, the training loop itself. The biggest difference from word2vec is that the input is read as pairs.

void *TrainModelThread(void *id) {
   int ctxi = -1, wrdi = -1;
  long long d;
  long long word_count = 0, last_word_count = 0;
  long long l1, l2, c, target, label;
  unsigned long long next_random = (unsigned long long)id;
  real f, g;
  clock_t now;
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));
  FILE *fi = fopen(train_file, "rb");   // the training file is the list of pairs
  long long start_offset = file_size / (long long)num_threads * (long long)id;
  long long end_offset = file_size / (long long)num_threads * (long long)(id+1);
  int iter;
  //printf("thread %d %lld %lld \n",id, start_offset, end_offset);
  for (iter=0; iter < numiters; ++iter) {
     fseek(fi, start_offset, SEEK_SET);
     // if not binary:
     while (fgetc(fi) != '\n') { }; //TODO make sure its ok   // the byte offset rarely falls on a line boundary, so skip ahead to the next newline and start from a full line
     printf("thread %d %lld\n", id, ftell(fi));


     long long train_words = wv->word_count;
     while (1) { //HERE @@@
        // TODO set alpha scheduling based on number of examples read.
        // The conceptual change is the move from word_count to pair_count
        if (word_count - last_word_count > 10000) {   // periodically report progress and decay the learning rate
           word_count_actual += word_count - last_word_count;
           last_word_count = word_count;
           if ((debug_mode > 1)) {
              now=clock();
              printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
                    word_count_actual / (real)(numiters*train_words + 1) * 100,
                    word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
              fflush(stdout);
           }
           alpha = starting_alpha * (1 - word_count_actual / (real)(numiters*train_words + 1));
           if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
        }
        if (feof(fi) || ftell(fi) > end_offset) break;
        for (c = 0; c < layer1_size; c++) neu1[c] = 0;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        wrdi = ReadWordIndex(wv, fi);   // read the first token of the pair (the word) and look up its id

        ctxi = ReadWordIndex(cv, fi);   // read the second token of the pair (the context) and look up its id
        word_count++; //TODO ?
        if (wrdi < 0 || ctxi < 0) continue;


        if (sample > 0) {   // decide whether to subsample (skip) this pair
           real ran = (sqrt(wv->vocab[wrdi].cn / (sample * wv->word_count)) + 1) * (sample * wv->word_count) / wv->vocab[wrdi].cn;
           next_random = next_random * (unsigned long long)25214903917 + 11;
           if (ran < (next_random & 0xFFFF) / (real)65536) continue;
           ran = (sqrt(cv->vocab[ctxi].cn / (sample * cv->word_count)) + 1) * (sample * cv->word_count) / cv->vocab[ctxi].cn;
           next_random = next_random * (unsigned long long)25214903917 + 11;
           if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        //fread(&wrdi, 4, 1, fi);
        //fread(&ctxi, 4, 1, fi);
        // NEGATIVE SAMPLING: exactly the same as in word2vec
        l1 = wrdi * layer1_size;
        for (d = 0; d < negative + 1; d++) {
           if (d == 0) {
              target = ctxi;
              label = 1;
           } else {
              next_random = next_random * (unsigned long long)25214903917 + 11;
              target = unitable[(next_random >> 16) % table_size];
              if (target == 0) target = next_random % (cv->vocab_size - 1) + 1;
              if (target == ctxi) continue;
              label = 0;
           }
           l2 = target * layer1_size;
           f = 0;
           for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
           if (f > MAX_EXP) g = (label - 1) * alpha;
           else if (f < -MAX_EXP) g = (label - 0) * alpha;
           else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
           for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
           for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
     }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
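
To spell out the update inside the negative-sampling loop: f is the dot product between the word vector syn0[wrdi] and the context (or sampled negative) vector syn1neg[target], and g = (label - sigmoid(f)) * alpha is the scaled gradient of the logistic loss, with the expTable lookup being a precomputed sigmoid clipped to [-MAX_EXP, MAX_EXP]. Each context vector is moved by g times the word vector, the correction neu1e accumulates g times each context vector, and once all negative + 1 targets have been processed the word vector itself is updated by neu1e.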
