Word2Vec源码详细解析（上）

来源：互联网发布：中超球员籍贯数据库编辑：程序博客网时间：2024/06/05 19:18
相关链接：
1、Word2Vec源码最详细解析（上）
2、Word2Vec源码最详细解析（下）
Word2Vec源码最详细解析（上）
在这一部分中，主要介绍Word2Vec源码中的主要数据结构、各个变量的含义与作用，以及算法之外的所有辅助函数，包括如何从训练文件中获取词汇、构建词表、hash表、Huffman树等，为算法实现提供数据准备。算法部分的代码实现将在《Word2Vec源码最详细解析（下）》一文中重点分析。
该部分代码分析如下：
#include <stdio.h>#include <stdlib.h>#include <string.h>#include <math.h>#include <pthread.h>#define MAX_STRING 100#define EXP_TABLE_SIZE 1000#define MAX_EXP 6#define MAX_SENTENCE_LENGTH 1000#define MAX_CODE_LENGTH 40const int vocab_hash_size = 30000000;  // Maximum 30 * 0.7 = 21M words in the vocabularytypedef float real;                    // Precision of float numbers//每个词的基本数据结构struct vocab_word {  long long cn;//词频，从训练集中计数得到或直接提供词频文件  int *point;//Haffman树中从根节点到该词的路径，存放的是路径上每个节点的索引  //word为该词的字面值  //code为该词的haffman编码  //codelen为该词haffman编码的长度  char *word, *code, codelen;};char train_file[MAX_STRING], output_file[MAX_STRING];char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];//词表，该数组的下标表示这个词在此表中的位置，也称之为这个词在词表中的索引struct vocab_word *vocab;int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;//词hash表，该数组的下标为每个词的hash值，由词的字面值ASCII码计算得到。vocab_hash[hash]中存储的是该词在词表中的索引int *vocab_hash;//vocab_max_size是一个辅助变量，每次当词表大小超出vocab_max_size时，一次性将词表大小增加1000//vocab_size为训练集中不同单词的个数，即词表的大小//layer1_size为词向量的长度long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;real alpha = 0.025, starting_alpha, sample = 1e-3;//syn0存储的是词表中每个词的词向量//syn1存储的是Haffman树中每个非叶节点的向量//syn1neg是负采样时每个词的辅助向量//expTable是提前计算好的Sigmond函数表real *syn0, *syn1, *syn1neg, *expTable;clock_t start;int hs = 0, negative = 5;const int table_size = 1e8;int *table;//计算每个函数的能量分布表，在负采样中用到void InitUnigramTable() {  int a, i;  long long train_words_pow = 0;  real d1, power = 0.75;  //为能量表table分配内存空间，共有table_size项，table_size为一个既定的数1e8  table = (int *)malloc(table_size * sizeof(int));  //遍历词表，根据词频计算能量总值  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);  i = 0;  //d1：表示已遍历词的能量值占总能量的比  d1 = pow(vocab[i].cn, power) / (real)train_words_pow;  //a：能量表table的索引  //i：词表的索引  for (a = 0; a < table_size; a++) {    //i号单词占据table中a位置<span 
style="white-space:pre"></span>table[a] = i;<span style="white-space:pre"></span>//能量表反映的是一个单词的能量分布，如果该单词的能量越大，所占table的位置就越多<span style="white-space:pre"></span>//如果当前单词的能量总和d1小于平均值，i递增，同时更新d1；反之如果能量高的话，保持i不变，以占据更多的位置    if (a / (real)table_size > d1) {      i++;      d1 += pow(vocab[i].cn, power) / (real)train_words_pow;    }<span style="white-space:pre"></span>//如果词表遍历完毕后能量表还没填满，将能量表中剩下的位置用词表中最后一个词填充    if (i >= vocab_size) i = vocab_size - 1;  }}//从文件中读入一个词到word，以space' '，tab'\t'，EOL'\n'为词的分界符//截去一个词中长度超过MAX_STRING的部分//每一行的末尾输出一个</s>void ReadWord(char *word, FILE *fin) {  int a = 0, ch;  while (!feof(fin)) {    ch = fgetc(fin);    if (ch == 13) continue;    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {      if (a > 0) {        if (ch == '\n') ungetc(ch, fin);        break;      }      if (ch == '\n') {        strcpy(word, (char *)"</s>");        return;      } else continue;    }    word[a] = ch;    a++;    if (a >= MAX_STRING - 1) a--;   // Truncate too long words  }  word[a] = 0;}//返回一个词的hash值，由词的字面值计算得到，可能存在不同词拥有相同hash值的冲突情况int GetWordHash(char *word) {  unsigned long long a, hash = 0;  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];  hash = hash % vocab_hash_size;  return hash;}//返回一个词在词表中的位置，若不存在则返回-1//先计算词的hash值，然后在词hash表中，以该值为下标，查看对应的值//如果为-1说明这个词不存在索引，即不存在在词表中，返回-1//如果该索引在词表中对应的词与正在查找的词不符，说明发生了hash值冲突，按照开放地址法去寻找这个词int SearchVocab(char *word) {  unsigned int hash = GetWordHash(word);  while (1) {    if (vocab_hash[hash] == -1) return -1;    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];    hash = (hash + 1) % vocab_hash_size;  }  return -1;}//从文件中读入一个词，并返回这个词在词表中的位置，相当于将之前的两个函数包装了起来int ReadWordIndex(FILE *fin) {  char word[MAX_STRING];  ReadWord(word, fin);  if (feof(fin)) return -1;  return SearchVocab(word);}//为一个词构建一个vocab_word结构对象，并添加到词表中//词频初始化为0，hash值用之前的函数计算，//返回该词在词表中的位置int AddWordToVocab(char *word) {  unsigned int hash, length = strlen(word) + 1;  if (length > MAX_STRING) length = MAX_STRING;  
vocab[vocab_size].word = (char *)calloc(length, sizeof(char));  strcpy(vocab[vocab_size].word, word);  vocab[vocab_size].cn = 0;  vocab_size++;  //每当词表数目即将超过最大值时，一次性为其申请添加一千个词结构体的内存空间  if (vocab_size + 2 >= vocab_max_size) {    vocab_max_size += 1000;    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));  }  hash = GetWordHash(word);  //如果该hash值与其他词产生冲突，则使用开放地址法解决冲突（为这个词寻找一个hash值空位）  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;  //将该词在词表中的位置赋给这个找到的hash值空位  vocab_hash[hash] = vocab_size - 1;  return vocab_size - 1;}//按照词频从大到小排序int VocabCompare(const void *a, const void *b) {    return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;}//统计词频，按照词频对词表中的项从大到小排序void SortVocab() {  int a, size;  unsigned int hash;  //对词表进行排序，将</s>放在第一个位置  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);  //充值hash表  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;  size = vocab_size;  train_words = 0;  for (a = 0; a < size; a++) {    //将出现次数小于min_count的词从词表中去除，出现次数大于min_count的重新计算hash值，更新hash词表    if ((vocab[a].cn < min_count) && (a != 0)) {      vocab_size--;      free(vocab[a].word);    } else {//hash值计算      hash=GetWordHash(vocab[a].word);//hash值冲突解决      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;      vocab_hash[hash] = a;//计算总词数      train_words += vocab[a].cn;    }  }  //由于删除了词频较低的词，这里调整词表的内存空间  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));  // 为Haffman树的构建预先申请空间  for (a = 0; a < vocab_size; a++) {    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));  }}//从词表中删除出现次数小于min_reduce的词，没执行一次该函数min_reduce自动加一void ReduceVocab() {  int a, b = 0;  unsigned int hash;  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {    vocab[b].cn = vocab[a].cn;    vocab[b].word = vocab[a].word;    b++;  } else free(vocab[a].word);  vocab_size 
= b;  //重置hash表  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;  //更新hash表  for (a = 0; a < vocab_size; a++) {    //hash值计算    hash = GetWordHash(vocab[a].word);//hash值冲突解决    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;    vocab_hash[hash] = a;  }  fflush(stdout);  min_reduce++;}//利用统计到的词频构建Haffman二叉树//根据Haffman树的特性，出现频率越高的词其二叉树上的路径越短，即二进制编码越短void CreateBinaryTree() {  long long a, b, i, min1i, min2i, pos1, pos2;  //用来暂存一个词到根节点的Haffman树路径  long long point[MAX_CODE_LENGTH];  //用来暂存一个词的Haffman编码  char code[MAX_CODE_LENGTH];    //内存分配，Haffman二叉树中，若有n个叶子节点，则一共会有2n-1个节点   //count数组前vocab_size个元素为Haffman树的叶子节点，初始化为词表中所有词的词频  //count数组后vocab_size个元素为Haffman书中即将生成的非叶子节点（合并节点）的词频，初始化为一个大值1e15  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));  //binary数组记录各节点相对于其父节点的二进制编码（0/1）  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));  //paarent数组记录每个节点的父节点  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));  //count数组的初始化  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;    //以下部分为创建Haffman树的算法，默认词表已经按词频由高到低排序  //pos1，pos2为别为词表中词频次低和最低的两个词的下标（初始时就是词表最末尾两个）  //</s>词也包含在树内  pos1 = vocab_size - 1;  pos2 = vocab_size;  //最多进行vocab_size-1次循环操作，每次添加一个节点，即可构成完整的树  for (a = 0; a < vocab_size - 1; a++) {    //比较当前的pos1和pos2，在min1i、min2i中记录当前词频最小和次小节点的索引//min1i和min2i可能是叶子节点也可能是合并后的中间节点    if (pos1 >= 0) {  //如果count[pos1]比较小，则pos1左移，反之pos2右移      if (count[pos1] < count[pos2]) {        min1i = pos1;        pos1--;      } else {        min1i = pos2;        pos2++;      }    } else {      min1i = pos2;      pos2++;    }    if (pos1 >= 0) {  //如果count[pos1]比较小，则pos1左移，反之pos2右移      if (count[pos1] < count[pos2]) {        min2i = pos1;        pos1--;      } else {        min2i = pos2;        pos2++;      }    } else {      min2i = pos2;      pos2++;    
}//在count数组的后半段存储合并节点的词频（即最小count[min1i]和次小count[min2i]词频之和）    count[vocab_size + a] = count[min1i] + count[min2i];//记录min1i和min2i节点的父节点    parent_node[min1i] = vocab_size + a;    parent_node[min2i] = vocab_size + a;    //这里令每个节点的左右子节点中，词频较低的为1（则词频较高的为0）binary[min2i] = 1;  }    //根据得到的Haffman二叉树为每个词（树中的叶子节点）分配Haffman编码  //由于要为所有词分配编码，因此循环vocab_size次  for (a = 0; a < vocab_size; a++) {    b = a;    i = 0;    while (1) {  //不断向上寻找叶子结点的父节点，将binary数组中存储的路径的二进制编码增加到code数组末尾      code[i] = binary[b];  //在point数组中增加路径节点的编号      point[i] = b;  //Haffman编码的当前长度，从叶子结点到当前节点的深度      i++;      b = parent_node[b];  //由于Haffman树一共有vocab_size*2-1个节点，所以vocab_size*2-2为根节点      if (b == vocab_size * 2 - 2) break;    }//在词表中更新该词的信息//Haffman编码的长度，即叶子结点到根节点的深度    vocab[a].codelen = i;//Haffman路径中存储的中间节点编号要在现在得到的基础上减去vocab_size，即不算叶子结点，单纯在中间节点中的编号//所以现在根节点的编号为(vocab_size*2-2) - vocab_size = vocab_size - 2    vocab[a].point[0] = vocab_size - 2;//Haffman编码和路径都应该是从根节点到叶子结点的，因此需要对之前得到的code和point进行反向。    for (b = 0; b < i; b++) {      vocab[a].code[i - b - 1] = code[b];      vocab[a].point[i - b] = point[b] - vocab_size;    }  }  free(count);  free(binary);  free(parent_node);}//从训练文件中获取所有词汇并构建词表和hash比void LearnVocabFromTrainFile() {  char word[MAX_STRING];  FILE *fin;  long long a, i;    //初始化hash词表  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;    //打开训练文件  fin = fopen(train_file, "rb");  if (fin == NULL) {    printf("ERROR: training data file not found!\n");    exit(1);  }    //初始化词表大小  vocab_size = 0;  //将</s>添加到词表的最前端  AddWordToVocab((char *)"</s>");   //开始处理训练文件  while (1) {//从文件中读入一个词    ReadWord(word, fin);    if (feof(fin)) break;//对总词数加一，并输出当前训练信息    train_words++;    if ((debug_mode > 1) && (train_words % 100000 == 0)) {      printf("%lldK%c", train_words / 1000, 13);      fflush(stdout);    }//搜索这个词在词表中的位置    i = SearchVocab(word);    //如果词表中不存在这个词，则将该词添加到词表中，创建其在hash表中的值，初始化词频为1；反之，词频加一if (i == -1) {      a = AddWordToVocab(word);      vocab[a].cn = 1;    } else 
vocab[i].cn++;//如果词表大小超过上限，则做一次词表删减操作,将当前词频最低的词删除    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();  }  //对词表进行排序，剔除词频低于阈值min_count的值，输出当前词表大小和总词数  SortVocab();  if (debug_mode > 0) {    printf("Vocab size: %lld\n", vocab_size);    printf("Words in train file: %lld\n", train_words);  }  //获取训练文件的大小，关闭文件句柄  file_size = ftell(fin);  fclose(fin);}//将单词和对应的词频输出到文件中void SaveVocab() {  long long i;  FILE *fo = fopen(save_vocab_file, "wb");  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);  fclose(fo);}//从词汇表文件中读词并构建词表和hash表//由于词汇表中的词语不存在重复，因此与LearnVocabFromTrainFile相比没有做重复词汇的检测void ReadVocab() {  long long a, i = 0;  char c;  char word[MAX_STRING];  //打开词汇表文件  FILE *fin = fopen(read_vocab_file, "rb");  if (fin == NULL) {    printf("Vocabulary file not found\n");    exit(1);  }  //初始化hash词表  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;  vocab_size = 0;    //开始处理词汇表文件  while (1) {//从文件中读入一个词    ReadWord(word, fin);    if (feof(fin)) break;//将该词添加到词表中，创建其在hash表中的值，并通过输入的词汇表文件中的值来更新这个词的词频    a = AddWordToVocab(word);    fscanf(fin, "%lld%c", &vocab[a].cn, &c);    i++;  }  //对词表进行排序，剔除词频低于阈值min_count的值，输出当前词表大小和总词数  SortVocab();  if (debug_mode > 0) {    printf("Vocab size: %lld\n", vocab_size);    printf("Words in train file: %lld\n", train_words);  }  //打开训练文件，将文件指针移至文件末尾，获取训练文件的大小  fin = fopen(train_file, "rb");  if (fin == NULL) {    printf("ERROR: training data file not found!\n");    exit(1);  }  fseek(fin, 0, SEEK_END);  file_size = ftell(fin);  //关闭文件句柄  fclose(fin);}
1 0