srilm 阅读文档10

来源：互联网发布：白夜追凶淘宝视频编辑：程序博客网时间：2024/05/18 00:38

Vocab.h Vocab.cc
文档作者：rickjin
创立时间：08.09.10

--------------
1、基本类
--------------
    Vocab.h Vocab.cc 这两个文件主要提供了一个Vocab 类用于存储语料中出现的
    单词，实现一个单词(VocabString)与其对应的索引(VocabIndex)之间的快捷
    映射。在语言模型(LM)中需要表示一个单词的时候通常都用相应的索引(VocabIndex),
    而把单词串存放在 Vocab 类中, 这样能够节省内存空间。

VocabIter 是 Vocab 类的迭代器，用于迭代访问 Vocab 中的每个单词。

----------------
2、类接口说明
----------------

2.1) Vocab 类主要接口

const VocabIndex Vocab_None = (VocabIndex)-1; /* largest possible word index value; used to denote the NULL word */

const VocabString   Vocab_Unknown   = "<unk>";    /* unknown (out-of-vocabulary) word */
const VocabString   Vocab_SentStart = "<s>";      /* sentence start */
const VocabString   Vocab_SentEnd   = "</s>";     /* sentence end */
const VocabString   Vocab_Pause     = "-pau-";    /* pause (?) */

/* Comparison function used to sort individual word indices */
typedef int (*VocabIndexComparator)(VocabIndex, VocabIndex);

/* Comparison function used to sort sequences of word indices */
typedef int (*VocabIndicesComparator)(const VocabIndex *, const VocabIndex *);

/*
 * Vocab stores the words seen in a corpus and maps each word string
 * (VocabString) to an index (VocabIndex) and back.
 */
class Vocab
{
    friend class VocabIter;

public:

    /* -----------------------
     * --- Basic interface ---
     * -----------------------
     */

    /* Constructor.
     * NOTE(review): start/end presumably bound the range of indices this
     * Vocab hands out -- confirm against Vocab.cc. */
    Vocab(VocabIndex start = 0, VocabIndex end = (Vocab_None-1));

    /* Add a word and return its index */
    virtual VocabIndex addWord(VocabString name);

    /* Make the string name an alias of the already existing word `word` */
    virtual VocabIndex addWordAlias(VocabIndex word, VocabString name);

    /* Return the word string stored under index */
    virtual VocabString getWord(VocabIndex index);

    /* Return the index of a word; if the word is not present, return
     * unkIndex (the index to use for unknown words) */
    virtual VocabIndex getIndex(VocabString name, VocabIndex unkIndex = Vocab_None);

    /* Remove a word, identified either by its string or by its index */
    virtual void remove(VocabString name);
    virtual void remove(VocabIndex index);

    /* Number of words */
    virtual unsigned int numWords() const;

    /* Largest index of any word stored in the Vocab */
    virtual VocabIndex highIndex() const;

    /* Indices of the special words */
    virtual VocabIndex &unkIndex();      /* unknown word     "<unk>"   */
    virtual VocabIndex &ssIndex();       /* sentence start   "<s>"     */
    virtual VocabIndex &seIndex();       /* sentence end     "</s>"    */
    virtual VocabIndex &pauseIndex();    /* pause word       "-pau-"   */

    /* Whether the unknown word "<unk>" is treated as a regular word */
    virtual Boolean &unkIsWord() { return _unkIsWord; };

    /* Whether all words are mapped to lowercase */
    virtual Boolean &toLower() { return _toLower; };

    /* --------------------------
     * --- NonEvent handling ----
     * --------------------------
     *
     * Some strings may only occur as context and are never assigned
     * probability themselves.  They are treated as "pseudo words" and kept
     * in a dedicated hash table, nonEventMap.  When events are counted over
     * a corpus these words are handled as non-events.  The functions below
     * manage these non-event words.
     */

    /* Test whether a word is a non-event */
    virtual Boolean isNonEvent(VocabString word);
    virtual Boolean isNonEvent(VocabIndex word) const;

    /* Add a word to the non-event hash table */
    virtual VocabIndex addNonEvent(VocabIndex word);
    virtual VocabIndex addNonEvent(VocabString name);

    /* Add every word of the vocabulary nonevents to this Vocab as a
     * non-event word */
    virtual Boolean addNonEvents(Vocab &nonevents);

    /* Remove the given word from the non-event hash table */
    virtual Boolean removeNonEvent(VocabIndex word);

    /* --------------------------
     * --- Meta-tag handling ----
     * --------------------------
     *
     * Meta-tags are string labels for meta-counts (i.e. counts-of-counts).
     * They are used to count word types, whereas ordinary word strings
     * stand for word tokens.  Every meta-tag starts with the prefix
     * _metaTag (default "__META__"), and all meta-tags are recorded in the
     * hash table metaTagMap.
     *
     * __META__        total number of word types
     * __META__1       number of word types with frequency 1
     * __META__2       number of word types with frequency 2
     * ...             ...
     * __META__N       number of word types with frequency N
     */

    /* Prefix string of all meta-tags */
    virtual VocabString &metaTag() { return _metaTag; }; /* meta-count tag */

    /* Test whether a word is a meta-tag */
    Boolean isMetaTag(VocabIndex word);

    /* Return the frequency a meta-tag stands for: if word corresponds to
     * __META__k, the integer k is returned */
    unsigned typeOfMetaTag(VocabIndex word);

    /* Given an integer k, return the index (VocabIndex) of the
     * corresponding tag __META__k */
    VocabIndex metaTagOfType(unsigned k);

    /* ----------------------
     * --- Word sequences ---
     * ----------------------
     *
     * The functions below operate on sequences of words.
     */

    /* Convert the index sequence wids into the corresponding word strings,
     * stored in the array words.
     *
     * @param wids  - index sequence, terminated by Vocab_None
     * @param words - array receiving the word strings
     * @param max   - maximum length of the array words
     * @return number of words converted
     */
    virtual unsigned int getWords(const VocabIndex *wids, VocabString *words, unsigned int max);

    /* Add the words in `words` to the Vocab and store their indices in
     * wids; max is the size of the array wids */
    virtual unsigned int addWords(const VocabString *words, VocabIndex *wids, unsigned int max);

    /* Like addWords(), converts a word sequence into an index sequence, but
     * does not add words to the table; words that are not present are
     * mapped to the unknown-word index unkIndex */
    virtual unsigned int getIndices(const VocabString *words, VocabIndex *wids, unsigned int max, VocabIndex unkIndex);

    /* Convert a word sequence into an index sequence and check that every
     * word is already in the table; returns the result of that check */
    virtual Boolean checkWords(const VocabString *words, VocabIndex *wids, unsigned int max);

    /* Split a line into words at whitespace; results go into the array
     * words, whose size is max */
    static unsigned int parseWords(char *line, VocabString *words, unsigned int max);

    /* Length of an index sequence; Vocab_None marks the end */
    static unsigned int length(const VocabIndex *words);

    /* Length of a word-string sequence; NULL marks the end */
    static unsigned int length(const VocabString *words);

    /* Copy an index sequence; Vocab_None marks the end */
    static VocabIndex *copy(VocabIndex *to, const VocabIndex *from);

    /* Copy a word-string sequence; NULL marks the end */
    static VocabString *copy(VocabString *to, const VocabString *from);

    /* Reverse an index sequence */
    static VocabIndex *reverse(VocabIndex *words);

    /* Reverse a word-string sequence */
    static VocabString *reverse(VocabString *words);

    /* Test whether an index sequence contains the given word */
    static Boolean contains(const VocabIndex *words, VocabIndex word);

    /* Write the words to a file, one word per line */
    static void write(File &file, const VocabString *words);

    /* Compare two words (by default as strings) */
    static int compare(VocabIndex word1, VocabIndex word2);
    static int compare(VocabString word1, VocabString word2);

    /* Compare two word sequences (by default word by word, as strings) */
    static int compare(const VocabIndex *word1, const VocabIndex *word2);
    static int compare(const VocabString *word1, const VocabString *word2);

    /* Comparison function for sorting individual word indices */
    VocabIndexComparator compareIndex() const;

    /* Comparison function for sorting word index sequences */
    VocabIndicesComparator compareIndices() const;

    /* NOTE(review): the purpose of this function is not documented; the
     * author of these notes could not determine what it does. */
    Boolean ngramsInRange(VocabString *startRange, VocabString *endRange);

    /* ---------------------
     * --- Miscellaneous ---
     * ---------------------
     */

    /* Read words from a file and add them to the vocabulary, one word per
     * line */
    virtual unsigned int read(File &file);

    /* Read word alias mappings from a file; each line has the form
     * <alias, word> */
    virtual unsigned int readAliases(File &file);

    /* Write the words to a file, one word per line, sorted by default */
    virtual void write(File &file, Boolean sorted = true) const;

    /* Make this object the output vocabulary (outputVocab) */
    virtual void use() const { outputVocab = (Vocab *)this; }; // discard const

    /* Read (index, word) pairs from a file into this Vocab, recording the
     * required index mapping in map */
    virtual Boolean readIndexMap(File &file, Array<VocabIndex> &map, Boolean limitVocab = false);

    /* Write the words of this Vocab with their indices to a file in
     * (index, word) format */
    virtual void writeIndexMap(File &file);

    /* Vocab used by the overloaded operator<< to print a word string for a
     * word index */
    static Vocab *outputVocab; /* implicit parameter to operator<< */

    /* Vocab used by compare() to compare two words given by index */
    static Vocab *compareVocab; /* implicit parameter to compare() */

protected:

    LHash<VocabString,VocabIndex> byName;    /* word -> index hash table, for fast lookup by string */
    Array<VocabString> byIndex;              /* index -> word table, for array-style access by index */
    VocabIndex nextIndex;                    /* index to assign to the next newly inserted word */
    VocabIndex maxIndex;                     /* largest word index of this Vocab */

    LHash<VocabIndex, unsigned> nonEventMap; /* table of non-event words */
    LHash<VocabIndex, unsigned> metaTagMap;  /* table of meta-tags: maps a meta-tag to its type */

    VocabIndex _unkIndex;                    /* index of the unknown word <unk> */
    VocabIndex _ssIndex;                     /* index of the sentence-start word <s> */
    VocabIndex _seIndex;                     /* index of the sentence-end word </s> */
    VocabIndex _pauseIndex;                  /* index of the pause word -pau- */
    Boolean _unkIsWord;                      /* whether <unk> is treated as a regular word */
    Boolean _toLower;                        /* whether word strings are mapped to lowercase */
    VocabString _metaTag;                    /* prefix of all meta-tags */
};

说明:
a. Vocab 类中的接口函数主要分为五大类
   1) 基本操作 : 实现单词的增，删，找等基本操作
   2) NonEvent 词处理 : 实现 NonEvent word 的增，删，找等操作
   3) meta-tag 词处理 : 实现 meta-tag 和其相对应的 type 之间的转换
   4) 单词序列处理    : 实现单词序列的转换(单词串<->索引), 拷贝, 比较等操作
   5) 其他杂项

b. 对于一个单词串序列 VocabString* word, 以指针 NULL 指示结束；对于单词索引序列
VocabIndex* word, 以 Vocab_None 指示结束位置

c. 所有的单词串都存放在 byName, byIndex 两个映射表中， NonEvent 词和 meta-tag
分别存放在 nonEventMap 映射表和 metaTagMap 映射表中, 但是存放的是索引，不是单
词串。

-------------------
3、主要函数功能说明
-------------------

2.1) Vocab 类主要接口

2.1.1) 添加单词
<src>
/*
 * Add the word name to the vocabulary and return its index.
 * (Abridged excerpt: lines reading "..." stand for code omitted here.)
 */
VocabIndex Vocab::addWord(VocabString name)
{
    /* Insert the word into the regular word table and get its index slot */
    VocabIndex *indexPtr = byName.insert(name, found);
    ...
    *indexPtr = nextIndex;
    ...
    byIndex[nextIndex] = byName.getInternalKey(name);
    ...

    /* If the new word is a meta-tag, also record it in metaTagMap */
    if (_metaTag != 0) {
        unsigned metaTagLength = strlen(_metaTag);

        if (strncmp(name, _metaTag, metaTagLength) == 0) {
            /* -1 means "no valid type found", so nothing is recorded */
            int type = -1;
            if (name[metaTagLength] == '\0') {
                /* the bare prefix itself is the type-0 meta-tag */
                type = 0;
            } else {
                /* NOTE(review): "%u" is paired with an int* here; kept as
                 * in the excerpt, but the types should be made to match. */
                sscanf(&name[metaTagLength], "%u", &type);
            }
            if (type >= 0) {
                *metaTagMap.insert(nextIndex) = type;
            }
        }
    }

    return nextIndex++;
}
</src>

Vocab 类中定义了字符串 _metaTag, 任何以 _metaTag 为前缀的单词都将被当作
meta-tag 处理，每个meta-tag 字符串都是由 _metaTag + type 来表示，其中 type 为
一整数，表示对应的单词频率。例如当 _metaTag = "__META__" 的时候, "__META__10"
对应于频率为10 的所有单词。这种 meta-tag 用于处理 count-of-count, 在语言模型的
平滑中将用到。

2.1.2) 添加单词别名映射

参数说明：
@param word 已经存在于vocab 中的单词
@param name 单词的别名

<src>
/*
 * Map the string name onto the index of the existing word `word`.
 * Returns word on success, or Vocab_None if no word is stored under
 * that index.
 */
VocabIndex
Vocab::addWordAlias(VocabIndex word, VocabString name)
{
    /* Only a word that is already present can be given an alias */
    if (byIndex[word] == 0) {
        return Vocab_None;
    }

    /* An alias identical to the word's own string needs no work; return
     * before remove() below is applied to the word's own string */
    if (strcmp(name, byIndex[word]) == 0) {
        return word;
    }

    /* Drop any existing entry for name so the string is free to be
     * pointed at word */
    remove(name);

    /* Enter name in the string -> index table under the index of word */
    *byName.insert(name) = word;
    return word;
}
</src>

该函数把不同的单词串映射到同一个 VocabIndex, 从而建立起单词之间的映射关系。在
对单词进行统计的时候是按照 VocabIndex 进行的，因而单词和被映射的别名将被当作一
个单词项处理。例如，在处理英语单词 cat 的复数 cats 的时候，建立映射关系
cats --> cat, 然后再进行词频统计。

2.1.3) 获取相应频率的 meta-tag

参数说明：
@param type meta-tag 对应的频率，

<src>
/*
 * Return the index of the meta-tag for the given type (frequency).
 * Type 0 corresponds to the bare prefix _metaTag; any other type k to
 * the prefix followed by k in decimal.  Returns Vocab_None when no
 * meta-tag prefix is set.
 */
VocabIndex
Vocab::metaTagOfType(unsigned type)
{
    /* No prefix configured */
    if (_metaTag == 0) {
        return Vocab_None;
    }

    /* Type 0 is represented by the prefix alone */
    if (type == 0) {
        return getIndex(_metaTag);
    }

    /* Build "<prefix><type>" and look that string up;
     * 20 extra chars leave room for the digits and the terminating NUL */
    makeArray(char, tagName, strlen(_metaTag) + 20);
    sprintf(tagName, "%s%u", _metaTag, type);
    return getIndex(tagName);
}
</src>

该函数对于给定整数 k, 先构造对应的元标签字符串 __META__k, 再通过 getIndex() 在词表
中查找并返回其索引(VocabIndex)；若该标签尚未加入词表，则返回 Vocab_None。例如，当
前缀串 _metaTag 为 "__META__" 的时候, 如果 k=3, 则对应的 meta-tag 为 "__META__3",
表示该 meta-tag 关联到频率为 3 的所有单词, k=0 时是一个特例，直接对应于 "__META__"。

makeArray(char, tagName, strlen(_metaTag) + 20) 是一个宏，大多情况下等价于
数组定义 char tagName[strlen(_metaTag) + 20];

/* vi: set ts=4 sw=4 et: */