QT实现Double-Array Trie

来源:互联网 发布:硕鼠有mac版吗 编辑:程序博客网 时间:2024/05/16 18:56

联系作者 qq 843230304

双数组Trie树(DoubleArrayTrie)是一种空间复杂度低的Trie树,应用于字符区间大的语言(如中文、日文等)分词领域。

双数组Trie(Double-Array Trie)结构由日本人JUN-ICHI AOE于1989年提出,是Trie结构的压缩形式,仅用两个线性数组来表示Trie树。该结构有效结合了数字搜索树(Digital Search Tree)检索时间高效的特点和链式表示的Trie空间结构紧凑的特点。双数组Trie的本质是一个确定有限状态自动机(DFA),每个节点代表自动机的一个状态,根据输入变量的不同进行状态转移,当到达结束状态或无法转移时,完成一次查询操作。双数组中所有键所包含的字符之间的联系都通过简单的数学加法运算来表示,不仅提高了检索速度,而且省去了链式结构中使用的大量指针,节省了存储空间。
——《基于双数组Trie树算法的字典改进和实现》

Trie树主要应用在信息检索领域,非常高效。今天我们讲Double-Array Trie,请先把Trie树忘掉,把信息检索忘掉,我们来讲一个确定有限自动机(deterministic finite automaton,DFA)的故事。所谓“确定有限自动机”是指给定一个状态和一个变量时,它能跳转到的下一个状态也就确定下来了,同时状态的数量是有限的。请注意这里出现了两个名词,一个是“状态”,一个是“变量”,下文会举例说明这两个名词的含义。

头文件

#ifndef DOUBLEARRAYTRIE_H#define DOUBLEARRAYTRIE_H#include <QList>#include <QVector>#include <QStringList>#include "pinyinresource.h"class DoubleArrayTrie{public:    DoubleArrayTrie();    ~DoubleArrayTrie();   int exactMatchSearch(QString key);   int exactMatchSearch(QString key, int pos, int len, int nodePos);   QList<int> commonPrefixSearch(QString key);   QList<int> commonPrefixSearch(QString key, int pos, int len, int nodePos);    // debug    void dump();    int build(QStringList key);    int build(QStringList _key, int *_length, int *_value, int _keySize);    void clear();    int getUnitSize();    int getSize();    int getTotalSize();    int getNonzeroSize();public:    int error_;private:    struct Node{        int code;        int depth;        int left;        int right;    };    QVector<int> *check;    QVector<int> *base;    QVector<bool> *used;    int size;    int allocSize;    QStringList key;    int keySize;    int *length;    int *value;    int progress;    int nextCheckPos;private:    /// \brief 重置数组大小    /// \param newSize 大小数值    /// \return    int resize(int newSize);    /// \brief 获取    /// \param parent    /// \param siblings    /// \return    int fetch(Node parent, QList<Node> &siblings);    /// \brief 插入    /// \param siblings    /// \return    int insert(QList<Node> siblings);};#endif // DOUBLEARRAYTRIE_H

源文件

#include "doublearraytrie.h"
#include <QDebug>
#include <QFile>
#include <QDataStream>

const static int BUF_SIZE = 65536 * 32;
const static int UNIT_SIZE = sizeof(int) * 2; // size of int + int

// True when `index` addresses an existing slot in both arrays.
static bool isValidSlot(const QVector<int> *base, const QVector<int> *check, int index)
{
    return base != Q_NULLPTR && check != Q_NULLPTR
           && index >= 0 && index < base->size() && index < check->size();
}

// Returns the value stored at the terminal slot of state `b`, or -1 if the
// state does not end a key. Stored values are encoded as -(value + 1), so a
// decoded value is never negative and -1 is an unambiguous "none".
static int terminalValueAt(const QVector<int> *base, const QVector<int> *check, int b)
{
    if (!isValidSlot(base, check, b) || check->at(b) != b)
        return -1;
    const int n = base->at(b);
    return (n < 0) ? (-n - 1) : -1;
}

/// Allocates the three working arrays with BUF_SIZE zeroed slots each.
DoubleArrayTrie::DoubleArrayTrie()
    : error_(0),
      check(new QVector<int>(BUF_SIZE)),
      base(new QVector<int>(BUF_SIZE)),
      used(new QVector<bool>(BUF_SIZE)),
      size(0),
      allocSize(0),
      keySize(0),
      length(Q_NULLPTR),
      value(Q_NULLPTR),
      progress(0),
      nextCheckPos(0)
{
}

/// Releases the arrays and the length/value arrays handed over to build().
DoubleArrayTrie::~DoubleArrayTrie()
{
    // delete on a null pointer is a no-op, so no explicit checks are needed.
    delete check;
    check = Q_NULLPTR;
    delete base;
    base = Q_NULLPTR;
    delete used;
    used = Q_NULLPTR;
    // length/value are indexed as arrays, so they must be released with delete[].
    delete[] length;
    length = Q_NULLPTR;
    delete[] value;
    value = Q_NULLPTR;
}

/// Looks up the whole of `key` from the root; returns its value or -1.
int DoubleArrayTrie::exactMatchSearch(QString key)
{
    return exactMatchSearch(key, 0, 0, 0);
}

/// Looks up key[pos .. len) starting at node `nodePos`.
/// `len` is an end index (exclusive); values <= 0 or beyond the key mean
/// key.length(). `nodePos` <= 0 means the root.
/// Returns the stored value, or -1 when the range is not a stored key.
int DoubleArrayTrie::exactMatchSearch(QString key, int pos, int len, int nodePos)
{
    if (len <= 0 || len > key.length())
        len = key.length();
    if (pos < 0)
        pos = 0;
    if (nodePos <= 0)
        nodePos = 0;

    // An emptied trie has no root slot to start from.
    if (!isValidSlot(base, check, nodePos))
        return -1;

    const QChar *keyChars = key.constData();
    int b = base->at(nodePos);

    for (int i = pos; i < len; i++) {
        const int p = b + static_cast<int>(keyChars[i].unicode()) + 1;
        // A transition exists only if slot p is inside the arrays and is
        // owned by the current state.
        if (!isValidSlot(base, check, p) || check->at(p) != b)
            return -1;
        b = base->at(p);
    }

    return terminalValueAt(base, check, b);
}

/// Returns the values of all stored keys that are prefixes of `key`.
QList<int> DoubleArrayTrie::commonPrefixSearch(QString key)
{
    return commonPrefixSearch(key, 0, 0, 0);
}

/// Prefix search over key[pos .. len) starting at node `nodePos`; parameters
/// are interpreted as in exactMatchSearch(). Results are ordered by
/// increasing prefix length.
QList<int> DoubleArrayTrie::commonPrefixSearch(QString key, int pos, int len, int nodePos)
{
    QList<int> result;

    if (len <= 0 || len > key.length())
        len = key.length();
    if (pos < 0)
        pos = 0;
    if (nodePos <= 0)
        nodePos = 0;

    if (!isValidSlot(base, check, nodePos))
        return result;

    const QChar *keyChars = key.constData();
    int b = base->at(nodePos);

    for (int i = pos; i < len; i++) {
        // Does the prefix consumed so far end a stored key?
        const int found = terminalValueAt(base, check, b);
        if (found >= 0)
            result.append(found);

        const int p = b + static_cast<int>(keyChars[i].unicode()) + 1;
        if (!isValidSlot(base, check, p) || check->at(p) != b)
            return result;
        b = base->at(p);
    }

    // The full range itself may also be a stored key.
    const int found = terminalValueAt(base, check, b);
    if (found >= 0)
        result.append(found);

    return result;
}

/// Debug helper: prints every used slot as "i: <index> [<base>, <check>]".
void DoubleArrayTrie::dump()
{
    // base/check are pointers to vectors: elements must be read through
    // at(), not with pointer indexing.
    for (int i = 0; i < size && i < base->size() && i < check->size(); i++) {
        qDebug() << "i: " << i << " [" << base->at(i) << ", " << check->at(i) << "]";
    }
}

/// Resizes all arrays to `newSize` slots and records the new allocation size.
int DoubleArrayTrie::resize(int newSize)
{
    base->resize(newSize);
    check->resize(newSize);
    used->resize(newSize);
    return allocSize = newSize;
}

/// Collects the children of `parent` into `siblings`.
/// Scans keys [parent.left, parent.right) and groups them by the character at
/// index parent.depth; a key that ends exactly at that depth yields the
/// terminal child with code 0. Sets error_ to -3 if the codes are not in
/// ascending order. Returns the number of children (0 if an error is pending).
int DoubleArrayTrie::fetch(Node parent, QList<Node> &siblings)
{
    if (error_ < 0)
        return 0;

    int prev = 0;

    for (int i = parent.left; i < parent.right; i++) {
        const QString &word = key.at(i);
        const int wordLen = (length != Q_NULLPTR) ? length[i] : word.length();
        if (wordLen < parent.depth)
            continue;

        // Code 0 marks "key ends here"; real characters are shifted by one.
        int cur = 0;
        if (wordLen != parent.depth)
            cur = static_cast<int>(word.at(parent.depth).unicode()) + 1;

        if (prev > cur) {
            error_ = -3;
            return 0;
        }

        if (cur != prev || siblings.isEmpty()) {
            Node child;
            child.depth = parent.depth + 1;
            child.code = cur;
            child.left = i;
            child.right = parent.right; // narrowed when the next child starts
            if (!siblings.isEmpty())
                siblings.last().right = i;
            siblings.append(child);
        }

        prev = cur;
    }

    if (!siblings.isEmpty())
        siblings.last().right = parent.right;

    return siblings.size();
}

/// Finds a base offset at which every sibling's slot is free, claims those
/// slots, then recurses into each sibling's children.
/// Returns the chosen offset, or 0 if `siblings` is empty or an error is
/// pending. Sets error_ to -2 if a supplied value is negative.
int DoubleArrayTrie::insert(QList<Node> siblings)
{
    if (error_ < 0)
        return 0;
    if (siblings.isEmpty())
        return 0;

    const int firstCode = siblings.first().code;
    const int lastCode = siblings.last().code;

    int begin = 0;
    int pos = qMax(firstCode + 1, nextCheckPos) - 1;
    int nonzero_num = 0;
    bool sawFreeSlot = false;

    if (allocSize <= pos)
        resize(pos + 1);

    while (true) {
        pos++;

        if (allocSize <= pos)
            resize(pos + 1);

        if (check->at(pos) != 0) {
            nonzero_num++;
            continue;
        }
        if (!sawFreeSlot) {
            // Remember the first free slot as the next starting point.
            nextCheckPos = pos;
            sawFreeSlot = true;
        }

        begin = pos - firstCode;
        if (allocSize <= begin + lastCode) {
            // progress can be zero
            const double growth = qMax(1.05, 1.0 * keySize / (progress + 1));
            // Grow by the heuristic factor, but never less than what this
            // sibling group needs.
            resize(qMax(static_cast<int>(allocSize * growth), begin + lastCode + 1));
        }

        if (used->at(begin))
            continue;

        // The first sibling's slot is free; the remaining ones must be too.
        bool collision = false;
        for (int i = 1; i < siblings.size(); i++) {
            if (check->at(begin + siblings[i].code) != 0) {
                collision = true;
                break;
            }
        }
        if (!collision)
            break;
    }

    // -- Simple heuristics --
    // If at least 95% of the slots scanned between nextCheckPos and pos were
    // occupied, start the next search from pos instead.
    if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95)
        nextCheckPos = pos;

    used->replace(begin, true);
    size = qMax(size, begin + lastCode + 1);

    // Claim all sibling slots before recursing so children cannot take them.
    for (int i = 0; i < siblings.size(); i++)
        check->replace(begin + siblings[i].code, begin);

    for (int i = 0; i < siblings.size(); i++) {
        QList<Node> new_siblings;
        const int slot = begin + siblings[i].code;

        if (fetch(siblings[i], new_siblings) == 0) {
            // Leaf: store the value encoded as -(value + 1).
            const int encoded = (value != Q_NULLPTR) ? (-value[siblings[i].left] - 1)
                                                     : (-siblings[i].left - 1);
            base->replace(slot, encoded);

            if (value != Q_NULLPTR && encoded >= 0) {
                // A negative user value cannot be encoded.
                error_ = -2;
                return 0;
            }

            progress++;
        } else {
            const int h = insert(new_siblings);
            base->replace(slot, h);
        }
    }

    return begin;
}

/// Empties the arrays and resets the size counters.
void DoubleArrayTrie::clear()
{
    check->clear();
    base->clear();
    // build() releases `used` when it finishes, so it may be null here.
    if (used != Q_NULLPTR)
        used->clear();
    allocSize = 0;
    size = 0;
}

/// Bytes per array unit (one base entry plus one check entry).
int DoubleArrayTrie::getUnitSize()
{
    return UNIT_SIZE;
}

/// Number of array slots in use.
int DoubleArrayTrie::getSize()
{
    return size;
}

/// Used slots multiplied by the unit size.
int DoubleArrayTrie::getTotalSize()
{
    return size * UNIT_SIZE;
}

/// Counts the used slots whose check entry is non-zero.
int DoubleArrayTrie::getNonzeroSize()
{
    int result = 0;
    for (int i = 0; i < size && i < check->size(); i++) {
        if (check->at(i) != 0)
            result++;
    }
    return result;
}

/// Builds the trie from `key`, using each key's list index as its value.
int DoubleArrayTrie::build(QStringList key)
{
    return build(key, Q_NULLPTR, Q_NULLPTR, key.size());
}

/// Builds the trie from the first `_keySize` entries of `_key`.
/// `_length` / `_value` are optional per-key arrays (null to use the full key
/// length / the key index). The trie keeps both pointers and releases them
/// with delete[], so they must come from new[].
/// Returns 0 without touching any state when `_key` is empty or `_keySize`
/// exceeds its size (the arrays are NOT taken over in that case); otherwise
/// returns error_ (0 on success, negative on failure).
int DoubleArrayTrie::build(QStringList _key, int *_length, int *_value, int _keySize)
{
    if (_keySize > _key.size() || _key.size() == 0)
        return 0;

    key = _key;
    keySize = _keySize;

    // Release arrays from a previous build, unless the caller passed the
    // very same arrays again.
    if (length != _length)
        delete[] length;
    if (value != _value)
        delete[] value;
    length = _length;
    value = _value;

    progress = 0;
    error_ = 0; // a stale error would make fetch()/insert() do nothing

    // `used` is released at the end of every build; recreate it for a rebuild.
    if (used == Q_NULLPTR)
        used = new QVector<bool>();

    // Drop slots left over from a previous build, then start from zeroed arrays.
    clear();
    resize(BUF_SIZE);

    base->replace(0, 1);
    nextCheckPos = 0;

    Node root_node;
    root_node.code = 0;
    root_node.left = 0;
    root_node.right = keySize;
    root_node.depth = 0;

    QList<Node> siblings;
    fetch(root_node, siblings);
    insert(siblings);

    // `used` is needed only during construction.
    delete used;
    used = Q_NULLPTR;
    key.clear();

    return error_;
}
原创粉丝点击