QT实现Double-Array Trie
来源:互联网 发布:硕鼠有mac版吗 编辑:程序博客网 时间:2024/05/16 18:56
联系作者 qq 843230304
双数组Trie树(DoubleArrayTrie)是一种空间复杂度低的Trie树,应用于字符区间大的语言(如中文、日文等)分词领域。
双数组Trie(Double-Array Trie)结构由日本人Jun-ichi Aoe于1989年提出,是Trie结构的压缩形式,仅用两个线性数组来表示Trie树,该结构有效结合了数字搜索树(Digital Search Tree)检索时间高效的特点和链式表示的Trie空间结构紧凑的特点。双数组Trie的本质是一个确定有限状态自动机(DFA),每个节点代表自动机的一个状态,根据变量不同,进行状态转移,当到达结束状态或无法转移时,完成一次查询操作。在双数组中,所有键所包含的字符之间的联系都通过简单的数学加法运算来表示,不仅提高了检索速度,而且省去了链式结构中使用的大量指针,节省了存储空间。
——《基于双数组Trie树算法的字典改进和实现》
Trie树主要应用在信息检索领域,非常高效。今天我们讲Double Array Trie,请先把Trie树忘掉,把信息检索忘掉,我们来讲一个确定有限自动机(deterministic finite automaton,DFA)的故事。所谓“确定有限自动机”是指给定一个状态和一个变量时,它能跳转到的下一个状态也就确定下来了,同时状态是有限的。请注意这里出现两个名词,一个是“状态”,一个是“变量”,下文会举例说明这两个名词的含义。
头文件
#ifndef DOUBLEARRAYTRIE_H#define DOUBLEARRAYTRIE_H#include <QList>#include <QVector>#include <QStringList>#include "pinyinresource.h"class DoubleArrayTrie{public: DoubleArrayTrie(); ~DoubleArrayTrie(); int exactMatchSearch(QString key); int exactMatchSearch(QString key, int pos, int len, int nodePos); QList<int> commonPrefixSearch(QString key); QList<int> commonPrefixSearch(QString key, int pos, int len, int nodePos); // debug void dump(); int build(QStringList key); int build(QStringList _key, int *_length, int *_value, int _keySize); void clear(); int getUnitSize(); int getSize(); int getTotalSize(); int getNonzeroSize();public: int error_;private: struct Node{ int code; int depth; int left; int right; }; QVector<int> *check; QVector<int> *base; QVector<bool> *used; int size; int allocSize; QStringList key; int keySize; int *length; int *value; int progress; int nextCheckPos;private: /// \brief 重置数组大小 /// \param newSize 大小数值 /// \return int resize(int newSize); /// \brief 获取 /// \param parent /// \param siblings /// \return int fetch(Node parent, QList<Node> &siblings); /// \brief 插入 /// \param siblings /// \return int insert(QList<Node> siblings);};#endif // DOUBLEARRAYTRIE_H
源文件
#include "doublearraytrie.h"

#include <QDebug>
#include <QFile>
#include <QDataStream>

// Number of slots allocated up front and at the start of every build.
const static int BUF_SIZE = 65536 * 32;
// Bytes per trie unit: one base entry plus one check entry.
const static int UNIT_SIZE = sizeof(int) * 2;

/// \brief True when \a index addresses one of the first \a limit slots.
static inline bool slotInRange(int index, int limit)
{
    return index >= 0 && index < limit;
}

/// \brief Creates an empty trie with BUF_SIZE zero-initialised slots per array.
DoubleArrayTrie::DoubleArrayTrie()
{
    check = new QVector<int>(BUF_SIZE);
    base = new QVector<int>(BUF_SIZE);
    used = new QVector<bool>(BUF_SIZE);
    size = 0;
    allocSize = 0;
    keySize = 0;
    length = Q_NULLPTR;
    value = Q_NULLPTR;
    progress = 0;
    nextCheckPos = 0;
    error_ = 0;
}

/// \brief Frees the arrays and the length/value arrays adopted by build().
DoubleArrayTrie::~DoubleArrayTrie()
{
    delete check;
    check = Q_NULLPTR;
    delete base;
    base = Q_NULLPTR;
    delete used;
    used = Q_NULLPTR;

    // length/value are arrays handed over by build(), so they need the
    // array form of delete. NOTE(review): this assumes callers allocate
    // them with new[] - confirm against the call sites.
    delete[] length;
    length = Q_NULLPTR;
    delete[] value;
    value = Q_NULLPTR;
}

/// \brief Looks up the whole of \a key from the root.
/// \return the stored value, or -1 when \a key is not in the trie
int DoubleArrayTrie::exactMatchSearch(QString key)
{
    return exactMatchSearch(key, 0, 0, 0);
}

/// \brief Looks up key[pos .. len) starting from the state stored at \a nodePos.
/// \param key     string to look up
/// \param pos     index of the first code unit to consume (negative: 0)
/// \param len     exclusive end index into \a key; <= 0 or past the end
///                means key.length()
/// \param nodePos slot holding the start state; <= 0 means the root
/// \return the stored value, or -1 when there is no match
int DoubleArrayTrie::exactMatchSearch(QString key, int pos, int len, int nodePos)
{
    if (len <= 0 || len > key.length())
        len = key.length();
    if (pos < 0)
        pos = 0;
    if (nodePos <= 0)
        nodePos = 0;

    const int notFound = -1;
    // Every slot index is validated against this limit: characters that
    // never occurred in a key can point past the arrays, and the arrays are
    // empty after clear().
    const int limit = static_cast<int>(qMin(base->size(), check->size()));
    if (!slotInRange(nodePos, limit))
        return notFound;

    const QChar *keyChars = key.constData();
    int b = base->at(nodePos);

    for (int i = pos; i < len; i++) {
        const int p = b + static_cast<int>(keyChars[i].unicode()) + 1;
        if (!slotInRange(p, limit) || b != check->at(p))
            return notFound;
        b = base->at(p);
    }

    // The end-of-key marker lives at offset 0 of the current state.
    if (!slotInRange(b, limit))
        return notFound;
    const int n = base->at(b);
    if (b == check->at(b) && n < 0)
        return -n - 1;
    return notFound;
}

/// \brief Collects the values of all stored keys that are prefixes of \a key.
QList<int> DoubleArrayTrie::commonPrefixSearch(QString key)
{
    return commonPrefixSearch(key, 0, 0, 0);
}

/// \brief Prefix search over key[pos .. len) from the state stored at \a nodePos.
///
/// \a pos, \a len and \a nodePos are interpreted as in exactMatchSearch().
/// \return the values of all matching prefixes, shortest prefix first
QList<int> DoubleArrayTrie::commonPrefixSearch(QString key, int pos, int len, int nodePos)
{
    if (len <= 0 || len > key.length())
        len = key.length();
    if (pos < 0)
        pos = 0;
    if (nodePos <= 0)
        nodePos = 0;

    QList<int> result;
    const int limit = static_cast<int>(qMin(base->size(), check->size()));
    if (!slotInRange(nodePos, limit))
        return result;

    const QChar *keyChars = key.constData();
    int b = base->at(nodePos);

    for (int i = pos; i < len; i++) {
        // Report a key that ends at the current state before moving on.
        if (slotInRange(b, limit)) {
            const int n = base->at(b);
            if (b == check->at(b) && n < 0)
                result.append(-n - 1);
        }

        const int p = b + static_cast<int>(keyChars[i].unicode()) + 1;
        if (!slotInRange(p, limit) || b != check->at(p))
            return result;
        b = base->at(p);
    }

    // A key may also end exactly at the last consumed code unit.
    if (slotInRange(b, limit)) {
        const int n = base->at(b);
        if (b == check->at(b) && n < 0)
            result.append(-n - 1);
    }
    return result;
}

/// \brief Debug helper: prints the base/check pair of every slot below size.
void DoubleArrayTrie::dump()
{
    // base and check are pointers to vectors: elements must be read through
    // ->at(); indexing the pointer itself would step past the vector object.
    const int limit = static_cast<int>(qMin(base->size(), check->size()));
    const int count = qMin(size, limit);
    for (int i = 0; i < count; i++) {
        qDebug() << "i: " << i << " [" << base->at(i) << ", " << check->at(i) << "]";
    }
}

/// \brief Resizes all three arrays to \a newSize slots.
/// \return the new allocated size
int DoubleArrayTrie::resize(int newSize)
{
    base->resize(newSize);
    check->resize(newSize);
    used->resize(newSize);
    return allocSize = newSize;
}

/// \brief Collects the distinct children of \a parent into \a siblings.
///
/// Scans keys [parent.left, parent.right) and emits one node per distinct
/// code at position parent.depth; a key that ends exactly at that depth
/// yields code 0. Sets error_ to -3 when the keys are not in ascending order.
/// \return the number of children (0 for a leaf or when an error is pending)
int DoubleArrayTrie::fetch(Node parent, QList<Node> &siblings)
{
    if (error_ < 0)
        return 0;

    int prev = 0;

    for (int i = parent.left; i < parent.right; i++) {
        const QString &current = key.at(i);
        const int currentLength = (length != Q_NULLPTR) ? length[i] : current.length();

        // Keys shorter than the parent's depth cannot pass through it.
        if (currentLength < parent.depth)
            continue;

        int cur = 0;
        if (currentLength != parent.depth)
            cur = static_cast<int>(current.at(parent.depth).unicode()) + 1;

        if (prev > cur) {
            error_ = -3;
            return 0;
        }

        if (cur != prev || siblings.isEmpty()) {
            Node child;
            child.depth = parent.depth + 1;
            child.code = cur;
            child.left = i;
            // Starting a new child closes the key range of the previous one.
            if (!siblings.isEmpty())
                siblings.last().right = i;
            siblings.append(child);
        }

        prev = cur;
    }

    if (!siblings.isEmpty())
        siblings.last().right = parent.right;

    return siblings.size();
}

/// \brief Places \a siblings in the arrays and recursively inserts their subtrees.
///
/// Scans upward from nextCheckPos for an origin \c begin such that every
/// slot begin + code is free, claims those slots, then fills in each
/// child's base: a negative value for a leaf, or the origin returned by the
/// recursive call for an inner node.
/// \return the chosen origin, or 0 on empty input or when an error is pending
int DoubleArrayTrie::insert(QList<Node> siblings)
{
    if (error_ < 0)
        return 0;
    if (siblings.length() <= 0)
        return 0;

    // Read-only view: at() never detaches the implicitly shared list.
    const QList<Node> &nodes = siblings;
    const int count = nodes.size();
    const int firstCode = nodes.at(0).code;
    const int lastCode = nodes.at(count - 1).code;

    int begin = 0;
    int pos = ((firstCode + 1 > nextCheckPos) ? firstCode + 1 : nextCheckPos) - 1;
    int nonzero_num = 0;
    bool firstFreeSeen = false;

    if (allocSize <= pos)
        resize(pos + 1);

    for (;;) {
        pos++;

        if (allocSize <= pos)
            resize(pos + 1);

        if (check->at(pos) != 0) {
            nonzero_num++;
            continue;
        }
        if (!firstFreeSeen) {
            nextCheckPos = pos;
            firstFreeSeen = true;
        }

        begin = pos - firstCode;
        if (allocSize <= begin + lastCode) {
            // progress can be zero, hence the + 1.
            const double ratio = 1.0 * keySize / (progress + 1);
            const double factor = (1.05 > ratio) ? 1.05 : ratio;
            const int grown = static_cast<int>(allocSize * factor);
            // Never grow to less than what the probes below will touch.
            resize(qMax(grown, begin + lastCode + 1));
        }

        if (used->at(begin))
            continue;

        bool collision = false;
        for (int i = 1; i < count; i++) {
            if (check->at(begin + nodes.at(i).code) != 0) {
                collision = true;
                break;
            }
        }
        if (!collision)
            break;
    }

    // -- Simple heuristics --
    // If the share of occupied check slots between nextCheckPos and pos is
    // at least 0.95, later scans start from pos instead.
    if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95)
        nextCheckPos = pos;

    used->replace(begin, true);
    if (size < begin + lastCode + 1)
        size = begin + lastCode + 1;

    // Claim every child slot before recursing so that nested calls see them
    // as occupied.
    for (int i = 0; i < count; i++)
        check->replace(begin + nodes.at(i).code, begin);

    for (int i = 0; i < count; i++) {
        const Node &child = nodes.at(i);
        QList<Node> new_siblings;

        if (fetch(child, new_siblings) == 0) {
            // Leaf: store the key's value as -value - 1.
            const int encoded = (value != Q_NULLPTR) ? (-value[child.left] - 1)
                                                     : (-child.left - 1);
            base->replace(begin + child.code, encoded);

            if (value != Q_NULLPTR && encoded >= 0) {
                error_ = -2;
                return 0;
            }

            progress++;
        } else {
            const int h = insert(new_siblings);
            base->replace(begin + child.code, h);
        }
    }
    return begin;
}

/// \brief Empties the arrays and resets the size counters.
void DoubleArrayTrie::clear()
{
    check->clear();
    base->clear();
    used->clear();
    allocSize = 0;
    size = 0;
}

/// \brief Size in bytes of one trie unit.
int DoubleArrayTrie::getUnitSize()
{
    return UNIT_SIZE;
}

/// \brief Number of slots in use.
int DoubleArrayTrie::getSize()
{
    return size;
}

/// \brief Number of slots in use multiplied by the unit size.
int DoubleArrayTrie::getTotalSize()
{
    return size * UNIT_SIZE;
}

/// \brief Counts the slots below size whose check entry is non-zero.
int DoubleArrayTrie::getNonzeroSize()
{
    const int count = qMin(size, static_cast<int>(check->size()));
    int result = 0;
    for (int i = 0; i < count; i++) {
        if (check->at(i) != 0)
            result++;
    }
    return result;
}

/// \brief Builds the trie from \a key, using each key's index as its value.
int DoubleArrayTrie::build(QStringList key)
{
    return build(key, Q_NULLPTR, Q_NULLPTR, key.size());
}

/// \brief Builds the trie from the first \a _keySize entries of \a _key.
/// \param _key     keys in ascending code-unit order
/// \param _length  optional per-key lengths, or null
/// \param _value   optional per-key values, or null
/// \param _keySize number of keys to use; must not exceed _key.size()
/// \return 0 when the arguments are rejected, otherwise error_ (0 on success,
///         -2 for a negative value, -3 for unsorted keys)
/// \note On an accepted call the trie adopts \a _length and \a _value and
///       releases them with delete[]; NOTE(review): this assumes callers
///       allocate them with new[] - confirm against the call sites.
int DoubleArrayTrie::build(QStringList _key, int *_length, int *_value, int _keySize)
{
    if (_keySize > _key.size() || _key.size() == 0)
        return 0;

    key = _key;
    keySize = _keySize;

    // Release the arrays of a previous build, unless the caller handed the
    // very same arrays in again.
    if (length != _length)
        delete[] length;
    if (value != _value)
        delete[] value;
    length = _length;
    value = _value;

    // Start from a clean state so a rebuild cannot pick up slots or an
    // error code left behind by an earlier build.
    progress = 0;
    size = 0;
    error_ = 0;
    base->fill(0, BUF_SIZE);
    check->fill(0, BUF_SIZE);
    used->fill(false, BUF_SIZE);
    allocSize = BUF_SIZE;

    base->replace(0, 1);
    nextCheckPos = 0;

    Node root_node;
    root_node.code = 0;
    root_node.left = 0;
    root_node.right = keySize;
    root_node.depth = 0;

    QList<Node> siblings;
    fetch(root_node, siblings);
    insert(siblings);

    // The used flags are only needed while building. Drop their storage but
    // keep the vector object, so clear() and a later build() stay valid.
    used->clear();
    used->squeeze();
    key.clear();

    return error_;
}
阅读全文
0 0
- QT实现Double-Array Trie
- double-array trie算法实现
- double array trie
- 什么是Double Array Trie
- 深入double array trie
- double array trie
- Double Array Trie
- Double Array Trie
- Double Array Trie
- double-array trie 译文+心得
- double-array trie 译文+心得
- Trie树(压缩Trie树及Double-Array Trie)
- Double-Array Trie分词词典简述 [转]
- Double-Array Trie分词词典简述 [转]
- Double-Array Trie分词词典简述(zz)
- Double-Array Trie分词词典简述
- 【基础知识】An Implementation of Double-Array Trie
- 【基础知识】Darts: Double-ARray Trie System
- TF Learn入门 —— 稍复杂使用举例
- 云平台建设学习4
- offset系列、client系列、scroll系列的属性
- 大数据可视化分析平台新应用:提升企业的数字营销策略
- VS番茄插件贼好用
- QT实现Double-Array Trie
- Qt 带自定义数据结构参数的槽函数连接失败
- 微信小程序页面跳转(五)
- CODE[VS]2996 Comrade 同志
- 【麦子学院】03.web前端开发之convas画布
- 程序员获取新编程技能的5个技巧?
- 代码命名大小写规范(Java,Python)
- 如何用shell统计当前目录下子目录的大小
- PL/SQL Developer连接64位的Oracle(Oracle在本地已经安装)