Double-Array Trie 的 C/C++ 代码实现 —— An Efficient Implementation of Trie Structures

来源:互联网 发布:查看mac硬盘使用 编辑:程序博客网 时间:2024/05/16 13:57

这些代码已经通过 stress test,但是因为 set_list 函数还有待改进,所以 insert_word 仍然会占用很长的时间,后续会进一步改进。算法来源于论文 An Efficient Implementation of Trie Structures,作者貌似是个日本人。先贴上代码,以供喜欢 double-array trie 的人研究。我看了 libdatrie 的源码,比这个复杂,但是原理是一样的。近期会给出原文章的翻译。http://blog.csdn.net/zzran/article/details/8462002

// Double-array trie with a separate tail buffer.
//
// Layout used by this file:
//   * BC is one int array holding two logical arrays interleaved:
//       BC[2 * n]     -> base(n)
//       BC[2 * n + 1] -> check(n)
//   * Node 1 is the root (initialize() writes base(1) = 1 and every walk
//     starts at s = 1).
//   * A transition from node s on character code c goes to t = base(s) + c
//     and is valid only when check(t) == s.
//   * A node whose base is negative is a leaf; -base(t) is the offset in
//     TAIL where the rest of the key is stored, terminated by '#'.
//   * Every stored key is terminated by '#' (insert_word appends it).
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
using namespace std;

#define MIN_CODE 1
#define MAX_CODE 255
#define BC_INC 10
#define TAIL_INC 10
#define TEMP_INC 5
#define CHAR_NUM 26

int *BC;        // interleaved base/check array (see file header)
char *TAIL;     // storage for key suffixes, each terminated by '#'
char *TEMP;     // scratch buffer filled by read_tail()
int BC_POS;     // highest node index written so far
int TAIL_POS;   // next free offset in TAIL
int BC_MAX;     // number of nodes BC currently has room for
int TAIL_MAX;   // allocated size of TAIL in bytes
int TEMP_MAX;   // allocated size of TEMP in bytes

void realloc_bc();
void separate(int s, char *b, int tail_pos);
int  change_bc(int current, int s, char *list, char ch);

// Returns base(n), or 0 when n lies outside the range written so far.
int base(int n) {
    // Negative indices would read in front of the array; treat them as unused.
    if (n < 0 || n > BC_POS) {
        return 0;
    }
    cout << "read base index=" << n << ":value=" << BC[2 * n] << endl;
    return BC[2 * n];
}

// Returns check(n), or 0 when n lies outside the range written so far.
int check(int n) {
    if (n < 0 || n > BC_POS) {
        return 0;
    }
    cout << "read check index=" << n << ":value=" << BC[2 * n + 1] << endl;
    return BC[2 * n + 1];
}

// Stores `node` as base(n), growing BC and advancing BC_POS as needed.
void w_base(int n, int node) {
    while (n >= BC_MAX) {
        realloc_bc();
    }
    if (n > BC_POS) {
        BC_POS = n;
    }
    BC[2 * n] = node;
    cout << "write base index=" << n << ":value=" << BC[2 * n] << endl;
}

// Stores `node` as check(n), growing BC and advancing BC_POS as needed.
void w_check(int n, int node) {
    while (n >= BC_MAX) {
        realloc_bc();
    }
    if (n > BC_POS) {
        BC_POS = n;
    }
    BC[2 * n + 1] = node;
    cout << "write check index=" << n << ":value=" << BC[2 * n + 1] << endl;
}

// Allocates a zero-filled character buffer of `init` bytes and records the
// size in *max. `area_name` is only used in the error message.
// Terminates the program if the allocation fails.
char *mem_str(const char *area_name, int *max, int init) {
    *max = init;
    char *area = static_cast<char *>(malloc(sizeof(char) * (*max)));
    if (area == NULL) {
        cout << area_name << " malloc error!" << endl;
        exit(-1);  // continuing would dereference NULL below
    }
    // memset takes (dest, value, count); the original had value and count
    // swapped, which cleared zero bytes.
    memset(area, '\0', *max);
    return area;
}

// Maps a lowercase letter to a small arc code ('a' -> 2).
int arc_index(char ch) {
    return ch - 'a' + 2;
}

// Grows BC by BC_INC nodes and zero-fills the new part.
// Terminates the program if the reallocation fails.
void realloc_bc() {
    const int pre_bc = BC_MAX;
    const int new_max = BC_MAX + BC_INC;
    int *grown = static_cast<int *>(realloc(BC, sizeof(int) * 2 * new_max));
    if (grown == NULL) {
        cout << "realloc bc error!" << endl;
        // w_base/w_check loop until BC is large enough, so returning here
        // would either spin or write through a NULL pointer.
        exit(-1);
    }
    BC = grown;
    BC_MAX = new_max;
    for (int i = 2 * pre_bc; i < 2 * BC_MAX; i++) {
        BC[i] = 0;
    }
    cout << "realloc bc!" << endl;
}

// Grows `area` by `inc` bytes, zero-fills the new part, updates *max and
// returns the (possibly moved) buffer. Terminates the program on failure.
char *realloc_str(const char *area_name, char *area, int *max, int inc) {
    const int pre_size = *max;
    *max += inc;
    area = static_cast<char *>(realloc(area, sizeof(char) * (*max)));
    if (area == NULL) {
        cout << area_name << " realloc error!" << endl;
        exit(-1);
    }
    for (int i = pre_size; i < *max; i++) {
        area[i] = '\0';
    }
    cout << area_name << " realloc ok!" << endl;
    return area;
}

// Copies the tail stored at TAIL[p] (up to, and including, its '#'
// terminator) into TEMP as a NUL-terminated string.
void read_tail(int p) {
    int i = 0;
    // The bounds test on p keeps a missing terminator from running off TAIL.
    while (p >= 0 && p < TAIL_MAX && TAIL[p] != '#') {
        // Keep room for this character plus the trailing '#' and '\0';
        // TEMP starts at only TEMP_INC bytes.
        while (i + 2 >= TEMP_MAX) {
            TEMP = realloc_str("TEMP", TEMP, &TEMP_MAX, TEMP_INC);
        }
        TEMP[i++] = TAIL[p++];
    }
    TEMP[i++] = '#';
    TEMP[i] = '\0';
    cout << "read tail!" << endl;
}

// Writes the NUL-terminated string `temp` into TAIL starting at offset p
// (the NUL itself is not stored) and advances TAIL_POS when the write
// extends past it.
void write_tail(char *temp, int p) {
    const int length = static_cast<int>(strlen(temp));
    while (p + length >= TAIL_MAX - 1) {
        TAIL = realloc_str("TAIL", TAIL, &TAIL_MAX, TAIL_INC);
    }
    memcpy(TAIL + p, temp, length);
    if (p + length + 1 > TAIL_POS) {
        TAIL_POS = p + length;
    }
    cout << "write tail!" << endl;
}

// Finds the smallest base value (starting from 1) such that for every
// character c in `list` the slot base + c is unused (check == 0).
// An empty list yields 1.
int x_check(char *list) {
    int base_pos = 1;
    cout << "x_check start:" << endl;
    int i = 0;
    while (list[i] != '\0') {
        const unsigned char ch = static_cast<unsigned char>(list[i]);
        if (check(base_pos + ch) != 0) {
            // Slot taken: try the next base and re-test every character.
            base_pos++;
            i = 0;
        } else {
            i++;
        }
    }
    cout << "x_check end!" << endl;
    return base_pos;
}

// Returns a malloc'ed, NUL-terminated list of every character code c in
// [MIN_CODE, MAX_CODE] for which node s has a transition
// (check(base(s) + c) == s). The caller owns the buffer and must free() it.
char *set_list(int s) {
    // Up to MAX_CODE codes plus the terminator.
    char *list = static_cast<char *>(malloc(MAX_CODE + 1 + 1));
    if (list == NULL) {
        cout << "set_list malloc error!" << endl;
        exit(-1);
    }
    const int s_base = base(s);  // loop-invariant
    int j = 0;
    // Inclusive upper bound: the original stopped at MAX_CODE - 1 and so
    // could never report code 255.
    for (int i = MIN_CODE; i <= MAX_CODE; i++) {
        if (check(s_base + i) == s) {
            list[j++] = static_cast<char>(i);
        }
    }
    list[j] = '\0';
    cout << "set_list:" << list << endl;
    return list;
}

// Adds a leaf under node s for the first character of b and stores the
// remainder of b (after that character) in TAIL at tail_pos.
void separate(int s, char *b, int tail_pos) {
    const int t = base(s) + static_cast<unsigned char>(*b);
    b++;
    w_check(t, s);
    w_base(t, (-1) * tail_pos);  // negative base marks a leaf pointing into TAIL
    write_tail(b, tail_pos);
}

// Inserts the key suffix b below node s when the slot base(s) + *b does not
// belong to s. If that slot is occupied by another node's child, one of the
// two parents is relocated first (the one with fewer children to move).
void bc_insert(int s, char *b) {
    cout << "bc_insert start:" << endl;
    const int t = base(s) + static_cast<unsigned char>(*b);
    const int owner = check(t);
    cout << "t=" << t << " check(t)=" << owner << endl;
    if (owner != 0) {
        // set_list returns heap buffers; keep the pointers so they can be
        // released (the original copied them and leaked the allocations).
        char *list_s = set_list(s);
        char *list_t = set_list(owner);
        if (strlen(list_s) + 1 < strlen(list_t)) {
            cout << "list_s=" << list_s << endl;
            s = change_bc(s, s, list_s, *b);
        } else {
            cout << "list_t=" << list_t << endl;
            s = change_bc(s, owner, list_t, '\0');
        }
        free(list_s);
        free(list_t);
    }
    separate(s, b, TAIL_POS);
    cout << "bc_insert end." << endl;
}

// Relocates the children of node s (their codes are given in `list`) to a
// new base chosen by x_check. When ch != '\0' the new base is also required
// to leave the slot for ch free. Grandchildren are re-pointed at the moved
// children. Returns `current`, updated if `current` itself was one of the
// moved children.
int change_bc(int current, int s, char *list, char ch) {
    char a_list[MAX_CODE + 2];
    const int old_base = base(s);

    strcpy(a_list, list);
    if (ch != '\0') {
        const size_t n = strlen(a_list);
        a_list[n] = ch;
        a_list[n + 1] = '\0';
    }
    w_base(s, x_check(a_list));

    // A plain loop (not do/while) so that an empty child list moves nothing;
    // the original executed the body once on the terminator and then read
    // past the end of the list.
    for (; *list != '\0'; ++list) {
        const unsigned char code = static_cast<unsigned char>(*list);
        const int old_node = old_base + code;
        const int new_node = base(s) + code;
        cout << "old_node=" << old_node << ",new_node=" << new_node << endl;

        const int child_base = base(old_node);
        w_base(new_node, child_base);
        w_check(new_node, s);

        if (child_base > 0) {
            // Children of old_node can only sit at child_base + c for
            // c in [MIN_CODE, MAX_CODE]; the original condition used '||'
            // and therefore scanned the whole array for every moved child.
            for (int k = child_base + MIN_CODE;
                 k <= child_base + MAX_CODE && k <= BC_POS; ++k) {
                if (check(k) == old_node) {
                    w_check(k, new_node);
                }
            }
        }
        if (current != s && old_node == current) {
            current = new_node;
        }
        w_base(old_node, 0);
        w_check(old_node, 0);
    }
    return current;
}

// Splits leaf s, whose stored tail is `a`, so that the new suffix `b` can be
// stored as well: the common prefix of a and b becomes a chain of nodes, and
// the two differing remainders become two leaves.
void tail_insert(int s, char *a, char *b) {
    char list[3];
    const int old_tail_pos = (-1) * base(s);
    cout << "tail_insert:" << "s=" << s << "a=" << a << " b=" << b << endl;

    int length = 0;
    // Stop at the terminator so identical strings cannot run off the buffers.
    while (a[length] != '\0' && a[length] == b[length]) {
        length++;
    }
    if (a[length] == b[length]) {
        return;  // identical suffixes: nothing to split
    }

    for (int i = 0; i < length; i++) {
        const unsigned char ch = static_cast<unsigned char>(a[i]);
        list[0] = static_cast<char>(ch);
        list[1] = '\0';
        w_base(s, x_check(list));
        const int t = base(s) + ch;
        w_check(t, s);
        s = t;
    }

    list[0] = a[length];
    list[1] = b[length];
    list[2] = '\0';
    w_base(s, x_check(list));
    separate(s, a + length, old_tail_pos);  // old key reuses its tail slot
    separate(s, b + length, TAIL_POS);      // new key goes to the end of TAIL
}

// Looks up p_word, which must already end with '#'.
// Returns the index of the node where the match ended, or -1 if not found.
int search_word(char *p_word) {
    int h = -1;
    int s = 1;
    int t;
    cout << "begin-search word: " << p_word << endl;
    do {
        ++h;
        const unsigned char ch = static_cast<unsigned char>(p_word[h]);
        t = base(s) + ch;
        if (check(t) != s) {
            cout << "end-search word:" << p_word << endl;
            return -1;
        }
        if (base(t) < 0) {
            break;  // reached a leaf; the rest of the key is in TAIL
        }
        s = t;
    } while (*(p_word + h));

    if (p_word[h] != '#') {
        read_tail((-1) * base(t));
    }
    if (p_word[h] == '#' || strcmp(TEMP, p_word + h + 1) == 0) {
        cout << "end-search word: " << p_word << endl;
        return t;
    }
    cout << "end-search word: " << p_word << endl;
    return -1;
}

// Removes p_word (which must already end with '#') by clearing the node
// search_word() returns. Returns 1 if the word was found and removed,
// 0 otherwise.
int delete_word(char *p_word) {
    const int t = search_word(p_word);
    // Comparison, not assignment: the original `t = -1` was always true,
    // so nothing was ever deleted.
    if (t == -1) {
        return 0;
    }
    w_base(t, 0);
    w_check(t, 0);
    return 1;
}

// Inserts p_word into the trie. A '#' terminator is appended to p_word in
// place, so the buffer must have room for one extra character.
// Always returns 1.
int insert_word(char *p_word) {
    int h = -1;
    int s = 1;
    int t;
    cout << "begin-insert word :" << p_word << endl;
    strcat(p_word, "#");
    do {
        ++h;
        const unsigned char ch = static_cast<unsigned char>(*(p_word + h));
        t = base(s) + ch;
        if (check(t) != s) {
            cout << "s=" << s << ",t=" << t << ",check(t)=" << check(t) << endl;
            bc_insert(s, p_word + h);
            cout << "end-insert word:" << p_word << endl;
            return 1;
        }
        if (base(t) < 0) {
            break;  // reached a leaf; compare against its stored tail
        }
        s = t;
    } while (*(p_word + h));

    if (p_word[h] != '#') {
        read_tail((-1) * base(t));
    }
    if (p_word[h] == '#' || strcmp(TEMP, p_word + h + 1) == 0) {
        return 1;  // key already present
    }
    if (base(t) != 0) {
        tail_insert(t, TEMP, p_word + h + 1);
        cout << "end-insert word :" << p_word << endl;
    }
    return 1;
}

// Allocates BC, TAIL and TEMP and sets up the root node (base(1) = 1).
// Terminates the program if BC cannot be allocated.
void initialize() {
    BC_MAX = BC_INC;
    BC_POS = 1;
    TAIL_POS = 1;
    BC = static_cast<int *>(calloc(2 * BC_MAX, sizeof(int)));
    if (BC == NULL) {
        cout << "BC malloc error!" << endl;
        exit(-1);  // every later call would dereference NULL
    }
    w_base(1, 1);
    BC_POS = 1;
    TAIL = mem_str("TAIL", &TAIL_MAX, TAIL_INC);
    TAIL[0] = '#';
    TEMP = mem_str("TEMP", &TEMP_MAX, TEMP_INC);
}

// Loads every whitespace-separated word of key_words.txt into the trie and
// then looks up "Beijing".
int main() {
    char word[30] = {'\0'};
    initialize();
    FILE *key_file = fopen("key_words.txt", "r");
    if (key_file == NULL) {
        cout << "open key file error!" << endl;
        return 1;
    }
    // Width 28 leaves room in word[30] for the '#' that insert_word appends
    // plus the terminating NUL.
    while (fscanf(key_file, "%28s", word) == 1) {
        insert_word(word);
        cout << endl;
    }
    fclose(key_file);

    strcpy(word, "Beijing#");
    if (search_word(word) > 0) {
        printf("find word!\n");
    } else {
        printf("not find word!\n");
    }

    free(BC);
    free(TAIL);
    free(TEMP);
    return 0;
}