trie树-《算法导论》学习笔记十四
来源:互联网 发布:中国导演 知乎 编辑:程序博客网 时间:2024/05/22 03:03
引用一下百度百科的话吧:
Trie树,又称单词查找树,是一种树形结构,是一种哈希树的变种。典型应用是用于统计,排序和保存大量的字符串(但不仅限于字符串),所以经常被搜索引擎系统用于文本词频统计。它的优点是:利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,查询效率比哈希树高。
这里构建了一棵字典树,每个结点有52个孩子指针,对应26个小写字母和26个大写字母,根节点不存储数据,一个单词从第一个字母开始经由根结点走对应分支进行插入和统计。
trie树结点卫星数据包含了字母、出现次数、是否构成一个单词,孩子指针就是一个52大小的trie树结点指针数组。
实现了几个操作:
1. 插入单词
遍历每个字母,从根结点出发,如果结点对应字母的孩子结点为空,就创建结点,出现次数为1,如果存在这个结点,出现次数就+1,并且如果单词结束,结束处的结点是否构成一个单词字段标识为构成
2. 遍历树,并打印所有单词和每个单词出现次数
3. 统计树,按给定的数字统计出现次数前几的单词
树统计,与遍历类似,用尾递归,并传入一个大于单词最大长度的数组来存储每个分支的单词,如果遇到结点能构成一个单词,就判断你单词个数,并以插入排序的方式插入创建的统计链表(类似打扑克的插排序);
统计链表有更新操作,根据输入的统计前几的数字来维护这个链表该去掉哪些结点,该更新哪些结点的顺序等
获取单词来源为编写的一个简单单词随机生成代码,写入一个文件中,可指定单词最大长度,全大写/全小写/大小写均有,单词个数,单词范围(只支持a-*或A-*,例如5,就是生成a-e/A-E的单词)
贴代码:
随机生成单词
#include <stdio.h>#include <sys/types.h>#include <sys/stat.h>#include <fcntl.h>#include <time.h>#include <unistd.h>#include <errno.h>#include <string.h>#include <stdlib.h>int word_len = 0;int upper_low = 65;int lowwer_low = 97;// if letter_size = 5// it will generate a-e or A-E letter.int letter_size = 0;int random_letter(){ return rand() % letter_size;}int random_word( char *word, int opt ){ // minimum word'length is 3. int true_word_len = rand() % word_len + 3; int true_word_len1 = true_word_len; while ( true_word_len-- ) { char letter = 0; if ( opt == 0 ) { letter = random_letter() + lowwer_low; } else if ( opt == 1 ) { letter = random_letter() + upper_low; } else { int opt_case = rand() % 2; if ( opt_case == 0 ) letter = random_letter() + lowwer_low; else letter = random_letter() + upper_low; } word[true_word_len] = letter; } return true_word_len1;}void gen_word( int fd, int word_num, int opt ){ char word[20] = {0}; int true_word_len = 0; while ( word_num-- ) { memset( word, 0, 20); true_word_len = random_word( word, opt ); word[true_word_len] = '\n'; write( fd, word, true_word_len + 1 ); }}int main( int argc, char **argv ){ srand((int)time(NULL)); if ( argc != 5 ) { printf("please input " "word's length & " "words' number & " "word's range & " "gen_case(0:lowwer case,1:upper case,other:both\n"); exit( 0 ); } word_len = atoi( argv[1] ); int word_num = atoi( argv[2] ); letter_size = atoi( argv[3] ); int opt = atoi( argv[4] ); int fd = open("word.txt", O_RDWR | O_TRUNC, 0777); gen_word( fd, word_num, opt ); close( fd ); return 0;}
trie树
#include <stdio.h>#include <unistd.h>#include <stdlib.h>#include <string.h>#include <sys/types.h>#include <sys/stat.h>#include <fcntl.h>#include <errno.h>#define MAX_CHILD_NUM 52#define UPPER_LOW 65#define UPPER_UP 90#define LOWER_LOW 97#define LOWER_UP 122#define PRINT(format, arg...) \do { \ printf("[%s/%d]:", __func__, __LINE__); \ printf(format, ##arg); \ printf("\n"); \}while(0) typedef struct trieTreeNode { char letter; int count; int is_word; struct trieTreeNode *next[MAX_CHILD_NUM];} trieTreeNode;typedef struct trieTree { trieTreeNode *root;} trieTree;typedef struct count_data { int order; int count; char string[20]; struct count_data *next;} count_data;int trans_letter_2_index( char letter ){ int index = -1; if ( letter >= LOWER_LOW && letter <= LOWER_UP ) { index = letter - LOWER_LOW + 26; } else if ( letter >= UPPER_LOW && letter <= UPPER_UP ) { index = letter - UPPER_LOW; } else { PRINT("error letter input:%c", letter); exit( 0 ); } return index;}trieTreeNode *create_node( char letter ){ trieTreeNode *node = ( trieTreeNode * )calloc( 1, sizeof(trieTreeNode) ); node->letter = letter; node->count = 0; node->is_word = 0;}void insert( trieTreeNode *root, char *word ){ if ( root == NULL ) { PRINT("root node is null."); return; } int i = 0; trieTreeNode *cur = root; for ( i; word[i] != '\0'; i++ ) { int next_index = trans_letter_2_index(word[i]); //PRINT("letter:%c, index:%d", word[i], next_index); if ( cur->next[next_index] == NULL ) { cur->next[next_index] = create_node( word[i] ); } else { //cur->next[next_index]->count += 1; } if ( word[i+1] == '\0' ) { cur->next[next_index]->count += 1; cur->next[next_index]->is_word = 1; } cur = cur->next[next_index]; }}// 删除链表所有结点void delete_list_all_node( count_data *node ){ count_data *p = NULL; while ( node ) { p = node; node = node->next; free( p ); }}void print_list_all_node( count_data *node ){ printf("\n"); node = node->next; while ( node ) { printf("[%d],count:%d\tword:%s\n", node->order, node->count, node->string); node = node->next; } printf("\n");}void update_insert_node( count_data *insert_node ){ if ( !insert_node->next ) return; count_data *print_p = insert_node; if ( insert_node->order == 1 ) { delete_list_all_node( insert_node->next ); insert_node->next = NULL; } else if ( insert_node->order < 1 ) { PRINT("ERROR!!!!!"); exit( 0 ); } else { count_data *p = insert_node; insert_node = insert_node->next; while ( insert_node ) { if ( insert_node->count < p->count ) { insert_node->order = p->order - 1; } else if ( insert_node->count > p->count ) { PRINT("ERROR!!!cur->count:%d, pre->count:%d", insert_node->count, p->count); exit( 0 ); } else { insert_node->order = p->order; } if ( insert_node->order < 1 ) { delete_list_all_node( insert_node ); p->next = NULL; break; } p = insert_node; insert_node = insert_node->next; } }}void list_insert( char *tmp_word, int cur_count, int tail, count_data *head, int top_num ){ tmp_word[tail] = '\0'; count_data *new_data = ( count_data * )malloc( sizeof(count_data) ); new_data->count = cur_count; memcpy( new_data->string, tmp_word, tail + 1 ); new_data->next = NULL; //PRINT("count:%d\ttmp_word:%s, string:%s", cur_count, tmp_word, new_data->string); if ( head->next == NULL ) { head->next = new_data; new_data->order = top_num; } else if ( cur_count > head->next->count ) { new_data->order = head->next->order; new_data->next = head->next; head->next = new_data; update_insert_node( new_data ); } else { while ( 1 ) { head = head->next; if ( head->next == NULL ) { if ( head->order > 1 ) { head->next = new_data; if ( head->count == new_data->count ) new_data->order = head->order; else new_data->order = head->order - 1; head->next = new_data; } else if ( head->count > new_data->count ) { // 不插入 free( new_data ); } else if ( head->count == new_data->count ) { head->next = new_data; new_data->order = head->order; } else if ( head->count < new_data->count ) { // 此种情况只有求出现次数最多的前1个单词时有 head->count = new_data->count; free( new_data ); } break; } else if ( head->count >= cur_count && head->next->count < cur_count ) { new_data->next = head->next; head->next = new_data; new_data->order = head->order; update_insert_node( new_data ); break; } } }}void find_top_count1( trieTreeNode *root, char *tmp_word, int tail, count_data *head, int top_num ){ if ( !root ) return; tmp_word[tail] = root->letter; tail++; if ( root->is_word ) { /* printf("\n--------------before delete------------------\n"); print_list_all_node( head ); printf("\n--------------------------------------------\n"); */ list_insert( tmp_word, root->count, tail, head, top_num ); /* printf("\n--------------------after delete----------------------------\n"); print_list_all_node( head ); printf("\n-----------------------------------------------------------\n"); */ } int i = 0; for ( i; i < MAX_CHILD_NUM; i++ ) { find_top_count1( root->next[i], tmp_word, tail, head, top_num ); }}void find_top_count( trieTreeNode *root, int top_num ){ if ( !root ) return; int i = 0; count_data *head = ( count_data * )malloc( sizeof(count_data) ); for ( i; i < MAX_CHILD_NUM; i++ ) { char tmp_word[20] = {0}; find_top_count1( root->next[i], tmp_word, 0, head, top_num ); } printf("出现次数最大前%d次的单词:\n", top_num); count_data *p = head->next; count_data *free_p = NULL; while ( p != NULL ) { free_p = p; printf("前%d,count:%d\t%s\n", p->order, p->count, p->string); p = p->next; free( free_p ); } free( head );}void tree_walk1( trieTreeNode *root, char *tmp_word, int tail ){ if ( !root ) return; tmp_word[tail] = root->letter; tail++; //printf("%c\n", root->letter); if ( root->is_word ) { int j = 0; printf("count:%d\t", root->count); for ( j; j < tail; j++ ) { printf("%c", tmp_word[j]); } printf("\n"); } int i = 0; for ( i; i < MAX_CHILD_NUM; i++ ) { tree_walk1( root->next[i], tmp_word, tail ); }}void tree_walk( trieTreeNode *root ){ if ( !root ) return; int i = 0; for ( i; i < MAX_CHILD_NUM; i++ ) { char tmp_word[20] = {0}; tree_walk1( root->next[i], tmp_word, 0 ); }}int main( int argc, char **argv ){ if ( argc != 3 ) { PRINT("USAGE: please input words file & top number"); exit( 0 ); } char *file_name = argv[1]; int top_num = atoi( argv[2] ); trieTree *tree = ( trieTree * )malloc( sizeof(trieTree) ); tree->root = create_node( -1 ); int fd = open(file_name, O_RDONLY); if ( fd < 0 ) { PRINT("OPEN FILE %s ERROR!!!(%s)", file_name, (char *)strerror(errno)); exit( 0 ); } // 每次读取文件的缓冲区 char buf[1024 * 10] = {0}; // 每次读取的大小 int read_len = 1024; // 读取的返回值 int read_bytes = 0; // 从读取的缓冲区每次提取'\n' - '\n'之间的单词 char tmp_word[20] = {0}; // 读取文件缓冲区如果出现单词隔断,剩余部分在下一次 // read才能读到,这个index做单词继续拼接 int tmp_index = 0; while ( 1 ) { memset( buf, 0, read_len ); read_bytes = read( fd, buf, read_len ); if ( read_bytes <= 0 ) break; //printf("readbytes:%d------\n%s\n", read_bytes, buf); int cur = 0; while ( cur < read_bytes ) { // 单词文件最后一个单词末尾一定要有'\n' if ( buf[cur] == '\n' ) { tmp_word[tmp_index] = '\0'; //printf("insert word:%s\n", tmp_word); insert( tree->root, tmp_word ); memset( tmp_word, 0, 20 ); tmp_index = 0; } else { tmp_word[tmp_index] = buf[cur]; tmp_index++; } cur++; } } printf("\n========================================\n"); tree_walk( tree->root ); find_top_count( tree->root, top_num ); close( fd ); return 0;}
trie树的代码使用:./xxx word.txt 10即统计出现次数前10的单词,并打印单词和次数
例如对生成了10000个单词的word.txt文件,统计前5:
./xxx word.txt 5
- trie树-《算法导论》学习笔记十四
- 算法导论—Trie树
- 算法导论学习笔记(十四):贪心算法(一):活动安排问题
- |算法讨论|Trie树 学习笔记
- 《算法导论》学习笔记
- 《算法导论》学习笔记
- 算法导论学习笔记
- 《算法导论》学习笔记
- avl树-《算法导论》学习笔记十三
- Trie树学习笔记
- 算法导论学习笔记-1
- 算法导论学习笔记-2
- 算法导论分而治之学习笔记
- 《算法导论》学习笔记 前言
- 算法导论--学习笔记01
- 算法导论--学习笔记02
- 算法导论--学习笔记03
- 算法导论----学习笔记04
- C++适配器模式
- 文章标题 一些函数的编写
- [LeetCode OJ]Continuous Subarray Sum
- SVN中的Branches分支以及Tags标签详解与应用举例
- 套接字必须知道的图
- trie树-《算法导论》学习笔记十四
- 剑指offer-树的子结构
- js数组Array对象常用方法总结
- 套接字之客户端和服务端的一次通信
- 三十六、SpringBoot配置属性之NOSQL
- Floodlight出现错误:Error! Check the fields specified for the flow.Make sure IPv4 fields are not mixed wi
- 参考博客进行SSM+websocket整合
- Java平台的脚本
- LeetCode(8)