利用二叉搜索树来实现输入文本的单词统计

来源：互联网发布：linux mysql 安装编辑：程序博客网时间：2024/06/08 16:19

这里有个题目，输入一个文本（纯英文），我们要能够得到文本中每个单词出现的个数，并且按照字典顺序输出。

解决这个题目可以利用二叉搜索树，这是一个比较好理解的方法：从根节点（root）开始，把文本中的单词逐个插入树中。当要插入的

单词比当前节点的单词大，就连接到此节点的右边；若小，则连接到左边；若相同，则只把该节点的计数加一。这样就可以轻易地将它们有序地连接起来了。

首先,我们需要一个结构体来提供储存单词和单词出现的次数，定义如下：

/* One node of the word-count binary search tree. */
struct tnode {
    char *word;            /* the word stored at this node */
    int count;             /* number of times the word has been seen */
    struct tnode *left;    /* left child */
    struct tnode *right;   /* right child */
};

然后我们需要一个插入函数，这是一个递归版本的插入函数，talloc只是malloc的替代，是一个自定义函数，后面会给出

其实现代码。

/*
 * addTree: record one occurrence of the word w in the tree rooted at p.
 *
 * If p is NULL a new node holding a private copy of w (made with strdup)
 * is allocated with talloc and returned.  Otherwise w is compared with the
 * word at p using strcmp: equal words increment the node's count, smaller
 * words descend recursively into the left subtree and larger ones into the
 * right subtree.
 *
 * Returns the root of the (possibly updated) subtree, so the caller should
 * store the result back into the link it passed in.  If memory for a new
 * node or its word cannot be obtained, a message is written to stderr, the
 * word is not recorded and NULL is returned for that empty subtree; nodes
 * that already exist are left untouched.
 */
struct tnode *addTree(struct tnode *p, char *w)
{
    if (p == NULL) {
        struct tnode *node = talloc();

        if (node == NULL) {
            fprintf(stderr, "addTree: out of memory\n");
            return NULL;
        }
        node->word = strdup(w);
        if (node->word == NULL) {
            /* Do not leave a node with no word in the tree. */
            free(node);
            fprintf(stderr, "addTree: out of memory\n");
            return NULL;
        }
        node->count = 1;
        node->left = NULL;
        node->right = NULL;
        return node;
    }

    int cond = strcmp(w, p->word);

    if (cond == 0) {
        p->count++;
    } else if (cond < 0) {
        p->left = addTree(p->left, w);
    } else {
        p->right = addTree(p->right, w);
    }
    return p;
}

在插入函数中我们有一个strdup函数，它用于将字符串复制给一个动态字符串并返回此动态字符串，

由于strlen只能算出字符串除了'\0'之外的字符数，所以申请动态内存时应该用strlen(s)+1。下面给出

此函数的实现：

/*
 * strdup: return a newly malloc'ed copy of the string s.
 *
 * strlen does not count the terminating '\0', so one extra byte is
 * requested for it.  The parameter is const-qualified because the source
 * string is only read; this also matches the declaration of strdup that
 * many <string.h> implementations provide.
 *
 * Returns NULL if the memory cannot be allocated.  The caller owns the
 * returned buffer and releases it with free().
 */
char *strdup(const char *s)
{
    size_t size = strlen(s) + 1;    /* +1 for the terminating '\0' */
    char *copy = malloc(size);

    if (copy != NULL) {
        memcpy(copy, s, size);
    }
    return copy;
}

然后我们需要一个打印这个二叉树的函数，同样采用递归;

/*
 * treePrint: print the tree rooted at p in order.
 *
 * For every node the left subtree is printed first, then one line with the
 * node's count (width 4) and its word, then the right subtree.  A NULL
 * tree prints nothing.
 */
void treePrint(struct tnode *p)
{
    /* Recurse to the left; walk the right links iteratively. */
    while (p != NULL) {
        treePrint(p->left);
        printf("%4d %s\n", p->count, p->word);
        p = p->right;
    }
}

大家应该注意到了，我们还有getword、getch、ungetch函数未实现，下面将对其一一解释：

第一个：getch函数和ungetch函数

顾名思义，这两个的作用一个是获取字符，一个是压回字符，其实这是堆栈的结构，我们需要

一个足够大的数组来实现静态数组的堆栈：

#define BUFSIZE 100   /* capacity of the push-back buffer */

/*
 * Push-back buffer.  The elements are int rather than char so that any
 * value handed to ungetch -- including EOF and character codes above 127 --
 * is stored and later returned unchanged, whatever the signedness of char.
 */
int buf[BUFSIZE];
int bufp = 0;         /* next free position in buf */

下面给出这两个函数的实现：

/*
 * getch: return the next input character.
 *
 * Characters previously handed to ungetch are returned first, most recent
 * first; when none are pending the character comes from getchar().
 */
int getch(void)
{
    if (bufp > 0) {
        return buf[--bufp];
    }
    return getchar();
}

/*
 * ungetch: push the character c back so that a later getch returns it.
 *
 * When the buffer already holds BUFSIZE characters, c is discarded and a
 * message is printed instead.
 */
void ungetch(int c)
{
    if (bufp < BUFSIZE) {
        buf[bufp++] = c;
        return;
    }
    printf("ungetch: too many characters\n");
}

最后，我们来看一看getword函数。这个函数先跳过空白字符，再通过getch读取下一个单词并存入word；单词必须以字母开头，后面可以跟字母或数字。

如果第一个字符是字母，函数返回该单词的第一个字符；如果不是字母，则返回这个字符本身；遇到文件结尾时返回EOF。代码如下：

int getch(void);
void ungetch(int c);

/*
 * getword: read the next word from the input into word.
 *
 * Leading white space is skipped.  If the first remaining character is a
 * letter, it and the letters/digits that follow are stored in word; the
 * character that ends the word is pushed back with ungetch.  If the first
 * character is not a letter, it alone is stored.
 *
 * word must have room for lim chars.  At most lim - 1 characters are
 * stored, followed by a terminating '\0', so the buffer is never overrun;
 * the part of an over-long word that does not fit stays in the input and
 * is returned by the next call.
 *
 * Returns the first character read after the white space, which is EOF at
 * end of input.
 */
int getword(char *word, int lim)
{
    char *w = word;
    int room = lim - 1;     /* characters that still fit before the '\0' */
    int first;
    int c;

    /* Skip leading white space. */
    do {
        c = getch();
    } while (c != EOF && isspace((unsigned char)c));
    first = c;

    if (c != EOF && room > 0) {
        *w++ = (char)c;
        room--;
    }
    if (c == EOF || !isalpha((unsigned char)c)) {
        if (lim > 0) {
            *w = '\0';
        }
        return c;
    }

    /*
     * Each character is tested while still an int: the <ctype.h> functions
     * are only defined for EOF and values representable as unsigned char.
     */
    while (room > 0) {
        c = getch();
        if (c == EOF || !isalnum((unsigned char)c)) {
            ungetch(c);     /* this character belongs to the next token */
            break;
        }
        *w++ = (char)c;
        room--;
    }
    if (lim > 0) {
        *w = '\0';
    }
    return first;
}

这样我们就可以轻易地实现文本中的单词统计了。

附录（完整代码）：

/*
 * Word-frequency counter.
 *
 * Reads text from standard input, counts how often each word occurs using
 * a binary search tree ordered by strcmp, and prints every word with its
 * count in ascending order.
 */
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>

#define BUFSIZE 100   /* capacity of the push-back buffer */
#define MAXWORD 100   /* size of the buffer that receives one word */

/*
 * Push-back buffer used by getch/ungetch.  The elements are int so that
 * EOF and character codes above 127 are stored and returned unchanged.
 */
int buf[BUFSIZE];
int bufp = 0;         /* next free position in buf */

/* One node of the word-count binary search tree. */
struct tnode {
    char *word;            /* the word stored at this node */
    int count;             /* number of times the word has been seen */
    struct tnode *left;    /* left child */
    struct tnode *right;   /* right child */
};

int getch(void);
void ungetch(int c);
int getword(char *word, int lim);
struct tnode *addTree(struct tnode *p, char *w);
struct tnode *talloc(void);
char *strdup(const char *s);
void treePrint(struct tnode *p);

/* Count the words on standard input and print them in sorted order. */
int main(void)
{
    struct tnode *root = NULL;
    char word[MAXWORD];

    while (getword(word, MAXWORD) != EOF) {
        /* Tokens that do not start with a letter are not words. */
        if (isalpha((unsigned char)word[0])) {
            root = addTree(root, word);
        }
    }
    treePrint(root);
#ifdef _WIN32
    /* Keep the console window open; "pause" exists only on Windows. */
    system("pause");
#endif
    return 0;
}

/*
 * getword: read the next word from the input into word.
 *
 * Leading white space is skipped.  If the first remaining character is a
 * letter, it and the letters/digits that follow are stored in word; the
 * character that ends the word is pushed back with ungetch.  If the first
 * character is not a letter, it alone is stored.
 *
 * At most lim - 1 characters are stored, followed by a terminating '\0',
 * so a buffer of lim chars is never overrun; the part of an over-long word
 * that does not fit stays in the input for the next call.
 *
 * Returns the first character read after the white space, which is EOF at
 * end of input.
 */
int getword(char *word, int lim)
{
    char *w = word;
    int room = lim - 1;     /* characters that still fit before the '\0' */
    int first;
    int c;

    /* Skip leading white space. */
    do {
        c = getch();
    } while (c != EOF && isspace((unsigned char)c));
    first = c;

    if (c != EOF && room > 0) {
        *w++ = (char)c;
        room--;
    }
    if (c == EOF || !isalpha((unsigned char)c)) {
        if (lim > 0) {
            *w = '\0';
        }
        return c;
    }

    /* Test each character as an int before narrowing it to char. */
    while (room > 0) {
        c = getch();
        if (c == EOF || !isalnum((unsigned char)c)) {
            ungetch(c);     /* this character belongs to the next token */
            break;
        }
        *w++ = (char)c;
        room--;
    }
    if (lim > 0) {
        *w = '\0';
    }
    return first;
}

/*
 * addTree: record one occurrence of w in the tree rooted at p and return
 * the root of the updated subtree.
 *
 * A NULL p yields a freshly allocated node holding a copy of w.  If that
 * allocation fails, a message is written to stderr, the word is not
 * recorded and NULL is returned; existing nodes are left untouched.
 */
struct tnode *addTree(struct tnode *p, char *w)
{
    if (p == NULL) {
        struct tnode *node = talloc();

        if (node == NULL) {
            fprintf(stderr, "addTree: out of memory\n");
            return NULL;
        }
        node->word = strdup(w);
        if (node->word == NULL) {
            /* Do not leave a node with no word in the tree. */
            free(node);
            fprintf(stderr, "addTree: out of memory\n");
            return NULL;
        }
        node->count = 1;
        node->left = NULL;
        node->right = NULL;
        return node;
    }

    int cond = strcmp(w, p->word);

    if (cond == 0) {
        p->count++;
    } else if (cond < 0) {
        p->left = addTree(p->left, w);
    } else {
        p->right = addTree(p->right, w);
    }
    return p;
}

/* talloc: allocate storage for one tree node; returns NULL on failure. */
struct tnode *talloc(void)
{
    return malloc(sizeof(struct tnode));
}

/*
 * strdup: return a newly malloc'ed copy of s, or NULL if the memory cannot
 * be allocated.  The const-qualified parameter matches the declaration of
 * strdup that many <string.h> implementations provide.
 */
char *strdup(const char *s)
{
    size_t size = strlen(s) + 1;    /* +1 for the terminating '\0' */
    char *copy = malloc(size);

    if (copy != NULL) {
        memcpy(copy, s, size);
    }
    return copy;
}

/*
 * treePrint: print the tree rooted at p in order -- left subtree, then the
 * node's count and word on one line, then the right subtree.
 */
void treePrint(struct tnode *p)
{
    if (p != NULL) {
        treePrint(p->left);
        printf("%4d %s\n", p->count, p->word);
        treePrint(p->right);
    }
}

/*
 * getch: return the next input character, taking pushed-back characters
 * (most recent first) before reading from getchar().
 */
int getch(void)
{
    return (bufp > 0) ? buf[--bufp] : getchar();
}

/*
 * ungetch: push c back so that a later getch returns it.  When the buffer
 * is full, c is discarded and a message is printed instead.
 */
void ungetch(int c)
{
    if (bufp >= BUFSIZE) {
        printf("ungetch: too many characters\n");
    } else {
        buf[bufp++] = c;
    }
}