第十五章字符串（一）

来源：互联网发布：微信数据 sd卡编辑：程序博客网时间：2024/05/21 08:37

第十五章是《编程珠玑》的最后一章，本章由易到难围绕字符串处理主要分三个部分：1，单词。2，短语。3，文本。在这三个问题中涉及到的技术有C++中的标准模板库，哈希表，新介绍的一种数据结构“后缀数组”。先从第一个部分“单词”入手。

“我们的第一个问题是为文档中包含的单词生成一个列表”。这个问题用C++标准模板库中的set和string可以很容易解决。

[cpp] view plaincopy
// Sorted list of words(between white space) in file   
  
#include <iostream>  
#include <string>  
#include <set>  
  
using namespace std;  
  
/*
 * Reads whitespace-separated words from standard input and prints every
 * distinct word exactly once, one per line, in the order in which the
 * std::set iterates (ascending string order).
 *
 * Returns 0.
 */
int main()
{
    std::string word;
    std::set<std::string> words;

    // `cin >> word` evaluates to the stream itself, which tests false once
    // extraction fails (e.g. at end of input).  An earlier draft compared
    // the expression against EOF -- `while((cin>>str) != EOF)` -- which is
    // the C stdio convention and not how stream extraction reports the end.
    while(std::cin>>word)
        words.insert(word);     // duplicates are ignored by the set

    // '\n' instead of endl: endl would flush the stream after every single
    // word, which is needless work when thousands of lines are printed.
    for(std::set<std::string>::const_iterator it=words.begin(); it!=words.end(); ++it)
        std::cout<<*it<<'\n';

    return 0;
}

这里需要注意的是C++标准输入流cin的用法（cin是一个istream对象，而不是函数）：`cin>>str`会自动跳过空白字符，每次读入一个以空白分隔的单词；当输入文件结束符后，提取失败，流在条件判断中为假，循环随之退出。在我的编译环境下文件结束符为Ctrl+Z加回车（类Unix系统下通常为Ctrl+D）。

“接下来的问题是对文档中每个单词的出现次数进行统计”。这个问题利用map容器即可方便地解决。

[cpp] view plaincopy
// Sorted list of words and counts in file   
  
#include <iostream>  
#include <string>  
#include <map>  
  
using namespace std;  
  
/*
 * Reads whitespace-separated words from standard input, counts how many
 * times each distinct word occurs, and prints "word<TAB>count" lines in the
 * order in which the std::map iterates (ascending key order).
 *
 * Returns 0.
 */
int main()
{
    std::string word;
    std::map<std::string, int> counts;

    // Earlier attempts, kept as a record of how the map must NOT be used:
    //   while((cin>>str) != EOF)            -- stream extraction is tested
    //                                          directly, not compared to EOF.
    //   M.insert(str, ++count);             -- insert() takes one key/value
    //                                          pair, not a key and a value.
    //   M.insert(make_pair(str, ++count));  -- insert() leaves an existing key
    //                                          untouched, and `count` was one
    //                                          running total shared by every
    //                                          word rather than a per-word tally.
    //   for(i=S.begin(); ...)               -- iterated the set `S` of the
    //       cout<<(*i)->first ...              previous program; also *i is a
    //                                          pair, not a pointer, so the
    //                                          members are i->first / i->second.

    // operator[] creates a missing entry with a value-initialised (zero)
    // count, so a single increment covers both new and repeated words.
    while(std::cin>>word)
        ++counts[word];

    // '\n' instead of endl: endl would flush the stream after every line.
    for(std::map<std::string, int>::const_iterator it=counts.begin(); it!=counts.end(); ++it)
        std::cout<<it->first<<"\t"<<it->second<<'\n';

    return 0;
}

这里面主要需要注意的问题就是map的用法。

程序达人们为了追求效率、减少处理时间，对这个程序进行了改进：定制散列表，通过哈希算法将字符串分布到散列表中，这样就很有效地减少了程序运行过程中“插入”和“输出”的时间。据书中记载，用这个算法处理包含29131个不同单词的詹姆斯一世钦定版《圣经》一共只需要3.0秒（map版本为7.6秒），其中处理时间（主要是插入和输出）是0.56秒（以前是5.2秒），所以用30行代码定制的散列表比C++标准模板库中的映射快一个数量级。

标准模板库中的set和map大部分实现都使用到了“平衡搜索树”这个结构，其将字符串看作是不可分割的对象进行操作。平衡搜索树中的元素始终处于有序状态，从而很容易执行寻找前驱结点或者按顺序输出元素之类的操作。散列表的平均速度很快，但缺乏平衡树提供的最坏情况性能保证，也不能支持其他涉及顺序的操作。

在散列表程序中，初始条件如下：《圣经》中有29131个不同的单词，因此用与29131最接近的质数作为散列表的大小，并将乘数定义为31。

[cpp] view plaincopy
//  Sorted list of words with counts (using hash method)  
  
#include <stdio.h>  
#include <string.h>  
#include <stdlib.h>  
  
#define NHASH 29989     // number of buckets in bin[]; hash() reduces its result modulo this value
#define MULT  31        // multiplier applied to the running hash value for each character
  
/*
 * One entry of a hash-table chain.
 *   word  - the NUL-terminated word this entry stands for
 *   count - number of times the word has been seen
 *   next  - following entry in the same chain, or NULL at the end
 */
struct node
{
    char *word;
    int count;
    struct node *next;
};

typedef struct node node;       /* lets the struct be named without the tag keyword */
typedef struct node *nodeptr;   /* pointer-to-entry shorthand used throughout */
  
// Hash table: bin[i] is the head of the linked list of nodes whose word
// hashes to i (NULL while the bucket is empty).
nodeptr bin[NHASH];  
  
#define NODEGROUP 1000  // 一次分配1000个节点块   
int nodesleft = 0;  
nodeptr freenode;  
  
nodeptr nmalloc()  
{  
        if(nodesleft == 0)  
        {  
                     freenode=(nodeptr)malloc(NODEGROUP*sizeof(node));  
                     nodesleft=NODEGROUP;  
        }  
        nodesleft--;  
          
        return freenode++;  
}  
  
#define CHARGROUP 10000 // extra characters requested on top of `size` at each refill
int charsleft = 0;      // unused characters remaining in the current chunk
char *freechar;         // next unused character of the current chunk

/*
 * Hands out `size` consecutive uninitialised characters.
 *
 * Storage is carved out of large chunks so that malloc() is called once per
 * chunk instead of once per string.  When the current chunk has fewer than
 * `size` characters left, a new chunk of size+CHARGROUP characters is
 * allocated (so the request always fits) and the remainder of the old chunk
 * is abandoned.  Storage is never returned to the pool.
 *
 * size: number of characters wanted, including room for any terminator.
 *
 * Returns a pointer to the first of the `size` characters.  If a new chunk
 * cannot be allocated, an error is printed to stderr and the program exits
 * with EXIT_FAILURE.
 */
char *smalloc(int size)
{
     char *result;

     // The original draft tested `charsleft == 0`, which would hand out
     // storage past the end of the chunk whenever 0 < charsleft < size.
     if(charsleft < size)
     {
                  // Refill through a temporary so that a failed malloc() cannot
                  // leave the pool claiming characters behind a NULL pointer.
                  char *chunk=(char *)malloc((size+CHARGROUP)*sizeof(char));
                  if(chunk == NULL)
                  {
                               fprintf(stderr, "smalloc: out of memory\n");
                               exit(EXIT_FAILURE);
                  }
                  freechar=chunk;
                  charsleft=size+CHARGROUP;
     }

     result=freechar;
     freechar+=size;
     charsleft-=size;

     return result;
}
  
unsigned int hash(char *p)  
{  
    unsigned int h=0;  
    for(;*p;++p)  
        h=h*MULT+*p;  
      
    return h%NHASH;  
}  
  
int inword(char *w)  
{  
    int x=hash(w);  
    nodeptr p;  
    for(p=bin[x];p!=NULL;p=p->next)  
        //if(strcmp(p->word, w))  
        if(strcmp(p->word, w)==0) //  
        {  
                           p->count++;  
                           return 0;  
        }  
      
    // THE wrong code  
    /* 
    p=(nodeptr)malloc(sizeof(node)); 
    p->word=(char *)malloc(sizeof(w));    // calculate the size of new input word 
    strcpy(p->word, w);                   // use strcpy() 
    p->count=0; 
    p->next=bin[0]->next; 
    bin[0]->next=p; 
    */  
      
    // THE inefficient code   
    /* 
    p=(nodeptr)malloc(sizeof(node)); 
    p->word=(char *)malloc(strlen(w)+1);  // calculate the size for strcpy 
    strcpy(p->word, w); 
    p->count=1; 
    //p->next=bin[x]->next; 
    p->next=bin[x]; 
    //bin[x]->next=p; 
    bin[x]=p; 
    */  
      
    // THE efficient code   
    p=nmalloc();  
    p->word=smalloc(strlen(w)+1);  
    p->count=1;  
    strcpy(p->word, w);  
    p->next=bin[x];  
    bin[x]=p;  
      
    return 0;  
}  
  
int main()  
{  
    int i;  
    //char *input;  
    char input[100];  
    nodeptr p;  
    for(i=0;i<NHASH;++i)  
        bin[i] = NULL;  
    //while(scanf("%s", &input) != EOF)  
    while(scanf("%s", input) != EOF)    
        inword(input);  
    for(i=0;i<NHASH;++i)  
    {  
                        for(p=bin[i];p!=NULL;p=p->next)  
                            printf("%s\t%d\n",p->word,p->count);  
    }  
      
    return 0;  
}  

这个程序中需要注意的问题是：1，首先对内存分配算法进行了优化，利用整块的分配取代每次单独的malloc()。2，还是需要注意scanf()函数的用法（这个问题到后面的单词级别生成随机文本时才被我仔细地发现）：用%s读取时，它自动跳过空白字符，并为每个单词（由空白分隔的字符串）添加空字符作为结束标志。3，哈希函数的用法，其中包括哈希表大小的确定和乘数的选择。哈希表的大小最好取质数，可以证明这样会使冲突尽量减少，而且使数据的分布更加均匀。但是这个质数是选择比数据量大的好还是小的好呢？这个问题我还没有深入研究。就本程序来说，其定义的NHASH为29989，要比不同的单词数29131大；但在后面的一个单词级别随机文本生成问题的哈希表解法中，其定义的NHASH要比实际的数量少。我想还是应该让哈希表的大小比实际的数据量小，但是要比实际的哈希值的数量大，这样冲突处理方法“链地址法”才有用武之地，既不浪费太多的空间，又减少了冲突。在本程序中真正的单词数量要远比29131大得多，这个数据只是不同单词的数量。总之哈希部分要再研究研究。

第十五章 字符串（一）

第十五章字符串（一）