【编程珠玑-15章】Strings of pearls

来源：互联网发布：淘宝销售怎么赚钱编辑：程序博客网时间：2024/06/05 04:02

15.1 Words

问题：为文档中包含的单词生成一个列表？

可行的办法是自己建立一个散列表，写一个简单的数据结构来记录单词和单词出现的次数。

1）散列函数的建立

书中给出的源代码提供了一个字符串散列函数。其实很像经典的BKDR算法。C语言描述如下：

/*
 * Hash a NUL-terminated string to a bucket index.
 *
 * Folds every character into an accumulator with multiplier MULT
 * (unsigned arithmetic, so overflow simply wraps) and reduces the
 * result modulo NHASH.
 */
unsigned int hash(char *p)
{
    unsigned int acc = 0;

    while (*p != '\0') {
        acc = MULT * acc + *p;
        p++;
    }
    return acc % NHASH;
}

我用java实现BKDRHash（参考：http://java.chinaitlab.com/advance/868618.html）：

/**
 * BKDR-style string hash.
 *
 * <p>The running hash is multiplied by a seed and the next character is
 * added, for every character of the input. The final value is masked with
 * {@code 0x7FFFFFFF} (i.e. {@code Integer.MAX_VALUE}), which clears the sign
 * bit and leaves all other bits unchanged, so the result is never negative.
 * Typical seeds are 31, 131, 1313, 13131, 131313, ...
 *
 * @param array the string to hash
 * @return a non-negative hash value
 */
public static int BKDRHash(String array) {
    final int seed = 31;
    int acc = 0;
    for (char ch : array.toCharArray()) {
        acc = acc * seed + ch;
    }
    return acc & 0x7FFFFFFF;
}

*******************************************************************

实现这里的hash

private static final int NHASH = 29989;
private static final int MULT = 31;

/**
 * Hashes a string into a bucket index in the range 0 .. NHASH - 1.
 *
 * @param array the string to hash
 * @return the bucket index
 */
public static int hash(String array) {
    final int len = array.length();
    int acc = 0;
    int idx = 0;
    while (idx < len) {
        acc = MULT * acc + array.charAt(idx);
        idx++;
    }
    // Clear the sign bit first so the remainder below cannot be negative.
    int nonNegative = acc & 0x7FFFFFFF;
    return nonNegative % NHASH;
}

2）完整代码

书中给出的C语言源代码如下：

/* Copyright (C) 1999 Lucent Technologies */
/* From 'Programming Pearls' by Jon Bentley */

/* wordfreq.c -- list of words in file, with counts */

/*
 * Note: compared with the listing in the book, this version limits the
 * scanf() read to the size of the word buffer and checks every malloc()
 * result. The algorithm itself is unchanged.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One entry of a hash chain: a word, its count, and the next entry. */
typedef struct node *nodeptr;
typedef struct node {
    char *word;
    int count;
    nodeptr next;
} node;

#define NHASH 29989
#define MULT 31
nodeptr bin[NHASH];             /* hash table: heads of the chains */

/* Hash a NUL-terminated string to a bucket index below NHASH. */
unsigned int hash(char *p)
{
    unsigned int h = 0;
    for ( ; *p; p++)
        h = MULT * h + *p;
    return h % NHASH;
}

/* malloc() that reports the failure and exits instead of returning NULL. */
static void *xmalloc(size_t n)
{
    void *mem = malloc(n);
    if (mem == NULL) {
        fprintf(stderr, "wordfreq: out of memory\n");
        exit(EXIT_FAILURE);
    }
    return mem;
}

#define NODEGROUP 1000
int nodesleft = 0;
nodeptr freenode;

/* Hand out one node; nodes are allocated NODEGROUP at a time and never freed. */
nodeptr nmalloc()
{
    if (nodesleft == 0) {
        freenode = xmalloc(NODEGROUP * sizeof(node));
        nodesleft = NODEGROUP;
    }
    nodesleft--;
    return freenode++;
}

#define CHARGROUP 10000
int charsleft = 0;
char *freechar;

/*
 * Hand out n bytes of string storage from a character pool. When fewer
 * than n bytes remain, a fresh pool of n + CHARGROUP bytes is allocated
 * and whatever was left of the previous pool is abandoned.
 */
char *smalloc(int n)
{
    if (charsleft < n) {
        freechar = xmalloc(n + CHARGROUP);
        charsleft = n + CHARGROUP;
    }
    charsleft -= n;
    freechar += n;
    return freechar - n;
}

/* Increment the count of word s, inserting it with count 1 if it is new. */
void incword(char *s)
{
    nodeptr p;
    int h = hash(s);

    for (p = bin[h]; p != NULL; p = p->next)
        if (strcmp(s, p->word) == 0) {
            (p->count)++;
            return;
        }
    /* Not found: push a new node on the front of the chain. */
    p = nmalloc();
    p->count = 1;
    p->word = smalloc(strlen(s) + 1);
    strcpy(p->word, s);
    p->next = bin[h];
    bin[h] = p;
}

int main()
{
    int i;
    nodeptr p;
    char buf[100];

    for (i = 0; i < NHASH; i++)
        bin[i] = NULL;
    /*
     * The field width must stay at sizeof(buf) - 1 so an over-long word
     * cannot overflow buf; such a word is read in pieces of 99 characters.
     */
    while (scanf("%99s", buf) == 1)
        incword(buf);
    for (i = 0; i < NHASH; i++)
        for (p = bin[i]; p != NULL; p = p->next)
            printf("%s %d\n", p->word, p->count);
    return 0;
}

***************************************************************************

下面是我用Java实现的：

/**
 * Created: 2014-09-01 21:31:13. Project: Test.
 *
 * <p>Counts how often each string occurs, using a hand-written hash table:
 * a string hash function picks the bucket and collisions are resolved by
 * chaining nodes in a linked list.
 *
 * @author Cao Yanfeng
 * @since JDK 1.6.0_21
 */
public class WordsFrequency {
    private static final int NHASH = 29989;
    private static final int MULT = 31;

    /**
     * Bucket heads of the hash table. Allocated eagerly so that
     * {@link #incword(String)} can be called without running {@code main} first.
     */
    private static Node[] bin = new Node[NHASH];

    /**
     * Counts a fixed sample of words and prints every word with its count,
     * in bucket order.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Start from an empty table so earlier incword() calls do not leak into the demo.
        bin = new Node[NHASH];
        String[] array = { "CaoYanfeng", "agaef", "北京大学", "CaoYanfeng",
                "China", "agaef", "CaoYanfeng", "agaef", "China", "agaef",
                "agaef", "曹艳丰", "CaoYanfeng", "曹艳丰", "北京大学", "CaoYanfeng",
                "China" };
        for (String word : array) {
            incword(word);
        }
        for (Node head : bin) {
            for (Node node = head; node != null; node = node.getNext()) {
                System.out.println("单词：" + node.getWordString() + "——数量："
                        + node.getCount());
            }
        }
    }

    /**
     * Increments the count of {@code string}, inserting it with count 1 when
     * it is not in the table yet. Collisions are resolved by chaining.
     *
     * @param string the word to count
     */
    public static void incword(String string) {
        int h = hash(string);
        for (Node node = bin[h]; node != null; node = node.getNext()) {
            if (string.equals(node.getWordString())) {
                node.setCount(node.getCount() + 1);
                return;
            }
        }
        // Not found: push a new node on the front of the bucket's chain.
        Node fresh = new Node();
        fresh.setWordString(string);
        fresh.setCount(1);
        fresh.setNext(bin[h]);
        bin[h] = fresh;
    }

    /**
     * Hashes a string into the range 0 .. NHASH - 1, in the manner of BKDRHash.
     *
     * @param string the string to hash
     * @return the bucket index
     */
    public static int hash(String string) {
        int hash = 0;
        for (int i = 0; i < string.length(); i++) {
            hash = MULT * hash + string.charAt(i);
        }
        // Mask off the sign bit so the remainder is never negative.
        return (hash & 0x7FFFFFFF) % NHASH;
    }

    /** A node of a bucket's chain: a word, its count, and the next node. */
    private static class Node {
        private String wordString;
        private int count;
        private Node next;

        public Node() {
            wordString = null;
            count = 0;
            next = null;
        }

        public String getWordString() {
            return wordString;
        }

        public void setWordString(String wordString) {
            this.wordString = wordString;
        }

        public int getCount() {
            return count;
        }

        public void setCount(int count) {
            this.count = count;
        }

        public Node getNext() {
            return next;
        }

        public void setNext(Node next) {
            this.next = next;
        }
    }
}

15.2 Phrases

问题：给定一个文本文件作为输入，查找其中最长的重复子字符串。

这里用的是后缀数组。如下目标字符串： banana其长度为6，则后缀数组的长度为6，分别是以b开头的字串（长度为6），以a开头的字串（长度为5），以n开头的字串（长度为4）。。。最后一个是以a开头的字串（长度为1）。
后缀[0] banana
后缀[1] anana
后缀[2] nana
后缀[3] ana
后缀[4] na
后缀[5] a

所以，算法的流程是，先求出字符串的后缀数组，将后缀数组字母排序，然后顺次比较（避免了两两比较）即可。
后缀[0] a
后缀[1] ana
后缀[2] anana
后缀[3] banana
后缀[4] na
后缀[5] nana

最终的比较结果是：排序后相邻的后缀[1]（ana）和后缀[2]（anana）具有最长的公共前缀 ana，它就是最长的重复子串。

时间复杂度分析：生成后缀数组 O(n)；排序需要 O(n log n) 次比较，而每次字符串比较也是一个一个字符进行的，最坏 O(n)，所以排序是 O(n^2 log n)；依次检测相邻的两个字符串共 n-1 次，每次最坏 O(n)，合计 O(n^2)。总的时间复杂度是 O(n^2 log n)，优于暴力方法的 O(n^3)。可以看出，复杂度主要取决于排序，可以使用倍增算法在 O(n log n) 时间内高效地生成排好序的后缀数组，从而降低排序部分的复杂度。

书中给出的C语言源代码如下：

/* Copyright (C) 1999 Lucent Technologies */
/* From 'Programming Pearls' by Jon Bentley */

/* longdup.c -- Print longest string duplicated M times */

/*
 * Note: compared with the listing in the book, this version gives the
 * qsort() comparator the signature qsort() requires, stops reading before
 * the input buffer overflows, and prints an empty line instead of reading
 * an uninitialized pointer when the input is too short to contain a
 * duplicate. The algorithm itself is unchanged.
 */

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

/* qsort() comparator: each element is a char* pointing at one suffix. */
int pstrcmp(const void *p, const void *q)
{
    return strcmp(*(char * const *)p, *(char * const *)q);
}

/* Length of the common prefix of the strings p and q. */
int comlen(char *p, char *q)
{
    int i = 0;
    while (*p && (*p++ == *q++))
        i++;
    return i;
}

#define M 1
#define MAXN 5000000
char c[MAXN], *a[MAXN];         /* c: the input text; a: its suffix array */

int main()
{
    int i, ch, n = 0, maxi = 0, maxlen = -1;

    /* Keep one byte free for the terminating 0; longer input is truncated. */
    while (n < MAXN - 1 && (ch = getchar()) != EOF) {
        a[n] = &c[n];
        c[n++] = ch;
    }
    c[n] = 0;
    qsort(a, n, sizeof(char *), pstrcmp);
    /* After sorting, suffixes sharing a long prefix are M positions apart. */
    for (i = 0; i < n - M; i++) {
        int len = comlen(a[i], a[i + M]);
        if (len > maxlen) {
            maxlen = len;
            maxi = i;
        }
    }
    if (maxlen <= 0) {
        /* No pair was compared, or no pair shares any prefix. */
        printf("\n");
        return 0;
    }
    printf("%.*s\n", maxlen, a[maxi]);
    return 0;
}

*****************************************************************************

我用java实现如下：

/**
 * Created: 2014-09-02 10:05:11. Project: Test.
 *
 * <p>Finds the longest repeated substring of a string by means of a suffix
 * array.
 *
 * @author Cao Yanfeng
 * @since JDK 1.6.0_21
 */
public class MaxLongDup {

    /**
     * Runs the search on the sample string "banana".
     *
     * @param args unused
     */
    public static void main(String[] args) {
        getMaxLongDup("banana");
    }

    /**
     * Prints the longest substring that occurs at least twice in
     * {@code string}, together with its length. When nothing repeats
     * (including empty and one-character input) an empty substring and
     * length 0 are printed.
     *
     * <p>Relies on two library calls: {@code Arrays.sort()} and
     * {@code String.substring(int)}.
     *
     * @param string the text to search
     */
    public static void getMaxLongDup(String string) {
        int length = string.length();
        int maxlen = 0;
        int position = -1;
        String[] suffix = new String[length];
        for (int i = 0; i < suffix.length; i++) {
            // new String(...) detaches each suffix from the original's character
            // array, which substring() shares on JDK 6. Newer JDKs copy in
            // substring() already, so the wrapper is redundant there.
            suffix[i] = new String(string.substring(i));
        }
        java.util.Arrays.sort(suffix);
        // After sorting, the two suffixes sharing the longest prefix are adjacent.
        for (int i = 0; i < suffix.length - 1; i++) {
            int comlen = commonLength(suffix[i], suffix[i + 1]);
            if (comlen > maxlen) {
                maxlen = comlen;
                position = i;
            }
        }
        // Only the shared prefix is the repeated substring, not the whole suffix.
        String duplicate = (position < 0) ? "" : suffix[position].substring(0, maxlen);
        System.out.println("最大重复子串：" + duplicate + ";长度为:" + maxlen);
    }

    /**
     * Returns the length of the common prefix of two strings.
     *
     * @param string1 the first string
     * @param string2 the second string
     * @return the number of leading characters the two strings share
     */
    public static int commonLength(String string1, String string2) {
        int limit = Math.min(string1.length(), string2.length());
        int counter = 0;
        while (counter < limit && string1.charAt(counter) == string2.charAt(counter)) {
            counter++;
        }
        return counter;
    }
}

**************************************************************************

注意，我这里在获得后缀数组的时候使用了两个系统函数：

1） Arrays.sort()。Arrays.sort() 对基本数据类型的数组使用快速排序（JDK 7 起为双轴快速排序），对对象数组使用的是归并排序（JDK 6 为改进的归并排序，JDK 7 起为 TimSort），而不是堆排序；这里排序的是 String 对象数组，所以用的是归并排序；

2） substring(int index)。根据以下参考文献的说法，在 JDK 6 中 substring 并没有复制底层的字符数组，而是与原来的 String 对象共享同一个 char[]，只是记录了起始偏移和长度（从 JDK 7u6 开始，substring 会复制所需的字符）。（参考：http://www.cnblogs.com/tedzhao/archive/2012/07/31/Java_String_substring.html）。所以我使用的是下面两行中的后者而不是前者。

// suffix[i]=string.substring(i);

suffix[i]=new String(string.substring(i));

0 0