后缀数组学习笔记

来源:互联网 发布:淘宝活动大全2017 编辑:程序博客网 时间:2024/05/18 03:25

要用好后缀数组要先理解里面几个数组的概念:
sa[i]表示字典序排名第i(即第i小)的后缀的起始下标(排名依次是1~len(string));
rank[i]表示下标为i的后缀字典序排名;
height[i]表示后缀sa[i]与后缀sa[i-1]的最长公共前缀的长度.

一个性质:

$\mathrm{LCP}(\mathrm{suffix}[i],\mathrm{suffix}[j])=\min\{height[rank[i]+1],\ height[rank[i]+2],\ \dots,\ height[rank[j]]\}\quad(rank[i]<rank[j])$

倍增法求出这些数组, 记得在原串的末尾增加一个0(原串中其余字符都必须大于0, 传给da的n是不含这个0的长度):

int t1[maxn],t2[maxn],c[maxn];bool cmp(int *r,int a,int b,int l){    return r[a] == r[b] && r[a+l] == r[b+l];}void da(int str[],int sa[],int rank[],int height[],int n,int m){    n++;    int i, j, p, *x = t1, *y = t2;    //第一轮基数排序,如果s的最大值很大,可改为快速排序    for(i = 0; i < m; i++)c[i] = 0;    for(i = 0; i < n; i++)c[x[i] = str[i]]++;    for(i = 1; i < m; i++)c[i] += c[i-1];    for(i = n-1; i >= 0; i--)sa[--c[x[i]]] = i;    for(j = 1; j <= n; j <<= 1)    {        p = 0;        //直接利用sa数组排序第二关键字        for(i = n-j; i < n; i++)y[p++] = i;//后面的j个数第二关键字为空的最小        for(i = 0; i < n; i++)if(sa[i] >= j)y[p++] = sa[i] - j;        //这样数组y保存的就是按照第二关键字排序的结果        //基数排序第一关键字        for(i = 0; i < m; i++)c[i] = 0;        for(i = 0; i < n; i++)c[x[y[i]]]++;        for(i = 1; i < m; i++)c[i] += c[i-1];        for(i = n-1; i >= 0; i--)sa[--c[x[y[i]]]] = y[i];        //根据sa和x数组计算新的x数组        swap(x,y);        p = 1;        x[sa[0]] = 0;        for(i = 1; i < n; i++)            x[sa[i]] = cmp(y,sa[i-1],sa[i],j)?p-1:p++;        if(p >= n)break;        m = p;//下次基数排序的最大值    }    int k = 0;    n--;    for(i = 0; i <= n; i++)rank[sa[i]] = i;    for(i = 0; i < n; i++)    {        if(k)   k--;        j = sa[rank[i]-1];        while(str[i+k] == str[j+k])            k++;        height[rank[i]] = k;    }}

POJ 1743 最长不重叠相同子串
题意: 求两个最长的不重叠子串, 满足两个串对应位置上元素的差值处处相等(即一个串整体加上某个常数后得到另一个串).
对于相邻的两个数直接做差得到一个新串, 直接对新串求最长不重叠重复子串. 非常经典的做法: 二分长度, 然后对height数组分组, height不小于这个长度的相邻后缀分成一组, 然后判断组中后缀下标的最大值与最小值之差是否严格大于这个长度(在差分串上必须严格大于, 原串中的两段才不会共用同一个元素).
数据很水~

#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
#include <cmath>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <set>
#define maxn 20005
using namespace std;

// Scratch buffers for da(): two alternating key arrays and the bucket
// counters of the counting sort.
int t1[maxn], t2[maxn], c[maxn];

// True when positions a and b have the same (r[.], r[. + l]) key pair.
bool cmp(int *r, int a, int b, int l)
{
    return r[a] == r[b] && r[a + l] == r[b + l];
}

// Prefix-doubling suffix array construction followed by rank/height.
//   str    : symbols 0..n-1, all in (0, m), with str[n] == 0 as sentinel
//   sa     : output, sa[0] is the sentinel suffix, sa[1..n] the real ones
//   rank   : output, inverse of sa
//   height : output, height[i] = common-prefix length of sa[i] and sa[i-1]
//   n      : length without the sentinel
//   m      : exclusive upper bound on symbol values
void da(int str[], int sa[], int rank[], int height[], int n, int m)
{
    n++;  // sort the sentinel suffix together with the real ones
    int i, j, p, *x = t1, *y = t2;

    // First counting sort on single symbols.
    for (i = 0; i < m; i++) c[i] = 0;
    for (i = 0; i < n; i++) c[x[i] = str[i]]++;
    for (i = 1; i < m; i++) c[i] += c[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--c[x[i]]] = i;

    for (j = 1; j <= n; j <<= 1) {
        // Order by second key using the current sa; the last j positions
        // have an empty second key and therefore come first.
        p = 0;
        for (i = n - j; i < n; i++) y[p++] = i;
        for (i = 0; i < n; i++)
            if (sa[i] >= j) y[p++] = sa[i] - j;

        // Counting sort by first key, preserving the second-key order.
        for (i = 0; i < m; i++) c[i] = 0;
        for (i = 0; i < n; i++) c[x[y[i]]]++;
        for (i = 1; i < m; i++) c[i] += c[i - 1];
        for (i = n - 1; i >= 0; i--) sa[--c[x[y[i]]]] = y[i];

        // Recompute the keys from sa and the previous keys.
        swap(x, y);
        p = 1;
        x[sa[0]] = 0;
        for (i = 1; i < n; i++)
            x[sa[i]] = cmp(y, sa[i - 1], sa[i], j) ? p - 1 : p++;
        if (p >= n) break;
        m = p;  // key range of the next counting sort
    }

    int k = 0;
    n--;
    for (i = 0; i <= n; i++) rank[sa[i]] = i;
    for (i = 0; i < n; i++) {
        if (k) k--;
        j = sa[rank[i] - 1];
        while (str[i + k] == str[j + k]) k++;
        height[rank[i]] = k;
    }
}

// Named rnk rather than rank: with `using namespace std`, a global called
// `rank` is ambiguous with std::rank when compiled as C++11 or later.
int rnk[maxn], height[maxn];
int str[maxn];
int sa[maxn];
int n;  // number of elements in the difference sequence once main has built it
#define INF 111111

// Returns true when some substring of length x of the difference sequence
// occurs at two start positions that are more than x apart. The comparison
// must be strict: two difference windows of length x correspond to two note
// windows of length x + 1, which are disjoint only if the starts differ by
// at least x + 1.
bool ok(int x)
{
    int Min = INF, Max = 0;
    for (int i = 1; i <= n; i++) {
        if (height[i] >= x) {
            Min = min(Min, sa[i]);
            Max = max(Max, sa[i]);
        } else {
            // A group of suffixes sharing a prefix of length >= x ends here.
            if (Max - Min > x)
                return 1;
            Max = sa[i];
            Min = sa[i];
        }
    }
    return Max - Min > x;
}

// Binary search for the largest x accepted by ok(); x = 0 is always feasible.
int solve()
{
    int l = 0, r = n / 2;
    while (r - l > 1) {
        int mid = (r + l) >> 1;
        if (ok(mid)) l = mid;
        else r = mid;
    }
    return (ok(r) ? r : l);
}

int main()
{
    while (scanf("%d", &n) == 1 && n) {
        for (int i = 0; i < n; i++) {
            scanf("%d", &str[i]);
        }
        // Two disjoint themes of length 5 need at least 10 notes.
        if (n <= 9) {
            printf("0\n");
            continue;
        }
        // Replace the notes by adjacent differences, shifted by 100 so that
        // every symbol is positive and stays below the bucket count 188.
        n--;
        for (int i = 0; i < n; i++) {
            str[i] = 100 + str[i + 1] - str[i];
        }
        str[n] = 0;
        // Pass n, not n + 1: da() expects the sentinel at str[n]. With n + 1
        // the sentinel slot would be str[n + 1], which holds stale data.
        da(str, sa, rnk, height, n, 188);
        // A repeated difference window of length L is a theme of L + 1 notes.
        int ans = solve() + 1;
        printf("%d\n", (ans >= 5 ? ans : 0));
    }
    return 0;
}

POJ 3261 求重复k次的最长子串
还是二分结果, 按照height分组, 判断是不是存在后缀个数不少于k的组.
数据还是很水, 不加离散化都能过~

#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
#include <cmath>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <set>
#define maxn 200005
using namespace std;

// Scratch buffers for da(): two alternating key arrays and the bucket
// counters of the counting sort.
int t1[maxn], t2[maxn], c[maxn];

// True when positions a and b have the same (r[.], r[. + l]) key pair.
bool cmp(int *r, int a, int b, int l)
{
    return r[a] == r[b] && r[a + l] == r[b + l];
}

// Prefix-doubling suffix array construction followed by rank/height.
//   str    : symbols 0..n-1, all in (0, m), with str[n] == 0 as sentinel
//   sa     : output, sa[0] is the sentinel suffix, sa[1..n] the real ones
//   rank   : output, inverse of sa
//   height : output, height[i] = common-prefix length of sa[i] and sa[i-1]
//   n      : length without the sentinel
//   m      : exclusive upper bound on symbol values
void da(int str[], int sa[], int rank[], int height[], int n, int m)
{
    n++;  // sort the sentinel suffix together with the real ones
    int i, j, p, *x = t1, *y = t2;

    // First counting sort on single symbols.
    for (i = 0; i < m; i++) c[i] = 0;
    for (i = 0; i < n; i++) c[x[i] = str[i]]++;
    for (i = 1; i < m; i++) c[i] += c[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--c[x[i]]] = i;

    for (j = 1; j <= n; j <<= 1) {
        // Order by second key using the current sa; the last j positions
        // have an empty second key and therefore come first.
        p = 0;
        for (i = n - j; i < n; i++) y[p++] = i;
        for (i = 0; i < n; i++)
            if (sa[i] >= j) y[p++] = sa[i] - j;

        // Counting sort by first key, preserving the second-key order.
        for (i = 0; i < m; i++) c[i] = 0;
        for (i = 0; i < n; i++) c[x[y[i]]]++;
        for (i = 1; i < m; i++) c[i] += c[i - 1];
        for (i = n - 1; i >= 0; i--) sa[--c[x[y[i]]]] = y[i];

        // Recompute the keys from sa and the previous keys.
        swap(x, y);
        p = 1;
        x[sa[0]] = 0;
        for (i = 1; i < n; i++)
            x[sa[i]] = cmp(y, sa[i - 1], sa[i], j) ? p - 1 : p++;
        if (p >= n) break;
        m = p;  // key range of the next counting sort
    }

    int k = 0;
    n--;
    for (i = 0; i <= n; i++) rank[sa[i]] = i;
    for (i = 0; i < n; i++) {
        if (k) k--;
        j = sa[rank[i] - 1];
        while (str[i + k] == str[j + k]) k++;
        height[rank[i]] = k;
    }
}

// Named rnk rather than rank: with `using namespace std`, a global called
// `rank` is ambiguous with std::rank when compiled as C++11 or later.
int rnk[maxn], height[maxn];
int str[maxn];
int sa[maxn];
int n, k;  // sequence length and required number of occurrences

// Returns true when some substring of length x occurs at least k times
// (occurrences may overlap): looks for a run of consecutive suffixes in sa
// order whose adjacent common prefixes are all >= x and that has >= k members.
bool ok(int x)
{
    // Every substring occurs at least once, so k <= 1 only needs x to fit.
    if (k <= 1)
        return x <= n;
    int ans = 1;  // size of the current run
    for (int i = 2; i <= n; i++) {
        if (height[i] >= x) {
            ans++;
            if (ans >= k)
                return 1;
        } else {
            ans = 1;
        }
    }
    return 0;
}

// Binary search for the largest x accepted by ok(); x = 0 is always feasible.
int solve()
{
    int l = 0, r = n;
    while (r - l > 1) {
        int mid = (l + r) >> 1;
        if (ok(mid)) l = mid;
        else r = mid;
    }
    return (ok(r) ? r : l);
}

int cnt, num[maxn], gg[maxn];

// Coordinate compression: replaces every str[i] (i < n) by its 1-based index
// among the distinct input values, so that all symbols are positive and
// small enough for counting sort. Returns cnt + 1, an exclusive upper bound
// on the compressed symbols.
int lisanhua()
{
    cnt = 0;
    // Collect the actual values; copying the indices here would merge every
    // value >= n into a single symbol.
    for (int i = 0; i < n; i++) num[i] = str[i];
    sort(num, num + n);
    for (int i = 0; i < n; i++)
        if (!i || num[i] != num[i - 1])
            gg[cnt++] = num[i];
    for (int i = 0; i < n; i++)
        str[i] = lower_bound(gg, gg + cnt, str[i]) - gg + 1;
    return cnt + 1;
}

int main()
{
    while (cin >> n >> k) {
        for (int i = 0; i < n; i++) {
            cin >> str[i];
        }
        str[n] = 0;  // sentinel required by da()
        int m = lisanhua();
        da(str, sa, rnk, height, n, m + 2);
        int ans = solve();
        cout << ans << '\n';
    }
    return 0;
}
/*
sample input:
2 2
1 1
*/

SPOJ 694 不重复子串个数
根据sa数组和height数组的含义, 后缀sa[i]总共有 $n-sa[i]$ 个前缀, 其中有 $height[i]$ 个前缀和排在它前面的后缀重复, 所以要减去. 最后答案是 $\sum_{i=1}^{n}\left(n-sa[i]-height[i]\right)$.

#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
#include <cmath>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <set>
#define maxn 200005
using namespace std;

// Scratch buffers for da(): two alternating key arrays and the bucket
// counters of the counting sort.
int t1[maxn], t2[maxn], c[maxn];

// True when positions a and b have the same (r[.], r[. + l]) key pair.
bool cmp(int *r, int a, int b, int l)
{
    return r[a] == r[b] && r[a + l] == r[b + l];
}

// Prefix-doubling suffix array construction followed by rank/height.
//   str    : symbols 0..n-1, all in (0, m), with str[n] == 0 as sentinel
//   sa     : output, sa[0] is the sentinel suffix, sa[1..n] the real ones
//   rank   : output, inverse of sa
//   height : output, height[i] = common-prefix length of sa[i] and sa[i-1]
//   n      : length without the sentinel
//   m      : exclusive upper bound on symbol values
void da(int str[], int sa[], int rank[], int height[], int n, int m)
{
    n++;  // sort the sentinel suffix together with the real ones
    int i, j, p, *x = t1, *y = t2;

    // First counting sort on single symbols.
    for (i = 0; i < m; i++) c[i] = 0;
    for (i = 0; i < n; i++) c[x[i] = str[i]]++;
    for (i = 1; i < m; i++) c[i] += c[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--c[x[i]]] = i;

    for (j = 1; j <= n; j <<= 1) {
        // Order by second key using the current sa; the last j positions
        // have an empty second key and therefore come first.
        p = 0;
        for (i = n - j; i < n; i++) y[p++] = i;
        for (i = 0; i < n; i++)
            if (sa[i] >= j) y[p++] = sa[i] - j;

        // Counting sort by first key, preserving the second-key order.
        for (i = 0; i < m; i++) c[i] = 0;
        for (i = 0; i < n; i++) c[x[y[i]]]++;
        for (i = 1; i < m; i++) c[i] += c[i - 1];
        for (i = n - 1; i >= 0; i--) sa[--c[x[y[i]]]] = y[i];

        // Recompute the keys from sa and the previous keys.
        swap(x, y);
        p = 1;
        x[sa[0]] = 0;
        for (i = 1; i < n; i++)
            x[sa[i]] = cmp(y, sa[i - 1], sa[i], j) ? p - 1 : p++;
        if (p >= n) break;
        m = p;  // key range of the next counting sort
    }

    int k = 0;
    n--;
    for (i = 0; i <= n; i++) rank[sa[i]] = i;
    for (i = 0; i < n; i++) {
        if (k) k--;
        j = sa[rank[i] - 1];
        while (str[i + k] == str[j + k]) k++;
        height[rank[i]] = k;
    }
}

// Named rnk rather than rank: with `using namespace std`, a global called
// `rank` is ambiguous with std::rank when compiled as C++11 or later.
int rnk[maxn], height[maxn];
int str[maxn];
char s[maxn];
int sa[maxn];
int n;

// For every test case, prints the number of distinct non-empty substrings.
int main()
{
    ios::sync_with_stdio(0);
    int t;
    cin >> t;
    while (t--) {
        cin >> s;
        n = strlen(s);
        // Go through unsigned char so that a byte >= 0x80 cannot become a
        // negative bucket index where plain char is signed.
        for (int i = 0; i < n; i++)
            str[i] = static_cast<unsigned char>(s[i]);
        str[n] = 0;  // sentinel required by da()
        da(str, sa, rnk, height, n, 256);
        // The suffix at sa[i] has n - sa[i] prefixes; height[i] of them
        // already occur as prefixes of the suffix sorted just before it.
        long long ans = 0;
        for (int i = 1; i <= n; i++) {
            ans += n - sa[i] - height[i];
        }
        cout << ans << '\n';
    }
    return 0;
}

POJ 2774 最长公共子串
把第二个串放到第一个串的后面, 中间用一个失配符隔开, 然后遍历height数组维护最大子串长度. 要避免出现在同一串中的公共子串.

#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
#include <cmath>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <set>
#define maxn 200005
using namespace std;

// Scratch buffers for da(): two alternating key arrays and the bucket
// counters of the counting sort.
int t1[maxn], t2[maxn], c[maxn];

// True when positions a and b have the same (r[.], r[. + l]) key pair.
bool cmp(int *r, int a, int b, int l)
{
    return r[a] == r[b] && r[a + l] == r[b + l];
}

// Prefix-doubling suffix array construction followed by rank/height.
//   str    : symbols 0..n-1, all in (0, m), with str[n] == 0 as sentinel
//   sa     : output, sa[0] is the sentinel suffix, sa[1..n] the real ones
//   rank   : output, inverse of sa
//   height : output, height[i] = common-prefix length of sa[i] and sa[i-1]
//   n      : length without the sentinel
//   m      : exclusive upper bound on symbol values
void da(int str[], int sa[], int rank[], int height[], int n, int m)
{
    n++;  // sort the sentinel suffix together with the real ones
    int i, j, p, *x = t1, *y = t2;

    // First counting sort on single symbols.
    for (i = 0; i < m; i++) c[i] = 0;
    for (i = 0; i < n; i++) c[x[i] = str[i]]++;
    for (i = 1; i < m; i++) c[i] += c[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--c[x[i]]] = i;

    for (j = 1; j <= n; j <<= 1) {
        // Order by second key using the current sa; the last j positions
        // have an empty second key and therefore come first.
        p = 0;
        for (i = n - j; i < n; i++) y[p++] = i;
        for (i = 0; i < n; i++)
            if (sa[i] >= j) y[p++] = sa[i] - j;

        // Counting sort by first key, preserving the second-key order.
        for (i = 0; i < m; i++) c[i] = 0;
        for (i = 0; i < n; i++) c[x[y[i]]]++;
        for (i = 1; i < m; i++) c[i] += c[i - 1];
        for (i = n - 1; i >= 0; i--) sa[--c[x[y[i]]]] = y[i];

        // Recompute the keys from sa and the previous keys.
        swap(x, y);
        p = 1;
        x[sa[0]] = 0;
        for (i = 1; i < n; i++)
            x[sa[i]] = cmp(y, sa[i - 1], sa[i], j) ? p - 1 : p++;
        if (p >= n) break;
        m = p;  // key range of the next counting sort
    }

    int k = 0;
    n--;
    for (i = 0; i <= n; i++) rank[sa[i]] = i;
    for (i = 0; i < n; i++) {
        if (k) k--;
        j = sa[rank[i] - 1];
        while (str[i + k] == str[j + k]) k++;
        height[rank[i]] = k;
    }
}

// Named rnk rather than rank: with `using namespace std`, a global called
// `rank` is ambiguous with std::rank when compiled as C++11 or later.
int rnk[maxn], height[maxn];
int str[maxn];
char s1[maxn], s2[maxn];
int sa[maxn];
int n, m, len;  // lengths of s1, s2, and of the joined string s1 + sep + s2

// True when start positions i and j lie on opposite sides of the separator
// stored at index n, i.e. one suffix starts in s1 and the other in s2.
bool legal(int i, int j)
{
    if (i > j) swap(i, j);
    return (i < n && j > n);
}

// Prints the largest height[i] whose two adjacent suffixes start in
// different input strings.
void solve()
{
    int Max = 0;
    for (int i = 2; i <= len; i++) {
        if (height[i] >= Max && legal(sa[i], sa[i - 1]))
            Max = height[i];
    }
    cout << Max << '\n';
}

int main()
{
    ios::sync_with_stdio(0);
    while (cin >> s1 >> s2) {
        n = strlen(s1), m = strlen(s2);
        // Joined string: s1, then the separator symbol 1, then s2, then the
        // 0 sentinel required by da().
        for (int i = 0; i < n; i++) str[i] = s1[i];
        str[n] = 1;
        for (int i = n + 1; i <= n + m; i++) str[i] = s2[i - n - 1];
        len = n + m + 1;
        str[len] = 0;
        da(str, sa, rnk, height, len, 233);
        solve();
    }
    return 0;
}
0 0
原创粉丝点击