AC算法原理与实现(模式匹配)

来源:互联网 发布:windows live 账号注册 编辑:程序博客网 时间:2024/06/04 00:51
最近学习模式匹配知识,了解了KMP与BM算法后,AC的了解也必不可少。上网学习了下,把重要的知识总结一下。
Aho_Corasick自动机匹配算法是最著名的多模式匹配算法之一。AC自动机算法分为3步:构造一颗Trie树,构造失败指针和模式匹配过程。
1.建立一颗Trie的过程比较简单(可参考源代码)
2.构造失败指针
构造失败指针的过程概括起来就一句话:设这个节点上的字母为C,沿着他父亲的失败指针走,直到走到一个节点,他的儿子中也有字母为C的节点。然后把当前节点的失败指针指向那个字母也为C的儿子。如果一直走到了root都没找到,那就把失败指针指向root
3.模式匹配过程
匹配过程分两种情况:
(1)当前字符匹配,表示从当前节点沿着树边有一条路径可以到达目标字符,此时只需沿该路径走向下一个节点继续匹配即可,目标字符串指针移向下个字符继续匹配;
(2)当前字符不匹配,则去当前节点失败指针所指向的字符继续匹配,匹配过程随着指针指向root结束
下面给出AC算法的具体C++实现:
#include <iostream>
#include <cstring>
#include <cstdio>
using namespace std;
const int MAXN = 1000001; //模式串的最大长度MAXN - 1
const int MAXM = 51; //单词最大长度为MAXM - 1
const int KEYSIZE = 26; //26个小写字母
struct Node {
      Node *fail;  //失败指针
      Node *next[KEYSIZE]; //儿子结点个数
      int count; //单词个数
      Node() {
            fail = NULL;
            count = 0;
            memset(next, 0, sizeof(next));
      }
}*q[MAXN / 2];
void insert(char *str, Node *root)
{
      Node *p = root;
      int i = 0;
      while(str[i]) {
           int index = str[i] - 'a';
           if(p -> next[index] == NULL)
                  p -> next[index] = new Node();
           p = p -> next[index];
           i ++;
      }
      p -> count ++; //在单词的最后一个结点count + 1,代表一个单词
}
void build_ac_automation(Node *root)
{
      root -> fail = NULL;
      int head, tail;
      head = tail = 0;
      q[tail ++] = root;
      while(head < tail) {
            Node *temp = q[head ++];
            for(int i = 0; i < KEYSIZE; i ++) {
                if(temp -> next[i] != NULL) {
                     if(temp == root) {
                          temp -> next[i] -> fail = root;
                     }else {
                          Node *p = temp -> fail;
                          while(p != NULL) {
                               if(p -> next[i] != NULL) {
                                         temp -> next[i] -> fail = p -> next[i];
                                    break;
                               }
                               p = p -> fail;
                          }
                          if(p == NULL)
                               temp -> next[i] -> fail = root;
                     }
                     q[tail ++] = temp -> next[i];
                }
           }
      }
}
int AC_search(char *str, Node *root)
{
      int i = 0, cnt = 0;
      Node *p = root;
      while(str[i]) {
           int index = str[i] - 'a';
           while(p -> next[index] == NULL && p != root) p = p -> fail;
           p = p -> next[index];
           p = (p == NULL) ? root : p;
           Node *temp = p;
           while(temp != root && temp -> count != -1) {
                 cnt += temp -> count;
                 temp -> count = -1;
                 temp = temp -> fail;
           }
           i ++;
      }
      return cnt;
}
int main()
{
      int n;
      Node *root;
      char keyword[MAXM]; //单词
      char str[MAXN]; //模式串
   printf("scanf the number of words-->\n");
      scanf("%d", &n);
      root = new Node();
   printf("scanf the words-->\n");
      while(n --) {
  scanf("%s", keyword);
  insert(keyword, root);
      }
      build_ac_automation(root);
   printf("scanf the text-->\n");
      scanf("%s", str);
      printf("there are %d words match\n", AC_search(str, root));
      
      return(0);
}
0 0