AC自动机

来源:互联网 发布:redis c语言接口 编辑:程序博客网 时间:2024/06/07 09:21

要学会AC自动机,我们必须知道什么是Trie,也就是字典树。最好对KMP算法也有些了解。Trie树和KMP算法我之前博客都有写过,感兴趣的可以看看。


简单叙述下问题,现在给出
"hsay";
"ah";
"sahe";
"he";
"say";
"herhb";
"aher";
"erhs"

共8个关键词,要问字符串"yasaherhsay"中这8个关键词有几个出现过。

答案是7(8个关键词中只有"herhb"没有出现,其余7个都在字符串中出现过)。

这就是一个多模式匹配问题。


AC自动机算法分为3步:构造一棵Trie树,构造失败指针和模式匹配过程。

失败指针和KMP算法中的next函数或称shift函数的功能类似。


                

上图(原文配图,此处未能显示)解释了失败指针的作用:某个节点的失败指针指向"该节点所代表字符串的、同样存在于Trie树中的最长真后缀"所对应的节点;当下一个字符匹配失败时,沿失败指针跳转继续匹配,而不必回到根节点从头开始。

// AC_automachine.cpp : Defines the entry point for the console application.
//
// Aho-Corasick multi-pattern matching demo over the lowercase alphabet a-z.
// Usage order: initiate_Trie() -> build_Trie_singleword() for every keyword ->
// initiate_fail_pointer(root, root) -> AC_automachine(root, text).
//
// NOTE: the Visual Studio precompiled header "stdafx.h" is no longer included;
// everything this file needs comes from the standard headers below (plus
// <tchar.h> on Windows for _tmain/_TCHAR).

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <set>
#include <string>
#include <vector>

#ifdef _WIN32
#include <tchar.h>  // maps _tmain / _TCHAR to main / char (or the wide variants)
#else
// No <tchar.h> outside Windows: _TCHAR is plain char and _tmain is an ordinary
// function, so a main() that forwards to it is needed to link standalone there.
typedef char _TCHAR;
#endif

using namespace std;

#define MAXSIZE 26  // alphabet size: 'a'..'z'

// One trie node. The path from the root to a node spells a keyword prefix.
struct TrieNode
{
    TrieNode* next[MAXSIZE];   // child for each letter, NULL when absent
    TrieNode* parent;          // NULL for the root
    vector<TrieNode*> fail;    // nodes spelling every proper, non-empty suffix of this
                               // node's path that exists in the trie, shortest first
                               // (so fail.back() is the classic failure link)
    char p;                    // letter on the edge leading into this node
    int Num;                   // root: number of distinct keywords; others: keywords through this node
    bool isword;               // true when a keyword ends exactly here

    // Start every node in a fully defined, empty state.
    TrieNode() : parent(NULL), p('\0'), Num(0), isword(false)
    {
        for (int i = 0; i < MAXSIZE; i++)
            next[i] = NULL;
    }
};

// Keywords found so far. Shared by every AC_automachine() call and never
// cleared by this file; callers clear it to start a fresh count.
set<string> re;

// Maps 'a'..'z' to 0..25; returns -1 for any other character so callers never
// index TrieNode::next out of bounds.
static int letter_index(char c)
{
    if (c < 'a' || c > 'z')
        return -1;
    return c - 'a';
}

// Creates the empty root node of a new trie. Release it with destroy_Trie().
TrieNode* initiate_Trie()
{
    return new TrieNode;
}

// Frees `root` and every node below it. Accepts NULL.
void destroy_Trie(TrieNode* root)
{
    if (root == NULL)
        return;
    for (int i = 0; i < MAXSIZE; i++)
        destroy_Trie(root->next[i]);
    delete root;
}

// Returns true when `str` was inserted as a complete keyword.
// Returns false for NULL arguments and for strings containing characters
// outside a-z (such strings can never be stored).
bool search(TrieNode* root, const char* str)
{
    if (root == NULL || str == NULL)
        return false;
    TrieNode* cur = root;
    for (; *str != '\0'; ++str)
    {
        const int k = letter_index(*str);
        if (k < 0 || cur->next[k] == NULL)
            return false;
        cur = cur->next[k];
    }
    return cur->isword;
}

// Inserts keyword `str` into the trie and returns `root`.
// The trie is left untouched when the keyword is already present, is empty or
// NULL, or contains a character outside a-z (validated up front so a rejected
// word never leaves a partial path behind).
// Failure pointers are not updated here: call initiate_fail_pointer(root, root)
// again after the last insertion.
TrieNode* build_Trie_singleword(TrieNode* root, const char* str)
{
    if (root == NULL || str == NULL || *str == '\0')
        return root;
    for (const char* c = str; *c != '\0'; ++c)
        if (letter_index(*c) < 0)
            return root;
    if (::search(root, str))
        return root;

    root->Num += 1;
    TrieNode* cur = root;
    for (; *str != '\0'; ++str)
    {
        const int k = letter_index(*str);
        if (cur->next[k] == NULL)
        {
            TrieNode* child = new TrieNode;
            child->p = *str;
            child->Num = 1;
            child->parent = cur;
            cur->next[k] = child;
        }
        else
        {
            cur->next[k]->Num += 1;
        }
        cur = cur->next[k];
    }
    cur->isword = true;
    return root;
}

// (Re)builds TrieNode::fail for `node` and its whole subtree; call it as
// initiate_fail_pointer(root, root) once all keywords are inserted.
// Each node's list is computed directly from the root, so a subtree can be
// rebuilt on its own and repeated calls give the same result.
// Side effect kept from the original: writes the letter of every visited
// non-root node to std::cout (depth-first, pre-order) as a trace.
void initiate_fail_pointer(TrieNode* root, TrieNode* node)
{
    if (root == NULL || node == NULL)
        return;

    if (node != root)
    {
        cout << node->p;

        // Spell the path root -> node.
        string path;
        for (TrieNode* n = node; n != NULL && n != root; n = n->parent)
            path += n->p;
        std::reverse(path.begin(), path.end());

        // Try every proper suffix, shortest first, so fail.back() ends up
        // being the longest one.
        node->fail.clear();
        for (size_t len = 1; len < path.size(); ++len)
        {
            TrieNode* hit = root;
            for (size_t i = path.size() - len; i < path.size() && hit != NULL; ++i)
            {
                const int k = letter_index(path[i]);
                hit = (k < 0) ? NULL : hit->next[k];
            }
            if (hit != NULL)
                node->fail.push_back(hit);
        }
    }

    for (int i = 0; i < MAXSIZE; i++)
        if (node->next[i] != NULL)
            initiate_fail_pointer(root, node->next[i]);
}

// Adds the keyword ending at `node` to `re`; bumps `count` only when it was
// not recorded before.
static void record_match(TrieNode* root, TrieNode* node, int& count)
{
    string word;
    for (TrieNode* n = node; n != NULL && n != root; n = n->parent)
        word += n->p;
    std::reverse(word.begin(), word.end());
    if (re.insert(word).second)
        ++count;
}

// Scans `str` once and returns how many keywords of the trie were found that
// were not already in the global set `re`; every keyword found is added to `re`.
// Requires initiate_fail_pointer(root, root) to have been run after the last
// insertion. Characters outside a-z cannot be part of any keyword, so they
// simply restart matching at the root. Returns 0 for NULL arguments.
int AC_automachine(TrieNode* root, const char* str)
{
    if (root == NULL || str == NULL)
        return 0;

    int count = 0;
    TrieNode* state = root;
    for (; *str != '\0'; ++str)
    {
        const int k = letter_index(*str);
        if (k < 0)
        {
            state = root;
            continue;
        }

        // On a mismatch fall back to the longest suffix still in the trie.
        while (state != root && state->next[k] == NULL)
            state = state->fail.empty() ? root : state->fail.back();
        if (state->next[k] != NULL)
            state = state->next[k];
        if (state == root)
            continue;

        // Report the keyword ending here plus every keyword that is a suffix of it.
        if (state->isword)
            record_match(root, state, count);
        for (size_t i = 0; i < state->fail.size(); ++i)
            if (state->fail[i]->isword)
                record_match(root, state->fail[i], count);
    }
    return count;
}

// Demo: builds the eight sample keywords and prints how many of them occur in
// "yasaherhsay" (expected: 7).
int _tmain(int argc, _TCHAR* argv[])
{
    (void)argc;
    (void)argv;

    TrieNode* root = initiate_Trie();
    root = build_Trie_singleword(root, "hsay");
    root = build_Trie_singleword(root, "ah");
    root = build_Trie_singleword(root, "sahe");
    root = build_Trie_singleword(root, "he");
    root = build_Trie_singleword(root, "say");
    root = build_Trie_singleword(root, "herhb");
    root = build_Trie_singleword(root, "aher");
    root = build_Trie_singleword(root, "erhs");
    initiate_fail_pointer(root, root);
    cout << endl;  // ends the trace line printed by initiate_fail_pointer
    cout << AC_automachine(root, "yasaherhsay") << endl;
    destroy_Trie(root);
#ifdef _WIN32
    system("pause");  // keep the console window open when launched from the IDE
#endif
    return 0;
}



0 0