多模式匹配算法:AC自动机的C++实现

来源:互联网 发布:淘宝违规处罚 编辑:程序博客网 时间:2024/05/17 23:27

AC自动机(Aho-Corasick automaton)是用来处理多模式匹配问题的。

基本可认为是TrieTree+KMP。其中KMP是一种单模式匹配算法。

AC自动机的构造要点是失败指针的设置,用于匹配失败时跳转到另一节点继续匹配。同时在匹配的过程中也用来检索其他“同尾”的模式。

失败指针的设置:

用BFS。

对于每个节点,我们可以这样处理:设这个节点上的字母为C,沿着它父亲的失败指针走,直到走到一个节点,它的儿子中也有字母为C的节点。然后把当前节点的失败指针指向那个字母也为C的儿子。如果一直走到了root都没找到,那就把失败指针指向root。

最开始,我们把root加入队列(root的失败指针显然指向自己),这以后我们每处理一个点,就把它的所有儿子加入队列,直到全部设置完毕。

要点1:root的孩子的那一层比较特殊,若按照上述算法,它们的失败指针会指向自己,这会在匹配的过程中导致死循环。显然root的子节点的失败指针应指向root,我们应对这一层单独处理。

要点2:沿着父节点的失败指针走到root之后并不是立即将子节点的失败指针设置为root,而是在root的子节点中找寻字母为C的节点,将它设置为失败指针。若没有才设置为root。这样不会丢失模式只有一个字母的情况。


匹配过程:

一开始,Trie中有一个指针t1指向root,待匹配串中有一个指针t2指向串头。

接下来的操作和KMP很相似:

若:t2指向的字母,是Trie树中,t1指向的节点的儿子,那么

①t2+1,t1改为那个儿子的编号

如果t1所在的点可以顺着失败指针走到一个绿色点(指TrieTree中单词结尾字母对应的节点),那么以那个绿点结尾的单词就算出现过了。

否则:t1顺着当前节点的失败指针向上找,直到t2指向的字母是t1指向的节点的一个儿子,或者t1指向根。如果t1路过了一个绿色的点,那么以这个点结尾的单词就算出现过了。


c++实现:

// TrieTreeNode.h
#pragma once
#include <cstddef>
#include <iostream>
using namespace std;

/**
 * One node of the trie that backs the Aho-Corasick automaton.
 *
 * The node allocates and releases its own array of child slots. It does NOT
 * allocate `word` and does not delete the child nodes the slots point to;
 * both are left to whoever builds the tree.
 */
template<class T>
class TrieTreeNode
{
public:
    /**
     * Creates an empty node (also used to build the root).
     *
     * @param MaxBranch number of child slots to allocate; every slot starts
     *                  out as NULL.
     */
    explicit TrieTreeNode(int MaxBranch)
        : MaxBranchNum(MaxBranch),
          word(NULL),
          wordlen(0),
          ChildNodes(NULL),
          Freq(0),
          ID(-1),
          FailedPointer(NULL)
    {
        ChildNodes = new TrieTreeNode<T>*[MaxBranchNum];
        for (int i = 0; i < MaxBranchNum; i++)
            ChildNodes[i] = NULL;
    }

    /**
     * Releases the slot array allocated by the constructor.
     * Only the array itself is freed; the nodes referenced by the slots are
     * untouched. `delete[]` on NULL is a no-op, so an owner that has already
     * freed the array and reset the pointer to NULL stays safe.
     */
    ~TrieTreeNode()
    {
        delete[] ChildNodes;
    }

public:
    int MaxBranchNum;               // number of child slots in ChildNodes
    char* word;                     // pointer to the word's character string (NULL until set)
    int wordlen;                    // length of `word`
    TrieTreeNode<T> **ChildNodes;   // child slots, NULL where no child exists
    int Freq;                       // word frequency counter
    int ID;                         // insertion order while building the tree; can record where
                                    // a string first appeared (-1 = not set)
    TrieTreeNode<T> *FailedPointer; // node to jump to when matching fails (NULL until set)

private:
    // Copying would duplicate the ChildNodes pointer and free it twice, so it
    // is disabled (declared but intentionally never defined).
    TrieTreeNode(const TrieTreeNode&);
    TrieTreeNode& operator=(const TrieTreeNode&);
};


//TrieTree.h#pragma once#include<iostream>#include"TrieTreeNode.h"#include<queue>using namespace std;template<class T>class TrieTree{//Insert时为节点代表的单词word分配内存,Delete时只修改Freq而不删除word,Search时以Freq的数值作为判断依据,而不是根据word是否为NULLpublic:TrieTree(const int size);~TrieTree(){ Destroy(root); };void Insert(const T* str);//插入单词strvoid Insert(const T* str, const int num);//插入单词str,带有编号信息int Search(const T* str);//查找单词str,返回出现次数bool Delete(const T* str);//删除单词strvoid PrintALL();//打印trie树中所有节点对应的单词void PrintPre(const T* str);//打印以str为前缀的单词void SetFailedPointer();//设置匹配失效时的跳转指针int MatchKMP(char* str);//返回str中出现在该TrieTree中的单词个数private:void Print(const TrieTreeNode<T>* p);void Destroy(TrieTreeNode<T>* p);//由析构函数调用,释放以p为根节点的树的空间private:TrieTreeNode<T>* root;int MaxBranchNum;//最大分支数};template<class T>void TrieTree<T>::Destroy(TrieTreeNode<T>* p){if (!p)return;for (int i = 0; i < MaxBranchNum; i++)Destroy(p->ChildNodes[i]);if (!p->word){delete[] p->word;//只是释放了char数组word的空间,指针word本身的空间未释放,由后续的delete p释放p->word = NULL;}delete p;//释放节点空间p = NULL;//节点指针置为空//以上的置NULL的两句无太大意义,但是:编程习惯}template<class T>bool TrieTree<T>::Delete(const T* str){TrieTreeNode<T>* p = root;if (!str)return false;for (int i = 0; str[i]; i++){int index = str[i] - 'a';if (p->ChildNodes[index])p = p->ChildNodes[index];else return false;}p->Freq = 0;p->ID = -1;return true;}template<class T>void TrieTree<T>::PrintPre(const T* str){TrieTreeNode<T>* p = root;if (!str)return;for (int i = 0; str[i]; i++){int index = str[i] - 'a';if (p->ChildNodes[index])p = p->ChildNodes[index];else return;}cout << "以" << str << "为前缀的单词有:" << endl;Print(p);}template<class T>int TrieTree<T>::Search(const T* str){TrieTreeNode<T>* p = root;if (!str)return -1;for (int i = 0; str[i]; i++){int index = str[i] - 'a';if (p->ChildNodes[index])p = p->ChildNodes[index];else return 0;}return p->Freq;}template<class T>TrieTree<T>::TrieTree(const int size){MaxBranchNum = size;root = new TrieTreeNode<T>(MaxBranchNum);//根节点不储存字符root->FailedPointer = 
root;//设置失配指针}template<class T>void TrieTree<T>::Insert(const T* str){TrieTreeNode<T>* p = root;int i;for (i = 0; str[i]; i++){if (str[i]<'a' || str[i]>'z'){cout << "格式错误!" << endl;return;}int index = str[i] - 'a';//下溯的分支编号if (!p->ChildNodes[index])p->ChildNodes[index] = new TrieTreeNode<T>(MaxBranchNum);p = p->ChildNodes[index];}if (!p->word)//该词以前没有出现过{p->word = new char[strlen(str) + 1];strcpy_s(p->word, strlen(str) + 1, str);p->wordlen = i;//设置单词长度}p->Freq++;}template<class T>void TrieTree<T>::Insert(const T* str, const int num){TrieTreeNode<T>* p = root;int i;for (i = 0; str[i]; i++){if (str[i]<'a' || str[i]>'z'){cout << "格式错误!" << endl;return;}int index = str[i] - 'a';//下溯的分支编号if (!p->ChildNodes[index])p->ChildNodes[index] = new TrieTreeNode<T>(MaxBranchNum);p = p->ChildNodes[index];}if (!p->word)//该词以前没有出现过{p->word = new char[strlen(str) + 1];strcpy_s(p->word, strlen(str) + 1, str);p->wordlen = i;}p->Freq++;if (num < p->ID || p->ID == -1)//取最小的num作为当前节点代表的单词的IDp->ID = num;}template<class T>void TrieTree<T>::PrintALL(){Print(root);}template<class T>void TrieTree<T>::Print(const TrieTreeNode<T>* p){if (p == NULL)return;if (p->Freq > 0){cout << "单词:" << p->word << "频数:" << p->Freq;if (p->ID >= 0)cout << "ID:" << p->ID;cout << endl;}for (int i = 0; i < MaxBranchNum; i++){if (p->ChildNodes[i]){Print(p->ChildNodes[i]);}}}template<class T>int TrieTree<T>::MatchKMP(char* str){int count = 0;//str中出现的TrieTree中的单词个数char* p = str;//str中指针TrieTreeNode<T>* node = root;//TrieTree的节点指针while (*p){if (node->ChildNodes[*p - 'a'])//当前字符匹配成功{TrieTreeNode<T>* temp = node->ChildNodes[*p - 'a']->FailedPointer;while (temp != root)//在匹配的情况下,仍然沿FailedPointer搜索,可检索出所有模式。{if (temp->Freq > 0){count++;//cout << "temp->wordlen:" << temp->wordlen << endl;cout << (int)(p - str) - temp->wordlen + 1 << "" << temp->word << endl;//打印已匹配的模式的信息}temp = temp->FailedPointer;}node = node->ChildNodes[*p - 'a'];p++;if (node->Freq > 0){count++;//cout << "node->wordlen:" << node->wordlen << endl;cout << 
(int)(p - str) - node->wordlen << "" << node->word << endl;//打印已匹配的模式的信息}}else//失配,跳转{if (node == root)p++;elsenode = node->FailedPointer;}}return count;}template<class T>void TrieTree<T>::SetFailedPointer(){queue<TrieTreeNode<T>*> q;q.push(root);while (!q.empty()){TrieTreeNode<T>* father = q.front();//父节点q.pop();for (int i = 0; i < MaxBranchNum; i++)//对每一个子节点设置FailedPointer{if (father->ChildNodes[i]){TrieTreeNode<T>* child = father->ChildNodes[i];q.push(child);TrieTreeNode<T>* candidate = father->FailedPointer;//从father->FailedPointer开始游走的指针while (true){if (father == root){candidate = root;break;}if (candidate->ChildNodes[i])//有与child代表的字母相同的子节点{candidate = candidate->ChildNodes[i];break;}else{if (candidate == root)break;candidate = candidate->FailedPointer;//以上两句顺序不能交换,因为在root仍可以做一次匹配}}child->FailedPointer = candidate;}}}}



//main.cpp#pragma once#include<iostream>#include<fstream>#include"TrieTree.h"using namespace std;void test(TrieTree<char>* t){char* charbuffer = new char[50];char* cb = charbuffer;fstream fin("d:\\words.txt");if (!fin){cout << "File open error!\n";return;}char c;int num = 0;while ((c = fin.get()) != EOF){if (c >= '0'&&c <= '9')num = num * 10 + c - '0';if (c >= 'a'&&c <= 'z')*cb++ = c;if (c == '\n'){*cb = NULL;t->Insert(charbuffer, num);cb = charbuffer;num = 0;}}fin.close();}void main(){TrieTree<char>* t = new TrieTree<char>(26);char* c1 = "she";char* c2 = "shee";char* c3 = "he";char* c4 = "e";char* s = "shee";//要匹配的串t->Insert(c1);t->Insert(c2);t->Insert(c3);t->Insert(c4);//test(t);t->SetFailedPointer();t->PrintALL();cout << endl << "匹配结果为:" << endl;int result = t->MatchKMP(s);cout << "共匹配" << result << "处模式串" << endl;system("pause");}

运行结果:

对"shee"进行匹配

模式串为:"shee" "she" "he" "e"





0 0
原创粉丝点击