较简单的正则匹配引擎实现

来源:互联网 发布:阿里云 vps 编辑:程序博客网 时间:2024/05/18 14:46


这是较为简单的正则表达式引擎
目前只支持. + | 三种特殊字符
发现bug 请联系
已有封装好的函数,请直接使用
未来将逐渐更新

例子如图
// regulatr.cpp : 定义控制台应用程序的入口点。
//

/*______________________________________________________
                 regular expression engine
         written by 唐hz
            2015.8.10
_______________________________________________________*/
#include "stdafx.h"
#include<iostream>
#include<vector>
using namespace std;
struct State;
#define error -1//此处不确定
#define  normal 1
#define finish 16
#define mul_way 2
#define start 8
#define any 32
#define error_back 64
#define self_loop1 4
struct Linker{
State *from;
State *to;
char request;
};
struct LinkerList{
 Linker l;
 struct LinkerList *next;
};
struct State{
int status;
int position;
LinkerList * in;
LinkerList * out;
};
struct Position{
State* p;
int i;
};

State g_State[100];
int g_next[100];
int g_start,g_finish,g_maxi=-1;
int g_can_not_any=-1;

void next(char *p){
int n=strlen(p);
int i=1,j=0;
g_next[j]=-1;
while(i<n){
if(p[i]==p[j])
 g_next[i++]=j++;
else {
 if(j!=0)
 j=g_next[j-1]+1;
 else{
 j=0;
 g_next[i++]=-1;
 }
}

}
}

void addlinker(LinkerList **a,char b,State *c,State *d){
 if(*a==NULL){
  (*a)=new LinkerList;
  (*a)->l.request=b;
  (*a)->l.from=c;
  (*a)->l.to=d;
  (*a)->next=NULL;
  return ;
 }
 LinkerList * temp=*a;
 while(temp->next!=NULL){
  temp=temp->next;
 }
 temp->next=new LinkerList;
 temp=temp->next;
 temp->l.request=b;
 temp->l.from=c;
 temp->l.to=d;
 temp->next=NULL;
 return ;
}
void addlinker2(LinkerList **a,char b,State *c,State *d){
 if(*a==NULL){
  (*a)=new LinkerList;
  (*a)->l.request=b;
  (*a)->l.from=c;
  (*a)->l.to=d;
  (*a)->next=NULL;
  return ;
 }
 LinkerList * temp=*a,*temp1;
 while(temp->next&&temp->next->next!=NULL){
  temp=temp->next;
 }
 temp1=temp->next;
 temp->next=new LinkerList;
 temp=temp->next;
 temp->l.request=b;
 temp->l.from=c;
 temp->l.to=d;
 temp->next=temp1;
 return ;
}
void addlinker1(LinkerList **a,char b,State *c,State *d){
 if(*a==NULL){
  (*a)=new LinkerList;
  (*a)->l.request=b;
  (*a)->l.from=c;
  (*a)->l.to=d;
  (*a)->next=NULL;
  return ;
 }
 LinkerList * temp;
    temp=new LinkerList;
 temp->l.request=b;
 temp->l.from=c;
 temp->l.to=d;
 temp->next=*a;
 *a=temp;
 return ;
}
void connect(State &a,State &b,char r,int x=0){
 if(x==0){
 addlinker(&(a.out),r,&a,&b);
 addlinker(&(b.in),r,&a,&b);}
 else if(x==1){
 addlinker1(&(a.out),r,&a,&b);
 addlinker1(&(b.in),r,&a,&b);
 }
 else if(x==2){
 addlinker2(&(a.out),r,&a,&b);
 addlinker2(&(b.in),r,&a,&b);
 }
}

void compile(char *p,char*q){
int i=0,j=0;
while(i<(int)strlen(q)){
 if(q[i]=='|'){
 i+=2;
 }
 else if(q[i]=='+'){
 i++;
 }
 else p[j++]=q[i++];
}
p[j]='\0';
}

void construction(char *p){
 char q[100];
 compile(q,p);
 next(q);
 int i=0;
 for(int j=0;j<100;j++){
  g_State[j].in=g_State[j].out=NULL;
  g_State[j].status=0;//
  g_State[j].position=j;
 }
 g_State[0].status|=start;
 connect(g_State[0],g_State[0],error,0);
 while(*p!='\n')
 { 
  switch(*p){
  case '|':
  if(i>0) i--;
  p++;
  g_State[i].status|=mul_way;
  break;
  case '+':
  i--;
  p--;
        connect(g_State[i],g_State[i],*p,2);
  g_State[i].status|=self_loop1;
  i++;
  p+=2;
  break;
  default:
  if(!(g_State[i].status&start)&&!(g_State[i].status&error_back)){
   g_State[i].status|=error_back;
   connect(g_State[i],g_State[g_next[i-1]+1],error,0);}
  if(*p=='.'){
   g_State[i].status|=any;
  }
  connect(g_State[i],g_State[i+1],*p,1);
  g_State[i].status|=normal;
  i++;
  p++;}
 }
 g_State[i].status|=finish;

}//查int 字节
void realease(){
 LinkerList *temp,*temp1;
for(int i=0;i<100;i++)
{
 temp=g_State[i].in;
 while(temp){
 temp1=temp;
 temp=temp->next;
 delete temp1;
 }
 temp=g_State[i].out;
 while(temp){
 temp1=temp;
 temp=temp->next;
 delete temp1;
 }
}
}

bool NextState(Position a , char *p,vector<Position> & find,int i){
 auto temp=a.p->out;
 Position rec;
 bool j=false;
 while(temp){
  if(temp->l.request!=error&&temp->l.request==*(p+i)||(temp->l.request=='.'&&a.p->status&any&&a.p->position!=g_can_not_any)){
   j=true;
   if(temp->l.to->position==a.p->position&&temp->l.request!='.'&&a.p->status&any){
   g_can_not_any=a.p->position;
   }
   rec.p=((temp->l).to);
   rec.i=i+1;
   if(rec.i<(int)strlen(p))
   find.push_back(rec);
  }
  else
  {
   if((temp->l.request==error)&&(!(a.p->status&finish))&&!j){
   if(((temp->l).to)->position<g_can_not_any){
    cout<<"使g_can_not_any回复初值\n";
    g_can_not_any=-1;}
   rec.p=((temp->l).to);
   rec.i=i;
   if(!(rec.p->status&start))
   find.push_back(rec);
  }
  }
  temp=temp->next;
 }
 if(j) return true;

 return false;

}
bool isexist( char *p){
vector<Position> find;
Position temp;
temp.p=g_State;
temp.i=0;
find.push_back(temp);
g_start=1;
g_finish=0;
g_maxi=-1;
bool canfinish=false;
while(!find.empty()){
 if(temp.p->status&finish) {g_finish=temp.i;
 break;
 }
 if(temp.i>g_maxi)
  g_maxi=temp.i;
 find.pop_back();
 NextState(temp,p,find,temp.i);
 if(!find.empty())
 temp=find.back();
 else if(g_maxi<(int)strlen(p)){
 temp.i=g_maxi+1;
 temp.p=g_State;
 g_start=temp.i+1;//人是从1数而机器从0数,故加1
 find.push_back(temp);
 }
}
if(g_finish>=g_start) return true;
return false;
}
void output(char *p,int a,int b){
cout<<"\n位于"<<a<<"和"<<b<<"之间\n";
for(int i=a-1;i<b;i++)
 cout<<p[i];
cout<<"\n";
}
void findWholeMatch(char *p){
 char *str=p;
 int rec=0;
 while(isexist(p)&&rec<(int)strlen(str)){
 output(str,g_start+rec,g_finish+rec);
 rec+=g_finish;
 p+=g_finish;
 }

}
int main(){
char expression[100];
char content[1000];
fgets(expression,100,stdin);
fgets(content,1000,stdin);
construction(expression);
cout<<"构建完成 \n";
/*
if(isexist(content)){
 cout<<"存在\n";
 cout<<"开始在"<<g_start<<"结束在"<<g_finish<<"\n";
 output(content,g_start,g_finish);
}
else  cout<<"不存在\n";
*/

findWholeMatch(content);
system("pause");
realease();

}// regulatr.cpp : 定义控制台应用程序的入口点。
//

/*______________________________________________________
regular expression engine
written by 唐hz
2015.8.10
_______________________________________________________*/
#include "stdafx.h"
#include<iostream>
#include<vector>
using namespace std;
struct State;
#define error -1//此处不确定
#define normal 1
#define finish 16
#define mul_way 2
#define start 8
#define any 32
#define error_back 64
#define self_loop1 4
struct Linker{
State *from;
State *to;
char request;
};
struct LinkerList{
Linker l;
struct LinkerList *next;
};
struct State{
int status;
int position;
LinkerList * in;
LinkerList * out;
};
struct Position{
State* p;
int i;
};

State g_State[100];
int g_next[100];
int g_start,g_finish,g_maxi=-1;
int g_can_not_any=-1;

void next(char *p){
int n=strlen(p);
int i=1,j=0;
g_next[j]=-1;
while(i<n){
if(p[i]==p[j])
g_next[i++]=j++;
else {
if(j!=0)
j=g_next[j-1]+1;
else{
j=0;
g_next[i++]=-1;
}
}

}
}

void addlinker(LinkerList **a,char b,State *c,State *d){
if(*a==NULL){
(*a)=new LinkerList;
(*a)->l.request=b;
(*a)->l.from=c;
(*a)->l.to=d;
(*a)->next=NULL;
return ;
}
LinkerList * temp=*a;
while(temp->next!=NULL){
temp=temp->next;
}
temp->next=new LinkerList;
temp=temp->next;
temp->l.request=b;
temp->l.from=c;
temp->l.to=d;
temp->next=NULL;
return ;
}
void addlinker2(LinkerList **a,char b,State *c,State *d){
if(*a==NULL){
(*a)=new LinkerList;
(*a)->l.request=b;
(*a)->l.from=c;
(*a)->l.to=d;
(*a)->next=NULL;
return ;
}
LinkerList * temp=*a,*temp1;
while(temp->next&&temp->next->next!=NULL){
temp=temp->next;
}
temp1=temp->next;
temp->next=new LinkerList;
temp=temp->next;
temp->l.request=b;
temp->l.from=c;
temp->l.to=d;
temp->next=temp1;
return ;
}
void addlinker1(LinkerList **a,char b,State *c,State *d){
if(*a==NULL){
(*a)=new LinkerList;
(*a)->l.request=b;
(*a)->l.from=c;
(*a)->l.to=d;
(*a)->next=NULL;
return ;
}
LinkerList * temp;
temp=new LinkerList;
temp->l.request=b;
temp->l.from=c;
temp->l.to=d;
temp->next=*a;
*a=temp;
return ;
}
void connect(State &a,State &b,char r,int x=0){
if(x==0){
addlinker(&(a.out),r,&a,&b);
addlinker(&(b.in),r,&a,&b);}
else if(x==1){
addlinker1(&(a.out),r,&a,&b);
addlinker1(&(b.in),r,&a,&b);
}
else if(x==2){
addlinker2(&(a.out),r,&a,&b);
addlinker2(&(b.in),r,&a,&b);
}
}

void compile(char *p,char*q){
int i=0,j=0;
while(i<(int)strlen(q)){
if(q[i]=='|'){
i+=2;
}
else if(q[i]=='+'){
i++;
}
else p[j++]=q[i++];
}
p[j]='\0';
}

void construction(char *p){
char q[100];
compile(q,p);
next(q);
int i=0;
for(int j=0;j<100;j++){
g_State[j].in=g_State[j].out=NULL;
g_State[j].status=0;//
g_State[j].position=j;
}
g_State[0].status|=start;
connect(g_State[0],g_State[0],error,0);
while(*p!='\n')
{
switch(*p){
case '|':
if(i>0) i--;
p++;
g_State[i].status|=mul_way;
break;
case '+':
i--;
p--;
connect(g_State[i],g_State[i],*p,2);
g_State[i].status|=self_loop1;
i++;
p+=2;
break;
default:
if(!(g_State[i].status&start)&&!(g_State[i].status&error_back)){
g_State[i].status|=error_back;
connect(g_State[i],g_State[g_next[i-1]+1],error,0);}
if(*p=='.'){
g_State[i].status|=any;
}
connect(g_State[i],g_State[i+1],*p,1);
g_State[i].status|=normal;
i++;
p++;}
}
g_State[i].status|=finish;

}//查int 字节
void realease(){
LinkerList *temp,*temp1;
for(int i=0;i<100;i++)
{
temp=g_State[i].in;
while(temp){
temp1=temp;
temp=temp->next;
delete temp1;
}
temp=g_State[i].out;
while(temp){
temp1=temp;
temp=temp->next;
delete temp1;
}
}
}

bool NextState(Position a , char *p,vector<Position> & find,int i){
auto temp=a.p->out;
Position rec;
bool j=false;
while(temp){
if(temp->l.request!=error&&temp->l.request==*(p+i)||(temp->l.request=='.'&&a.p->status&any&&a.p->position!=g_can_not_any)){
j=true;
if(temp->l.to->position==a.p->position&&temp->l.request!='.'&&a.p->status&any){
g_can_not_any=a.p->position;
}
rec.p=((temp->l).to);
rec.i=i+1;
if(rec.i<(int)strlen(p))
find.push_back(rec);
}
else
{
if((temp->l.request==error)&&(!(a.p->status&finish))&&!j){
if(((temp->l).to)->position<g_can_not_any){
cout<<"使g_can_not_any回复初值\n";
g_can_not_any=-1;}
rec.p=((temp->l).to);
rec.i=i;
if(!(rec.p->status&start))
find.push_back(rec);
}
}
temp=temp->next;
}
if(j) return true;

return false;

}
bool isexist( char *p){
vector<Position> find;
Position temp;
temp.p=g_State;
temp.i=0;
find.push_back(temp);
g_start=1;
g_finish=0;
g_maxi=-1;
bool canfinish=false;
while(!find.empty()){
if(temp.p->status&finish) {g_finish=temp.i;
break;
}
if(temp.i>g_maxi)
g_maxi=temp.i;
find.pop_back();
NextState(temp,p,find,temp.i);
if(!find.empty())
temp=find.back();
else if(g_maxi<(int)strlen(p)){
temp.i=g_maxi+1;
temp.p=g_State;
g_start=temp.i+1;//人是从1数而机器从0数,故加1
find.push_back(temp);
}
}
if(g_finish>=g_start) return true;
return false;
}
void output(char *p,int a,int b){
cout<<"\n位于"<<a<<"和"<<b<<"之间\n";
for(int i=a-1;i<b;i++)
cout<<p[i];
cout<<"\n";
}
void findWholeMatch(char *p){
char *str=p;
int rec=0;
while(isexist(p)&&rec<(int)strlen(str)){
output(str,g_start+rec,g_finish+rec);
rec+=g_finish;
p+=g_finish;
}

}
int main(){
char expression[100];
char content[1000];
fgets(expression,100,stdin);
fgets(content,1000,stdin);
construction(expression);
cout<<"构建完成 \n";
/*
if(isexist(content)){
cout<<"存在\n";
cout<<"开始在"<<g_start<<"结束在"<<g_finish<<"\n";
output(content,g_start,g_finish);
}
else cout<<"不存在\n";
*/

findWholeMatch(content);
system("pause");
realease();

}

采用了NFA,未来将逐步更新。

0 0