
来源:互联网 发布:淘宝触摸屏版 编辑:程序博客网 时间:2024/06/04 00:31

1. C Regex

1.1. 前言

GNU C Library支持两种正则表达式的匹配接口,其一为标准的POSIX.2接口,其二为GNU C Library多年来一直提供的自有接口。两种接口都声明在regex.h中,如果#define _POSIX_C_SOURCE,则只声明POSIX.2接口的函数、结构和常量。

1.2. 编译正则表达式

int regcomp (regex_t *compiled, const char *restrict pattern, int cflags)
compiled : 为前面定义的regex_t变量地址,用于保存编译结果
pattern : 为自定义的正则表达式
cflags : 指定正则表达式语法及语义的选项,其中选项主要如下:
REG_NOSUB : 使用此标志,regcomp将不保存子表达式的相关信息,如果不使用此标志,则compiled->re_nsub将记录子表达式数目。
REG_EXTENDED : 将pattern作为扩展的正则表达式,而不是basic正则表达式。
REG_ICASE : 匹配过程中忽略大小写

1.3. 匹配正则表达式

int regexec (const regex_t *compiled, const char *string, size_t nmatch, regmatch_t matchptr[restrict], int eflags)
compiled :为编译后的regex_t变量
string :为待匹配的字符串
nmatch :matchptr数组的元素个数
matchptr :存储匹配结果信息的regmatch_t数组
eflags :位模式,主要选项如下:
REG_NOTBOL : Do not regard the beginning of the specified string as the beginning of a line; more generally, don't make any assumptions about what text might precede it.
REG_NOTEOL : Do not regard the end of the specified string as the end of a line; more generally, don't make any assumptions about what text might follow it.
matchptr数组的元素类型为regmatch_t,其成员如下:
rm_so :匹配成功的子串在原字符串中的起始偏移
rm_eo :匹配成功的子串在原字符串中的结束偏移(即子串最后一个字符之后的位置)
在传递的regmatch_t数组中,index 0保存匹配整个正则表达式的字符串信息,index i(i>0)则依次保存第i个子表达式的匹配信息。如果你不关心这些信息,可以设置nmatch为0,或者在调用regcomp时设置REG_NOSUB标志。

1.4. 释放

void regfree (regex_t *compiled)
size_t regerror (int errcode, const regex_t *compiled, char *restrict buffer, size_t length)
/*
 * Return the text describing a regcomp()/regexec() error code as a freshly
 * allocated string.
 *
 * errcode  - the error code to describe
 * compiled - the regex_t associated with that error code
 *
 * The buffer comes from xmalloc(); xmalloc is not defined in this snippet
 * (presumably it aborts on allocation failure -- TODO confirm), and the
 * caller owns the returned buffer.
 */
char *get_regerror (int errcode, regex_t *compiled)
{
  /* First pass: an empty buffer makes regerror() report the size it needs. */
  size_t needed = regerror (errcode, compiled, NULL, 0);
  char *message = xmalloc (needed);

  /* Second pass: actually write the message into the sized buffer. */
  (void) regerror (errcode, compiled, message, needed);

  return message;
}

1.5. 示例

#include <sys/types.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Build the human-readable message for a regcomp()/regexec() error code.
 *
 * errorcode - status returned by regcomp() or regexec()
 * compiled  - the regex_t that was passed to the failing call
 *
 * Returns a malloc()ed, NUL-terminated string that the caller must free(),
 * or NULL when the allocation fails.
 */
char* get_regerror(int errorcode, regex_t *compiled)
{
    /* With a zero-length buffer regerror() only reports the size it needs. */
    size_t length = regerror(errorcode, compiled, NULL, 0);
    char *buffer = (char*)malloc(length);

    if (buffer != NULL) {
        regerror(errorcode, compiled, buffer, length);
    }
    return buffer;
}

/* Print `prefix` followed by the regex error text, then release the text. */
static void report_regerror(const char *prefix, int errorcode, regex_t *compiled)
{
    char *message = get_regerror(errorcode, compiled);

    printf("%s%s\n", prefix, message != NULL ? message : "(out of memory)");
    free(message);
}

/*
 * Print every run of digits found in a sample string, one per line, and
 * finish with the library's "no match" message once the input is exhausted.
 */
int main(int argc, char **argv)
{
    regex_t regex;
    const size_t nmatch = 1;
    regmatch_t pmatch[1];
    const char pattern[] = "[[:digit:]]+";
    const char *buf = "12abc45";
    int eflags = 0;
    int status;

    (void)argc;
    (void)argv;

    status = regcomp(&regex, pattern, REG_EXTENDED);
    if (0 != status) {
        report_regerror("regcomp failed:", status, &regex);
        /* No regfree() here: after a failed regcomp() the contents of
           `regex` are undefined, so there is nothing valid to release. */
        exit(1);
    }

    for (;;) {
        size_t length;

        status = regexec(&regex, buf, nmatch, pmatch, eflags);
        if (REG_NOMATCH == status) {
            report_regerror("", status, &regex);
            break;
        }
        if (0 != status) {
            report_regerror("regexec failed :", status, &regex);
            break;
        }

        /* pmatch[0] describes the whole match as offsets relative to buf. */
        length = (size_t)(pmatch[0].rm_eo - pmatch[0].rm_so);
        printf("Match:%.*s\n", (int)length, buf + pmatch[0].rm_so);

        /* Continue searching right after the match just reported. */
        buf += pmatch[0].rm_eo;
        if (0 == length) {
            /* An empty match does not consume input; step over one character
               (or stop at the end) so the loop always makes progress. */
            if ('\0' == *buf) {
                break;
            }
            ++buf;
        }

        /* From now on buf no longer points at the start of the line. */
        eflags = REG_NOTBOL;
    }

    regfree(&regex);
    return 0;
}

2. boost Regex

2.1. 前言

The algorithms regex_search and regex_match make use of match_results to report what matched; the difference between these algorithms is that regex_match will only find matches that consume all of the input text, whereas regex_search will search for a match anywhere within the text being matched.

2.2. basic_regex

template <class charT, class traits = regex_traits<charT> > class basic_regex;
typedef basic_regex<char> regex;
typedef basic_regex<wchar_t> wregex;

bool empty() const;

size_type mark_count() const;

flag_type flags() const;

int status() const;

2.3. match_results


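template <class BidirectionalIterator, class Allocator = std::allocator<sub_match<BidirectionalIterator> > > class match_results;
typedef match_results<const char*> cmatch;
typedef match_results<const wchar_t*> wcmatch;
typedef match_results<string::const_iterator> smatch;
typedef match_results<wstring::const_iterator> wsmatch;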

size_type size() const;

bool empty()const;
Returns size() == 0.

string_type str(int sub = 0) const;
string_type str(const char_type* sub) const;

const_reference operator[](int n) const

const_reference prefix()const;
Effects: Returns a reference to the sub_match object representing the character sequence from the start of the string being matched or searched, to the start of the match found.

const_reference suffix()const;
Effects: Returns a reference to the sub_match object representing the character sequence from the end of the match found to the end of the string being matched or searched.

const_iterator begin()const;
Effects: Returns a starting iterator that enumerates over all the marked sub-expression matches stored in *this.

const_iterator end()const;
Effects: Returns a terminating iterator that enumerates over all the marked sub-expression matches stored in *this.

2.4. sub_match

When the marked sub-expression denoted by an object of type sub_match participated in a regular expression match then member matched evaluates to true, and members first and second denote the range of characters [first,second) which formed that match. Otherwise matched is false, and members first and second contained undefined values.
If an object of type sub_match represents sub-expression 0 - that is to say the whole match - then member matched is always true, unless a partial match was obtained as a result of the flag match_partial being passed to a regular expression algorithm, in which case member matched is false, and members first and second represent the character range that formed the partial match.

typedef BidirectionalIterator iterator;
The iterator type.
iterator first
An iterator denoting the position of the start of the match.
iterator second
An iterator denoting the position of the end of the match.
bool matched
A Boolean value denoting whether this sub-expression participated in the match.

static difference_type length();
Effects: returns the length of this matched sub-expression, or 0 if this sub-expression was not matched: (matched ? distance(first, second) : 0).

basic_string<value_type> str()const;
Effects: returns a string representation of *this: (matched ? basic_string<value_type>(first, second) : basic_string<value_type>()).

2.5. regex_match

Note that the result is true only if the expression matches the whole of the input sequence. If you want to search for an expression somewhere within the sequence then use regex_search.

template <class BidirectionalIterator, class charT, class traits>
bool regex_match(BidirectionalIterator first, BidirectionalIterator last, const basic_regex<charT, traits>& e, match_flag_type flags = match_default);

template <class charT, class Allocator, class traits>
bool regex_match(const charT* str, match_results<const charT*, Allocator>& m, const basic_regex<charT, traits>& e, match_flag_type flags = match_default);

template <class charT, class traits>
bool regex_match(const charT* str, const basic_regex<charT, traits>& e, match_flag_type flags = match_default);

2.6. regex_search

template <class charT, class Allocator, class traits>
bool regex_search(const charT* str, match_results<const charT*, Allocator>& m, const basic_regex<charT, traits>& e, match_flag_type flags = match_default);

template <class BidirectionalIterator, class charT, class traits>
bool regex_search(BidirectionalIterator first, BidirectionalIterator last, const basic_regex<charT, traits>& e, match_flag_type flags = match_default);

template <class charT, class traits>
bool regex_search(const charT* str, const basic_regex<charT, traits>& e, match_flag_type flags = match_default);

2.7. regex_replace

The algorithm regex_replace searches through a string finding all the matches to the regular expression: for each match it then calls match_results<>::format to format the string and sends the result to the output iterator. Sections of text that do not match are copied to the output unchanged only if the flags parameter does not have the flag format_no_copy set. If the flag format_first_only is set then only the first occurrence is replaced rather than all occurrences.
template <class traits, class charT, class Formatter>
basic_string<charT> regex_replace(const basic_string<charT>& s, const basic_regex<charT, traits>& e, Formatter fmt, match_flag_type flags = match_default);

2.8. regex_iterator

The iterator type regex_iterator will enumerate all of the regular expression matches found in some sequence.
typedef regex_iterator<const char*> cregex_iterator;
typedef regex_iterator<std::string::const_iterator> sregex_iterator;
#ifndef BOOST_NO_WREGEX
typedef regex_iterator<const wchar_t*> wcregex_iterator;
typedef regex_iterator<std::wstring::const_iterator> wsregex_iterator;
#endif

2.9. regex_token_iterator

When class regex_token_iterator is used to enumerate a single sub-expression with index -1, then the iterator performs field splitting: that is to say it enumerates one character sequence for each section of the character container sequence that does not match the regular expression specified.
typedef regex_token_iterator<const char*> cregex_token_iterator;
typedef regex_token_iterator<std::string::const_iterator> sregex_token_iterator;
#ifndef BOOST_NO_WREGEX
typedef regex_token_iterator<const wchar_t*> wcregex_token_iterator;
typedef regex_token_iterator<std::wstring::const_iterator> wsregex_token_iterator;
#endif

2.10. 示例

//g++ demon_regex_boost.cc -o demon_regex_boost -lboost_regex#include <iostream>#include <string>#include <boost/regex.hpp>using namespace std;class regex_callback{public:template<typename T>void operator()(const T& what){std::cout<<what[0].str()<<std::endl;}};bool regex_callback2(const boost::cmatch& what){std::cout<<what[0].str()<<std::endl;}int main(int argc , char **argv){const char *text = " abc 10.5.1 ";    const char pattern[] = "(\\d)+\\.(\\d)+\\.(\\d)+\\.(\\d)+";        {    //字符串匹配    cout<<"[1]:"<<endl;    boost::regex reg(pattern);    bool ret = boost::regex_match("" , reg);    if(ret){    cout<<"match"<<endl;    }else{    cout<<"no match"<<endl;    }    }        {    //提取字串    cout<<"[2]:"<<endl;    boost::smatch sm;    boost::regex reg(pattern);    string text_str(text);    string::const_iterator start = text_str.begin();    string::const_iterator end = text_str.end();    while(boost::regex_search(start , end , sm , reg)){    cout<<sm[0]<<endl;    start = sm[0].second;    }    }        {    //替换    cout<<"[3]:"<<endl;    boost::regex reg(pattern);    string s = boost::regex_replace(string(text) , reg , "ftp://$2$5");    cout<<"ftp site:"<<s<<endl;    }        {    //使用迭代器找出所有数字    cout<<"[4]:"<<endl;    boost::regex reg("\\d+");    boost::cregex_iterator it_begin = make_regex_iterator(text , reg);    boost::cregex_iterator it_end;    //for(boost::cregex_iterator it = it_begin ; it != it_end ; ++it){    //cout<<it->str()<<endl;    //}    //for_each(it_begin,it_end,regex_callback());    for_each(it_begin,it_end,®ex_callback2);    }        {    cout<<"[5]:"<<endl;boost::regex ip_regex(pattern);string text1(text);boost::sregex_iterator it(text1.begin(), text1.end(), ip_regex);boost::sregex_iterator end;for (; it != end; ++it) {    std::cout << it->str() << std::endl;}    }}

3. Reference

