c++使用libpcre捕获多行数据

来源:互联网 发布:如何招聘php程序员 编辑:程序博客网 时间:2024/06/08 06:09

由于初次尝试用 C++ 写一个简单的爬虫系统,不免要用到正则表达式。正则有多行匹配等特性,以前一直用 PHP 开发,preg_match 加上模式修饰符(m 表示多行,i 表示忽略大小写,s 表示让 . 匹配包括换行符在内的任意字符)非常方便,所以在改用 C++ 的时候遇到了困惑。

实际上,用 pcre 取出全部匹配结果需要多次调用 pcre_exec:
先匹配到一个结果,然后把起始偏移量移到该结果的末尾,再匹配下一个,如此反复,直到最后一个。

#include "pcre.h"
#include <stdio.h>
#include <string.h>
#include <regex.h>
#include <iostream>
#include <fstream>

// Number of ints in the pcre_exec() output vector. PCRE expects a multiple of
// three: the first two thirds receive (start, end) offset pairs, the last
// third is scratch space for the matcher. 255 = 3 * 85 -> up to 85 pairs.
static const int OVECCOUNT = 3 * 85;

int tpcre();
int tmysql();

// Program entry point: runs the PCRE demo and returns its status as the
// process exit code (0 on success, 1 on failure).
int main(int argc, char ** argv)
{
    (void)argc;
    (void)argv;
    return tpcre();
}

// Placeholder; does nothing and always returns 0.
int tmysql()
{
    return 0;
}

// Finds every occurrence of the item-URL pattern in a sample HTML fragment.
//
// pcre_exec() reports one match per call, so "global" matching is done by
// calling it repeatedly and moving the start offset to the end of the previous
// match until PCRE_ERROR_NOMATCH is returned.
//
// Prints the subject, every match with all of its groups, and the final match
// count to stdout.
//
// Returns 0 on success, 1 if the pattern fails to compile or pcre_exec()
// reports an error other than "no match".
int tpcre()
{
    const char pText[] = "\"21,537511285427,50005701,,shopsearch,1,shopcon,2950270077,,\"                               href=\"//detail.tmall.com/item.htm?id=537511285427&rn=54eb0efc1a7a49f5b93ed2051aa4fe9c&abbucket=0\" target=\"_blank\"  data-gold-url=\"/inshopse\"    href=\"//detail.tmall.com/item.htm?id=537511285421&rn=54eb0efc1a7a49f5b93ed2051aa4fe9c&abbucket=0\"href=\"//detail.tmall.com/item.htm?id=537511285422&rn=54eb0efc1a7a49f5b93ed2051aa4fe9c&abbucket=0\"href=\"//detail.tmall.com/item.htm?id=537511285423&rn=54eb0efc1a7a49f5b93ed2051aa4fe9c&abbucket=0\"href=\"//detail.tmall.com/item.htm?id=537511285424&rn=54eb0efc1a7a49f5b93ed2051aa4fe9c&abbucket=0\" ";

    /*
    // Alternative: load the subject from a file instead of the literal above.
    std::string filename = "/Users/kang/Library/Developer/Xcode/DerivedData/TmailSpider-batmyukengwdwjcsejwqossttbhu/Build/Products/Debug/2.txt";
    std::ifstream fp(filename.c_str());
    std::string html;
    char buf[256];
    while (fp.read(buf, sizeof(buf)) || fp.gcount() > 0) {
        html.append(buf, fp.gcount());   // append only the bytes actually read
    }
    const char *pText = html.c_str();
    */

    std::cout << pText;

    // Other patterns tried while experimenting:
    //const char * pPattern = "(\\d+)\\w+";
    //const char *pPattern="href=\\\"(//detail.tmall.com/item.htm[^\\]+)\\";
    //const char * pPattern = "(//detail.tmall.com/item.htm[^\\\\]+)";
    const char * pPattern = "(//detail.tmall.com/item.htm\\?id=\\d+)";

    const char * pErrMsg = NULL;
    int nOffset = -1;

    // Other options that may be useful: PCRE_UTF8, PCRE_NO_AUTO_CAPTURE.
    pcre * pPcre = pcre_compile(pPattern,
                                PCRE_DOTALL | PCRE_CASELESS | PCRE_MULTILINE,
                                &pErrMsg, &nOffset, NULL);
    if (pPcre == NULL) {
        // Report where and why compilation failed instead of a generic message.
        fprintf(stderr, "pcre compile error at offset %d: %s\n",
                nOffset, pErrMsg != NULL ? pErrMsg : "unknown error");
        return 1;
    }

    // The subject never changes, so measure it once rather than per iteration.
    const int subjectLen = static_cast<int>(strlen(pText));

    int ovector[OVECCOUNT];
    int exec_offset = 0;
    int count = 0;
    int status = 0;

    // A start offset equal to the subject length is still valid (it lets an
    // empty match at the very end be found); anything larger is an error.
    while (exec_offset <= subjectLen) {
        const int rc = pcre_exec(pPcre, NULL, pText, subjectLen,
                                 exec_offset, 0, ovector, OVECCOUNT);
        if (rc == PCRE_ERROR_NOMATCH) {
            break;  // normal termination: no further matches
        }
        if (rc < 0) {
            fprintf(stderr, "pcre_exec failed with error code %d\n", rc);
            status = 1;
            break;
        }

        // rc == 0 means the match succeeded but ovector was too small for all
        // groups; the pairs that did fit (OVECCOUNT / 3 of them) are valid.
        const int groups = (rc == 0) ? OVECCOUNT / 3 : rc;

        ++count;
        printf("\nOK, has matched ...\n\n");
        for (int i = 0; i < groups; i++) {
            const int begin = ovector[2 * i];
            const int end = ovector[2 * i + 1];
            if (begin < 0) {
                // Group did not participate in the match: print it as empty
                // rather than forming a pointer before the subject.
                printf("match:$%d=\n", i);
            } else {
                // %.*s prints exactly the matched slice; no scratch copy needed.
                printf("match:$%d=%.*s\n", i, end - begin, pText + begin);
            }
        }

        // Resume after this match. A zero-length match would leave the offset
        // unchanged and loop forever, so step one byte past it.
        exec_offset = (ovector[1] > ovector[0]) ? ovector[1] : ovector[1] + 1;
    }

    std::cout << "count=" << count << std::endl;
    pcre_free(pPcre);
    return status;
}
0 0
原创粉丝点击