c++爬虫大众点评数据(未完成)

来源:互联网 发布:淘宝的雷锋侠 编辑:程序博客网 时间:2024/04/30 09:06


#include <curl/curl.h>

#include <iostream>

#include <stdio.h>

#include <string.h>


#include <pcre.h>


#define OVECCOUNT 30/* should be a multiple of 3 */

#define EBUFLEN 128

#define BUFLEN 10240


// Bring the standard library names (string, cout, endl, ...) into scope.
using namespace std;



// libcurl write callback: appends one chunk of received data to a std::string.
//
// buffer: pointer to the received bytes (not NUL-terminated).
// size:   size of one element in bytes.
// nmemb:  number of elements in this chunk.
// str:    user pointer registered via CURLOPT_WRITEDATA; must be a std::string*.
//
// Returns the number of bytes consumed (size * nmemb). libcurl treats any
// other value as a write error and aborts the transfer, which is what the
// (size_t)-1 returned for a null argument triggers.
size_t onWriteData(void *buffer, size_t size, size_t nmemb, void *str) {
    if (!str || !buffer) {
        return static_cast<size_t>(-1);
    }

    const size_t total = size * nmemb;
    std::string *result = static_cast<std::string *>(str);
    result->append(static_cast<const char *>(buffer), total);

    // Report bytes handled, not the element count.
    return total;
}


//获取页面

int getWeb(string url,string &result)

{

    long code =0;

    string htmlpage;

    CURL *curl =curl_easy_init();

    curl_easy_setopt(curl,CURLOPT_URL,url.c_str());//设置url

    curl_easy_setopt(curl,CURLOPT_POST,0);//设置请求方法

    curl_easy_setopt(curl,CURLOPT_USERAGENT,"Mozilla/5.");//伪装客户端

    curl_easy_setopt(curl,CURLOPT_WRITEDATA,&htmlpage);//设置接受返回结果字符串

    curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,onWriteData);//设置处理方法

    curl_easy_perform(curl);//请求

    curl_easy_getinfo(curl,CURLINFO_RESPONSE_CODE,&code);

    if(code==200)

    {

        cout<<"request success"<<endl;

        result = htmlpage;

        //cout<<htmlpage<<endl;

        

    }

    curl_easy_cleanup(curl);

    return code;

}




int main(int argc,char **argv)

{

    

    

    pcre *re;

    constchar *error;

    int  erroffset;

    int  ovector[OVECCOUNT];

    int  rc, i;

    string url ="http://www.dianping.com/search/category/212/10/g103";

 

    string html;

    getWeb(url,html);

    

    //char src[] =   " ";

    //char pattern[] = "(<a>.+?</a>)";

    

    constchar *src = html.c_str();

    char pattern[] ="(<li class=\"\"[\\s\\S]*?</li>)";

    

    printf("String : %s\n", src);

    printf("Pattern: \"%s\"\n", pattern);

    

    

    re = pcre_compile(pattern,0, &error, &erroffset,NULL);

    if (re ==NULL) {

        printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);

        return1;

    }

    

    char *p = (char*)src;

    while ( ( rc =pcre_exec(re,NULL, p,strlen(p),0,0, ovector,OVECCOUNT)) !=PCRE_ERROR_NOMATCH )

    {

        printf("\nOK, %d matched ...\n\n",rc);

        

        for (i =0; i < rc-1; i++)

        {

            char *substring_start = p + ovector[2*i];

            int substring_length = ovector[2*i+1] - ovector[2*i];

            char matched[10240];

            memset( matched,0,10240);

            strncpy( matched, substring_start, substring_length );

            

            printf("match:%s\n", matched );

        }

        

        p += ovector[1];

        if ( !p )

        {

            break;

        }

    }

    pcre_free(re);

    

    return0;  

}

0 0
原创粉丝点击