curl抓取

来源:互联网 发布:怎么区分淘宝二手镜头 编辑:程序博客网 时间:2024/05/17 23:30
<?php
/**
 * Fetches remote pages with cURL, either one at a time or concurrently.
 */
class Reptile{

    /**
     * Fetch a single URL and return its response body.
     *
     * Gzip is negotiated and decoded by cURL itself, so the body is returned
     * intact whether or not the server actually compressed it. The body is
     * returned as raw bytes; no charset conversion is applied here.
     *
     * For pages that require HTTP authentication, CURLOPT_HTTPAUTH and
     * CURLOPT_USERPWD would additionally have to be set on the handle.
     *
     * @param string   $url   Address to fetch.
     * @param int|bool $proxy Pass 1 to route the request through the local
     *                        proxy at 127.0.0.1:9090.
     * @return string|false Response body, or false when the transfer failed.
     */
    public function getContent($url, $proxy = 0){
        $ch = $this->createHandle($url, $proxy);
        $contents = curl_exec($ch);
        curl_close($ch);

        return $contents;
    }

    /**
     * Fetch several URLs concurrently and return their response bodies.
     *
     * Each body is converted to UTF-8, with the source charset detected among
     * UTF-8, GBK, GB2312 and BIG5. A transfer that fails yields an empty
     * string for its key.
     *
     * @param array    $urlArr Addresses to fetch; the keys are preserved in the result.
     * @param int|bool $proxy  Pass 1 to route the requests through the local
     *                         proxy at 127.0.0.1:9090.
     * @return array Response bodies keyed like $urlArr.
     */
    public function getContentByMulti($urlArr, $proxy = 0){
        $mh = curl_multi_init();
        $conn = array();
        foreach ($urlArr as $i => $url) {
            $conn[$i] = $this->createHandle($url, $proxy);
            curl_multi_add_handle($mh, $conn[$i]);
        }

        $active = 0;
        do {
            $status = curl_multi_exec($mh, $active);
            if ($active && $status === CURLM_OK) {
                // Sleep until a socket has activity instead of spinning the CPU.
                if (curl_multi_select($mh, 1.0) === -1) {
                    // select() can fail immediately on some platforms; back off briefly.
                    usleep(1000);
                }
            }
            // CURLM_CALL_MULTI_PERFORM (old libcurl) means "call exec again right away".
        } while ($active && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

        $res = array();
        foreach ($conn as $i => $handle) {
            // Cast so that a handle without any content still yields a string.
            $body = (string) curl_multi_getcontent($handle);
            $res[$i] = mb_convert_encoding($body, 'UTF-8', 'UTF-8,GBK,GB2312,BIG5');
            curl_multi_remove_handle($mh, $handle);
            curl_close($handle);
        }
        curl_multi_close($mh);

        return $res;
    }

    /**
     * Build a cURL handle carrying the options shared by every request.
     *
     * @param string   $url   Address the handle will fetch.
     * @param int|bool $proxy Pass 1 to route through the local proxy.
     * @return resource|\CurlHandle Configured, not yet executed handle.
     */
    private function createHandle($url, $proxy){
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        // Return the body from curl_exec()/curl_multi_getcontent() instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
        // Sends "Accept-Encoding: gzip" and transparently decodes a gzip response.
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip');
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36');
        // Loose comparison kept on purpose: callers may pass 1, '1' or true.
        if ($proxy == 1) {
            curl_setopt($ch, CURLOPT_PROXY, '127.0.0.1:9090');
        }

        return $ch;
    }

}//class end
// 0 0