一个用于获取网页内容的 curl 类(采集专用)

来源:互联网 发布:cd windows syswow64 编辑:程序博客网 时间:2024/05/17 09:19

一般用于采集,防止ip被禁以后还在运行



<?php
// Usage example. It loads the class from "Curl.class.php", so it is
// presumably meant to live in its own script next to the class file.

// Curl helper class
include "Curl.class.php";

// Instantiate the client
$curl = new Curl();

// Fetch a page
$con = $curl->get("http://www.lampbrother.net");

echo $con;

// Exercise: scrape the lecturer page
?><?php
/**
 * Wget Curl driver core.
 *
 * A small HTTP client built on curl_multi. It can GET/POST a single URL or a
 * list of URLs in parallel, with an optional cap on concurrent transfers.
 *
 * @author     jonwang(jonwang@myqee.com)
 * @category   MyQEE
 * @package    System
 * @subpackage Core
 * @copyright  Copyright (c) 2008-2012 myqee.com
 * @license    http://www.myqee.com/license.html
 */
class Curl
{
    /**
     * Response details of the last request (see get_data() for the shape).
     * After a single-URL call this is one response array; after a multi-URL
     * call it is a map of URL => response array.
     *
     * @var array
     */
    protected $http_data = array();

    /** @var string|null User-Agent header value */
    protected $agent;

    /** @var array|string|null Cookies to send with the next request */
    protected $cookies;

    /** @var string|null Referer header value */
    protected $referer;

    /** @var string|null IP address to connect to instead of resolving the URL host */
    protected $ip;

    /** @var array Extra request headers ("Name: value" strings) */
    protected $header = array();

    /** @var array Raw curl options (CURLOPT_* => value) applied to every handle */
    protected $_option = array();

    /** @var array POST bodies keyed by the URL passed to post() */
    protected $_post_data = array();

    /**
     * Maximum number of transfers running at the same time, 0 means unlimited.
     *
     * @var int
     */
    protected $multi_exec_num = 100;

    const ERROR_HOST = '请求的URL错误';

    const ERROR_GET = 'GET请求错误';

    const ERROR_POST = 'POST请求错误';

    public function __construct()
    {
    }

    /**
     * Set the User-Agent sent with requests.
     *
     * Unlike the other settings, the agent is not reset by clear_set(), so it
     * stays in effect for later requests on this object.
     *
     * @param string $agent
     * @return Curl $this, for chaining
     */
    public function set_agent($agent)
    {
        $this->agent = $agent;
        return $this;
    }

    /**
     * Set the cookies for the next request.
     *
     * @param array|string $cookies name => value map, or a ready-made Cookie header value
     * @return Curl $this, for chaining
     */
    public function set_cookies($cookies)
    {
        $this->cookies = $cookies;
        return $this;
    }

    /**
     * Set the Referer for the next request.
     *
     * @param string $referer
     * @return Curl $this, for chaining
     */
    public function set_referer($referer)
    {
        $this->referer = $referer;
        return $this;
    }

    /**
     * Set the IP to connect to for the next request. The host in the URL is
     * replaced by this IP and the original host is sent as the Host header.
     *
     * @param string $ip
     * @return Curl $this, for chaining
     */
    public function set_ip($ip)
    {
        $this->ip = $ip;
        return $this;
    }

    /**
     * Set a curl option for the next request.
     *
     * CURLOPT_HTTPHEADER values are accumulated (merged with headers set
     * earlier) instead of overwriting each other.
     *
     * @param int   $key   a CURLOPT_* constant
     * @param mixed $value option value; a header string or list of header strings for CURLOPT_HTTPHEADER
     * @return Curl $this, for chaining
     */
    public function set_option($key, $value)
    {
        if ($key === CURLOPT_HTTPHEADER) {
            // Cast so that a single header string does not break array_merge().
            $this->header = array_merge($this->header, (array)$value);
        } else {
            $this->_option[$key] = $value;
        }
        return $this;
    }

    /**
     * Set the maximum number of concurrent transfers for multi-URL requests.
     *
     * @param int $num 0 means unlimited
     * @return Curl $this, for chaining
     */
    public function set_multi_max_num($num = 0)
    {
        $this->multi_exec_num = (int)$num;
        return $this;
    }

    /**
     * Submit data with POST; supports several URLs at once.
     *
     *   $urls = array
     *   (
     *     'http://www.baidu.com/',
     *     'http://mytest.com/url',
     *     'http://www.abc.com/post',
     *   );
     *   $data = array
     *   (
     *      array('k1'=>'v1','k2'=>'v2'),
     *      array('a'=>1,'b'=>2),
     *      'aa=1&bb=3&cc=3',
     *   );
     *   $curl->post($urls, $data);
     *
     * When $url is an array, $vars is matched to it by key; array entries are
     * encoded with http_build_query(), strings are sent as given.
     *
     * @param string|array $url     one URL or a list of URLs
     * @param string|array $vars    POST body, or a list of bodies when $url is a list
     * @param int          $timeout timeout in seconds, 60 by default
     * @return string|array|false same as get(): body (or false) for one URL, URL => body|false map for a list
     */
    public function post($url, $vars, $timeout = 60)
    {
        // Switch to POST mode; the empty Expect header stops curl from
        // sending "Expect: 100-continue".
        $this->set_option(CURLOPT_HTTPHEADER, array('Expect:'));
        $this->set_option(CURLOPT_POST, true);

        $post_data = array();
        if (is_array($url)) {
            // A dedicated loop variable is essential here: reusing $url would
            // replace the URL list with its last element before get() runs.
            foreach ($url as $k => $one_url) {
                if (!isset($vars[$k])) {
                    continue;
                }
                $post_data[$one_url] = is_array($vars[$k]) ? http_build_query($vars[$k]) : $vars[$k];
            }
        } else {
            $post_data[$url] = $vars;
        }
        $this->_post_data = $post_data;

        return $this->get($url, $timeout);
    }

    /**
     * Fetch data with GET; supports several URLs at once.
     *
     * All per-request settings are cleared afterwards (see clear_set()).
     *
     * @param string|array $url     one URL or a list of URLs
     * @param int          $timeout timeout in seconds
     * @return string|array|false for one URL: the body, or false when the status is not 200;
     *                            for a list: URL => body|false map
     */
    public function get($url, $timeout = 10)
    {
        $getone = !is_array($url);
        $urls = $getone ? array($url) : $url;

        // Start from a clean slate so get_resut_data() only reports this call.
        $this->http_data = array();

        $data = $this->request_urls($urls, $timeout);

        $this->clear_set();

        if ($getone) {
            $this->http_data = isset($this->http_data[$url]) ? $this->http_data[$url] : null;
            return isset($data[$url]) ? $data[$url] : null;
        }

        return $data;
    }

    /**
     * Create a configured curl handle for one URL.
     *
     * @param string $url     URL; a URL without a scheme is resolved against $_SERVER['SCRIPT_URI'] when available
     * @param int    $timeout timeout in seconds
     * @return resource|\CurlHandle handle as returned by curl_init()
     */
    public function _create($url, $timeout)
    {
        if (false === strpos($url, '://')) {
            // Relative URL: prefix it with scheme and host of the running script.
            $base = '';
            if (isset($_SERVER['SCRIPT_URI']) && preg_match('#^(http(?:s)?\://[^/]+/)#', $_SERVER['SCRIPT_URI'], $m)) {
                $base = $m[1];
            }
            $the_url = $base . ltrim($url, '/');
        } else {
            $the_url = $url;
        }

        // Work on a copy so the per-URL Host header does not pile up on the object.
        $headers = $this->header;

        if ($this->ip) {
            // With an IP set, put the IP into the URL and send the real host as Host header.
            if (preg_match('#^(http(?:s)?)\://([^/\:]+)(\:[0-9]+)?/#', $the_url . '/', $m)) {
                $port = isset($m[3]) ? $m[3] : '';
                $headers[] = 'Host: ' . $m[2];
                $the_url = $m[1] . '://' . $this->ip . $port . '/' . substr($the_url, strlen($m[0]));
            }
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $the_url);
        // Headers are included in the output; get_data() splits them off again.
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

        if (preg_match('#^https://#i', $the_url)) {
            // NOTE(review): certificate and host verification are switched off
            // for every HTTPS request, which leaves them open to interception.
            // Callers can re-enable them through set_option().
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        }

        if ($this->cookies) {
            // http_build_query() only accepts arrays; a string is used as-is.
            $cookie = is_array($this->cookies)
                ? http_build_query($this->cookies, '', ';')
                : (string)$this->cookies;
            curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        }

        if ($this->referer) {
            curl_setopt($ch, CURLOPT_REFERER, $this->referer);
        }

        if ($this->agent) {
            curl_setopt($ch, CURLOPT_USERAGENT, $this->agent);
        } elseif (array_key_exists('HTTP_USER_AGENT', $_SERVER)) {
            // Fall back to the User-Agent of the client that called this script.
            curl_setopt($ch, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']);
        }

        foreach ($this->_option as $k => $v) {
            curl_setopt($ch, $k, $v);
        }

        if ($headers) {
            // Keep only the last header per name to avoid duplicates.
            $unique = array();
            foreach ($headers as $item) {
                if (preg_match('#(^[^:]*):.*$#', $item, $m)) {
                    $unique[$m[1]] = $item;
                }
            }
            curl_setopt($ch, CURLOPT_HTTPHEADER, array_values($unique));
        }

        // POST data is stored under the URL the caller passed in, which differs
        // from $the_url once an IP or a base URL has been applied.
        if (isset($this->_post_data[$url])) {
            curl_setopt($ch, CURLOPT_POSTFIELDS, $this->_post_data[$url]);
        } elseif (isset($this->_post_data[$the_url])) {
            curl_setopt($ch, CURLOPT_POSTFIELDS, $this->_post_data[$the_url]);
        }

        return $ch;
    }

    /**
     * Fetch several URLs in parallel with curl_multi.
     *
     * At most $multi_exec_num transfers run at once (0 = unlimited); the
     * remaining URLs wait in a queue and start as earlier transfers finish.
     * Response details are stored in $this->http_data per URL.
     *
     * @see http://cn.php.net/manual/en/function.curl-multi-exec.php#88453
     * @param array $urls
     * @param int   $timeout timeout in seconds per transfer
     * @return array URL => body, false when the status is not 200, null when the transfer never completed
     */
    protected function request_urls($urls, $timeout = 10)
    {
        // Drop duplicate URLs.
        $urls = array_unique($urls);

        if (!$urls) {
            return array();
        }

        $mh = curl_multi_init();

        // URL => handle of every transfer currently attached to $mh.
        $listener_list = array();

        $result = array();

        // Number of transfers started so far.
        $list_num = 0;

        // URLs waiting for a free slot.
        $multi_list = array();

        foreach ($urls as $url) {
            if ($this->multi_exec_num > 0 && $list_num >= $this->multi_exec_num) {
                // Over the limit: queue the URL; its handle is created when a slot frees up.
                $multi_list[] = $url;
            } else {
                $handle = $this->_create($url, $timeout);
                curl_multi_add_handle($mh, $handle);
                $listener_list[$url] = $handle;
                $list_num++;
            }

            $result[$url] = null;
            $this->http_data[$url] = null;
        }

        $running = null;

        // Number of finished transfers.
        $done_num = 0;

        while (true) {
            do {
                $execrun = curl_multi_exec($mh, $running);
            } while ($execrun == CURLM_CALL_MULTI_PERFORM);

            if ($execrun != CURLM_OK) {
                break;
            }

            $progressed = false;

            while ($done = curl_multi_info_read($mh)) {
                $finished = $done['handle'];
                $done_url = array_search($finished, $listener_list, true);
                if ($done_url === false) {
                    continue;
                }
                $progressed = true;

                // Read the response while the handle is still alive.
                $this->http_data[$done_url] = $this->get_data(curl_multi_getcontent($finished), $finished);

                if ($this->http_data[$done_url]['code'] != 200) {
                    $result[$done_url] = false;
                } else {
                    $result[$done_url] = $this->http_data[$done_url]['data'];
                }

                // Detach from the multi handle first, then close.
                curl_multi_remove_handle($mh, $finished);
                curl_close($finished);

                unset($listener_list[$done_url]);
                $done_num++;

                // A slot is free: start the next queued URL, if any.
                if ($multi_list) {
                    $next_url = array_shift($multi_list);
                    $next = $this->_create($next_url, $timeout);
                    curl_multi_add_handle($mh, $next);
                    $listener_list[$next_url] = $next;
                    $list_num++;
                }
            }

            if ($done_num >= $list_num) {
                break;
            }

            if ($running > 0) {
                // Wait for socket activity instead of spinning; some platforms
                // return -1 immediately, so back off briefly in that case.
                if (curl_multi_select($mh, 1.0) === -1) {
                    usleep(1000);
                }
            } elseif (!$progressed) {
                // Nothing is running and nothing completed: avoid looping forever.
                break;
            }
        }

        // Release handles that are still attached after an early exit.
        foreach ($listener_list as $leftover) {
            curl_multi_remove_handle($mh, $leftover);
            curl_close($leftover);
        }

        curl_multi_close($mh);

        return $result;
    }

    /**
     * Return the response details of the last request.
     *
     * @return array|null one response array (code, data, header, time) after a single-URL call,
     *                    or a URL => response array map after a multi-URL call
     */
    public function get_resut_data()
    {
        return $this->http_data;
    }

    /**
     * Split a raw response (headers + body) into its parts.
     *
     * @param string               $data raw output of the transfer, headers included
     * @param resource|\CurlHandle $ch   the finished handle
     * @return array with keys code (HTTP status), data (body), header (list of header lines), time (total seconds)
     */
    protected function get_data($data, $ch)
    {
        $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);

        $result = array();
        $result['code']   = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $result['data']   = substr($data, $header_size);
        $result['header'] = explode("\r\n", substr($data, 0, $header_size));
        $result['time']   = curl_getinfo($ch, CURLINFO_TOTAL_TIME);

        return $result;
    }

    /**
     * Reset the per-request settings: options, headers, IP, cookies, referer
     * and POST data. The user agent is left untouched.
     */
    protected function clear_set()
    {
        $this->_option = array();
        $this->header = array();
        $this->ip = null;
        $this->cookies = null;
        $this->referer = null;
        $this->_post_data = array();
    }
}


0 0
原创粉丝点击