PHP 网页采集:相对高效版

来源:互联网 发布:csgo网络参数显示 编辑:程序博客网 时间:2024/04/28 22:23

相对于前面写的版本,极大地减小了 IO 开销,并减少了对主机名的重复解析。

<?php
header("content-type: text/html; charset=utf-8");

/**
 * Small HTTP/1.1 client built on a raw socket, used to crawl a paginated
 * gallery site: it downloads a page, stores it under ./download/, looks for
 * the "next page" link and keeps going until no further page is found.
 *
 * Only plain http:// is supported (the socket is opened without TLS).
 */
class HttpWrap
{
    public $timeout = 10;
    public $status = '';
    public $host;
    public $port = 80;
    private $ip;
    private $conn;
    private $path;
    private $url;
    private $scheme;
    public $http_method = 'GET';
    public $http_version = "HTTP/1.1";
    public $agent = "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0";
    public $accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";
    public $gzip = "gzip";
    public $referer;
    public $cookie;
    public $submit_type = "application/x-www-form-urlencoded";
    private $accept_language = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    public $connection = "keep-alive";
    private $cmd_line;
    private $header;
    public $post_content;
    private $redirect;
    private $is_gzip;
    public $response_num;
    public $response_header;
    public $response_body_length = 0;
    public $response_body;
    public $roll_link;
    public $roll_group;
    public $filename;
    public $encoding;

    // Host name that $ip was resolved for, so a different host triggers a new lookup.
    private $ip_host;
    // Query string of the current URL (without the leading "?").
    private $query = '';
    // True when the response announced "Transfer-Encoding: chunked".
    private $is_chunked = false;
    // True when the response carried a Content-Length header (even if it is 0).
    private $has_content_length = false;
    // Upper bound on consecutive redirects, to avoid redirect loops.
    private $max_redirects = 10;

    /**
     * Crawl starting at $url and follow "next page" / "next group" links.
     *
     * This method does not return: it terminates the script with die() once
     * no further page can be found or the connection cannot be established.
     *
     * @param string $url absolute http:// URL of the first page
     */
    public function init($url)
    {
        $redirects = 0;

        // Iterative instead of recursive: a long gallery would otherwise nest
        // one stack frame per downloaded page.
        while (true) {
            $this->url = $url;
            $url_pair = parse_url($url);
            if ($url_pair === false || empty($url_pair['host'])) {
                $this->status = "创建连接失败";
                die('没有下一页');
            }

            $this->host = $url_pair['host'];
            $this->path = isset($url_pair['path']) ? $url_pair['path'] : '/';
            $this->query = isset($url_pair['query']) ? $url_pair['query'] : '';
            $this->scheme = isset($url_pair['scheme']) ? $url_pair['scheme'] : 'http';
            if (!empty($url_pair['port'])) {
                $this->port = $url_pair['port'];
            }

            // Resolve the host only once per host name (saves a DNS lookup per page).
            if (empty($this->ip) || $this->ip_host !== $this->host) {
                $this->ip = gethostbyname($this->host);
                $this->ip_host = $this->host;
            }

            // Clear per-request results so values from the previous page cannot leak.
            $this->redirect = null;
            $this->roll_link = false;

            if (!$this->connect()) {
                die($this->status);
            }
            $this->sendRequest();

            // Follow a redirect only when it stays on the same host.
            if ($this->redirect) {
                $same_host = preg_match(
                    "#^http://" . preg_quote($this->host, '#') . "#i",
                    $this->redirect
                );
                if ($same_host && $redirects < $this->max_redirects) {
                    $redirects++;
                    $target_path = parse_url($this->redirect, PHP_URL_PATH);
                    $this->referer = $this->host . ($target_path ? $target_path : '/');
                    $url = $this->redirect;
                    continue;
                }
            }
            $redirects = 0;

            if (!$this->roll_link) {
                die('没有下一页');
            }

            $next_url = substr($this->url, 0, strrpos($this->url, '/') + 1) . $this->roll_link;

            // When the "next page" link points at the current page, this group
            // is finished and the crawl moves on to the next group.
            $current_name = strtolower(trim(basename($this->url, '.html')));
            $next_name = strtolower(trim(basename($next_url, '.html')));
            if ($current_name == $next_name) {
                $next_group = $this->getNextGroup($this->response_body);
                if (empty($next_group)) {
                    die('没有下一页');
                }
                echo "<font color='color'>即将采集下一组</font><br />";
                sleep(1);
                $url = $next_group;
            } else {
                $url = $next_url;
            }
        }
    }

    /**
     * Open the TCP connection to the resolved IP and port.
     *
     * @return bool true on success; on failure $this->status holds the reason
     */
    private function connect()
    {
        $errno = 0;
        $errstr = '';
        $this->conn = fsockopen($this->ip, $this->port, $errno, $errstr, $this->timeout);
        if ($this->conn) {
            // Bound every later read so a silent server cannot block forever.
            stream_set_timeout($this->conn, (int) $this->timeout);
            $this->status = '链接成功';
            return true;
        }

        // Each case needs its own break, otherwise every error ends up with
        // the default message.
        switch ($errno) {
            case -3:
                $this->status = "创建socket链接失败";
                break;
            case -4:
                $this->status = "dns查询失败";
                break;
            case -5:
                $this->status = "链接被拒绝或超时";
                break;
            default:
                $this->status = "创建连接失败";
                break;
        }
        return false;
    }

    /**
     * Close the socket if it is open.
     */
    private function disconnect()
    {
        if (is_resource($this->conn)) {
            fclose($this->conn);
        }
        $this->conn = null;
    }

    /**
     * Send the request for the current URL, parse the response headers and,
     * for a 200 response, read, decode and store the body and extract the
     * "next page" link. The socket is closed before returning.
     */
    private function sendRequest()
    {
        if (empty($this->path)) {
            $this->path = "/";
        }
        $target = $this->path;
        if ($this->query !== '' && $this->query !== null) {
            $target .= '?' . $this->query;
        }

        $this->cmd_line = $this->http_method . " " . $target . " " . $this->http_version . "\r\n";
        // Rebuilt from scratch on every request; appending to the old value
        // would re-send all headers of the previous requests.
        $this->header = $this->buildHeader();

        $request = $this->cmd_line . $this->header . $this->post_content;
        $len = strlen($request);
        if ($len !== fwrite($this->conn, $request, $len)) {
            $this->status = "发送请求failed";
            $this->disconnect();
            return;
        }

        $this->readResponseHeader();

        if ((int) $this->response_num === 200) {
            $body = $this->readBody();

            if ($this->is_gzip && $body !== '') {
                // The payload is gzip-framed, so gzdecode() is required;
                // gzinflate() only understands raw deflate streams.
                $decoded = function_exists('gzdecode') ? gzdecode($body) : false;
                if ($decoded !== false) {
                    $body = $decoded;
                } else {
                    $this->status = "不支持压缩";
                }
            }

            $this->response_body = $body;
            if ($body !== '') {
                $this->saveBody($body);
            }

            // Padding pushes the output past browser/proxy buffering thresholds.
            echo str_repeat("  ", 2048);
            echo "对链接" . htmlspecialchars($this->url, ENT_QUOTES, 'UTF-8') . "发起请求<br />";
            $this->getRollLink($this->response_body);
        }

        $this->disconnect();
    }

    /**
     * Build the request header block (everything after the request line),
     * terminated by the empty line that ends the header section.
     *
     * @return string
     */
    private function buildHeader()
    {
        $header = '';
        if (!empty($this->host)) {
            $header .= "Host: " . $this->host . "\r\n";
        }
        if (!empty($this->agent)) {
            $header .= "User-Agent: " . $this->agent . "\r\n";
        }
        if (!empty($this->accept)) {
            $header .= "Accept: " . $this->accept . "\r\n";
        }
        if (!empty($this->gzip)) {
            if (function_exists("gzdecode")) {
                $header .= "Accept-encoding: gzip\r\n";
            } else {
                $this->status = "不支持压缩";
            }
        }
        if (!empty($this->referer)) {
            $header .= "Referer: " . $this->referer . "\r\n";
        }
        if (!empty($this->accept_language)) {
            $header .= "Accept-Language: " . $this->accept_language . "\r\n";
        }
        if (!empty($this->cookie)) {
            if (!is_array($this->cookie)) {
                // The line terminator is mandatory, otherwise the next header
                // is glued onto the cookie value.
                $header .= "Cookie: " . $this->cookie . "\r\n";
            } else {
                $pairs = array();
                foreach ($this->cookie as $key => $val) {
                    $pairs[] = $key . "=" . urlencode($val);
                }
                $header .= "Cookie: " . implode(";", $pairs) . "\r\n";
            }
        }
        if (!empty($this->submit_type)) {
            $header .= "Content-Type: " . $this->submit_type . "\r\n";
        }
        if (!empty($this->post_content)) {
            $header .= "Content-length: " . strlen($this->post_content) . "\r\n";
        }
        if (!empty($this->connection)) {
            $header .= "Connection: " . $this->connection . "\r\n";
        }
        return $header . "\r\n";
    }

    /**
     * Read the status line and headers up to the blank line, filling
     * response_num, response_header, redirect, is_gzip, is_chunked and
     * response_body_length.
     */
    private function readResponseHeader()
    {
        $this->response_num = null;
        $this->response_header = array();
        $this->response_body_length = 0;
        $this->response_body = '';
        $this->is_gzip = false;
        $this->is_chunked = false;
        $this->has_content_length = false;

        while (($line = fgets($this->conn, 8192)) !== false) {
            // A bare line break marks the end of the header section.
            if (preg_match("/^\r?\n$/", $line)) {
                break;
            }

            // Status line: capture the numeric code (200 means success).
            if (preg_match('#^HTTP/[^\s]*\s(.*?)\s#', $line, $status)) {
                $this->response_num = $status[1];
            }

            // Redirect target; header names are case-insensitive.
            if (preg_match('#^(Location:|URI:)\s*(.*)#i', trim($line), $matches) && $matches[2] !== '') {
                if (!preg_match('#\:\/\/#', $matches[2])) {
                    // No scheme/host in the target: rebuild an absolute URL
                    // from the current host and port.
                    $this->redirect = "http://" . $this->host . ":" . $this->port;
                    if (!preg_match("|^/|", $matches[2])) {
                        $this->redirect .= "/" . $matches[2];
                    } else {
                        $this->redirect .= $matches[2];
                    }
                } else {
                    $this->redirect = $matches[2];
                }
            }

            if (preg_match('#^Content-Encoding:\s*gzip#i', $line)) {
                $this->is_gzip = true;
            }
            if (preg_match('#^Transfer-Encoding:\s*chunked#i', $line)) {
                $this->is_chunked = true;
            }
            if (preg_match('#^Content-Length:\s*(\d+)#i', $line, $len)) {
                $this->response_body_length = (int) $len[1];
                $this->has_content_length = true;
            }

            $this->response_header[] = $line;
        }
    }

    /**
     * Read the complete response body according to the framing announced in
     * the headers (chunked, Content-Length, or read-until-close).
     *
     * @return string raw (possibly still compressed) body bytes
     */
    private function readBody()
    {
        if ($this->is_chunked) {
            return $this->readChunkedBody();
        }

        $body = '';
        if ($this->has_content_length) {
            // Stop exactly at the announced length; reading further would
            // block on a kept-alive connection.
            $remaining = $this->response_body_length;
            while ($remaining > 0 && !feof($this->conn)) {
                $chunk = fread($this->conn, min($remaining, 8192));
                if ($chunk === false || $chunk === '') {
                    break;
                }
                $body .= $chunk;
                $remaining -= strlen($chunk);
            }
            return $body;
        }

        // No framing information: read until the server closes the
        // connection or the stream timeout expires.
        while (!feof($this->conn)) {
            $chunk = fread($this->conn, 8192);
            if ($chunk === false || $chunk === '') {
                break;
            }
            $body .= $chunk;
        }
        return $body;
    }

    /**
     * Decode a "Transfer-Encoding: chunked" body.
     *
     * @return string concatenated chunk payloads
     */
    private function readChunkedBody()
    {
        $body = '';
        while (($line = fgets($this->conn, 8192)) !== false) {
            // Chunk header: hexadecimal size, optionally followed by ";extensions".
            $size_field = trim($line);
            $semicolon = strpos($size_field, ';');
            if ($semicolon !== false) {
                $size_field = trim(substr($size_field, 0, $semicolon));
            }
            if ($size_field === '' || !ctype_xdigit($size_field)) {
                break;
            }
            $remaining = hexdec($size_field);
            if ($remaining <= 0) {
                // Zero-sized chunk terminates the body; trailers are ignored
                // because the socket is closed right after.
                break;
            }
            while ($remaining > 0) {
                $chunk = fread($this->conn, min($remaining, 8192));
                if ($chunk === false || $chunk === '') {
                    break 2;
                }
                $body .= $chunk;
                $remaining -= strlen($chunk);
            }
            // Consume the CRLF that follows every chunk payload.
            fgets($this->conn, 8192);
        }
        return $body;
    }

    /**
     * Store the decoded body under ./download/<digits-from-url>/ (or a
     * date-named directory when the URL has no numeric path segment).
     *
     * @param string $body decoded response body
     */
    private function saveBody($body)
    {
        if (preg_match('#/(\d+)/#', $this->url, $sub_dir)) {
            $dirname = "./download/" . $sub_dir[1];
        } else {
            $dirname = "./download/" . date("Ymd");
        }

        if (!is_dir($dirname) && !mkdir($dirname, 0777, true) && !is_dir($dirname)) {
            $this->status = "failed to create " . $dirname;
            return;
        }

        // Written in one go (no FILE_APPEND) so re-running the crawl replaces
        // the file instead of appending a second copy to it.
        $filename = $dirname . '/' . basename($this->url);
        if (file_put_contents($filename, $body) === false) {
            $this->status = "failed to write " . $filename;
        }
    }

    /**
     * Convert page bytes to UTF-8. The source encoding is detected from the
     * first 32 bytes on first use and then cached in $this->encoding.
     *
     * @param string $raw page bytes as received
     * @return string UTF-8 text (the input unchanged if detection fails)
     */
    private function toUtf8($raw)
    {
        if (empty($this->encoding)) {
            $detected = mb_detect_encoding(
                substr($raw, 0, 32),
                array('GB2312', 'GBK', 'UTF-8', 'BIG5', 'LATIN1')
            );
            if ($detected === false) {
                return $raw;
            }
            $this->encoding = $detected;
        }
        // Converting even UTF-8 input replaces invalid sequences, which would
        // otherwise make the /u regexes below fail.
        return mb_convert_encoding($raw, 'UTF-8', $this->encoding);
    }

    /**
     * Extract the "next page" link from <ul class="image"> and store it in
     * $this->roll_link (false when there is none).
     *
     * @param string $filename page content (the name is historical; this is
     *                         the HTML itself, not a file name)
     */
    private function getRollLink($filename)
    {
        $this->roll_link = false;
        $content = $this->toUtf8($filename);

        if (preg_match('#<ul\s+class="image"[^>]*?>.*?</ul>#is', $content, $match)
            && preg_match('#<a\s+href="([^"]+?)">下一页</a>#ui', $match[0], $next)
        ) {
            $this->roll_link = trim($next[1]);
        }
    }

    /**
     * Find the URL of the next group from the links in <ul class="page">.
     *
     * With two links, the one whose numeric file name is smaller is chosen;
     * with a single link that one is used. Only "../"-relative links are
     * accepted.
     *
     * @param string $filename page content (HTML, not a file name)
     * @return string|null absolute URL of the next group, or null if none
     */
    private function getNextGroup($filename)
    {
        $content = $this->toUtf8($filename);

        if (!preg_match('#<ul\s+class="page"[^>]*?>.*?</ul>#is', $content, $match)) {
            $this->status = "failed to match class=page";
            return null;
        }
        if (!preg_match_all('#<a\s+href="([^"]*?)">.*?</a>#usi', $match[0], $next)) {
            $this->status = "failed to match href";
            return null;
        }

        $links = $next[1];
        if (count($links) == 2) {
            $first = basename($links[0], ".html");
            $second = basename($links[1], ".html");
            // The smaller id is taken as the group to move to.
            $choice = intval($first) < intval($second) ? $first : $second;

            foreach ($links as $item) {
                // Strict comparison: a match at offset 0 is still a match.
                if ($choice !== ''
                    && strripos($item, $choice) !== false
                    && substr($item, 0, 2) == '..'
                ) {
                    return $this->groupUrl($item);
                }
            }
        } elseif (count($links) == 1) {
            if (substr($links[0], 0, 2) == '..') {
                return $this->groupUrl($links[0]);
            }
        }
        return null;
    }

    /**
     * Turn a "../xxx" link into an absolute URL below the first path segment
     * of the current page.
     *
     * @param string $item link starting with ".."
     * @return string
     */
    private function groupUrl($item)
    {
        $link = substr($item, 2);
        $sub_path = explode('/', $this->path);
        $section = isset($sub_path[1]) ? $sub_path[1] : '';
        return $this->scheme . '://' . $this->host . '/' . $section . $link;
    }
}

ob_implicit_flush(true);
set_time_limit(0);

$url = "http://www.mmkao.com/Beautyleg/201412/7066.html";
$http = new HttpWrap();
$http->cookie = "safedog-flow-item=41E2DBFEF121A8A2835ADB4476E5D3EC";
$http->referer = "www.mmkao.com";
$http->init($url);
?>


0 0
原创粉丝点击