php网页采集 相对高效版
来源:互联网 发布:csgo网络参数显示 编辑:程序博客网 时间:2024/04/28 22:23
相对前面写的版本,极大地减小了IO开销,减少了对主机的解析
<?php
header("content-type: text/html; charset=utf-8");

/**
 * Minimal socket-based HTTP client that crawls a paginated gallery.
 *
 * Starting from one page URL it downloads the page into ./download/<dir>/,
 * looks for the "下一页" (next page) link inside <ul class="image">, and keeps
 * following it. When the next-page link points back at the current page it
 * switches to the next group via the links inside <ul class="page">.
 *
 * Only plain http:// is supported. Same-host redirects are followed.
 */
class HttpWrap
{
    /** Socket connect/read timeout in seconds. */
    public $timeout = 10;
    /** Human-readable description of the last connection/request outcome. */
    public $status = '';
    public $host;
    public $port = 80;
    /** Cached IP of $host so the name is resolved once, not once per page. */
    private $ip;
    private $conn;
    private $path;
    /** Query string of the current URL (without the leading "?"), or ''. */
    private $query = '';
    private $url;
    private $scheme;
    public $http_method = 'GET';
    public $http_version = "HTTP/1.1";
    public $agent = "Mozilla/5.0 (Windows NT 6.1; rv:33.0) Gecko/20100101 Firefox/33.0";
    public $accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";
    public $gzip = "gzip";
    public $referer;
    /** Either a ready-made cookie string or an array of name => value. */
    public $cookie;
    public $submit_type = "application/x-www-form-urlencoded";
    private $accept_language = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    public $connection = "keep-alive";
    private $cmd_line;
    private $header;
    public $post_content;
    private $redirect;
    private $is_gzip;
    public $response_num;
    public $response_header;
    public $response_body_length = 0;
    public $response_body;
    public $roll_link;
    public $roll_group;
    public $filename;
    /** Source encoding of the pages; detected on first use when left empty. */
    public $encoding;

    /** Upper bound on consecutive redirects, to stop redirect loops. */
    private $max_redirects = 10;

    /**
     * Crawl starting at $url until no further page can be found.
     *
     * This method does not return: it terminates the script with die() when
     * the crawl ends or a connection cannot be established.
     *
     * @param string $url Absolute http:// URL of the first page.
     */
    public function init($url)
    {
        $redirect_count = 0;

        // Iterate instead of recursing so a long gallery cannot exhaust the call stack.
        while (true) {
            $this->url = $url;
            $url_pair = parse_url($url);
            if ($url_pair === false || empty($url_pair['host'])) {
                $this->status = "invalid url: " . $url;
                die($this->status);
            }

            // Resolve the host only when it is not cached yet or has changed.
            if (empty($this->ip) || $url_pair['host'] !== $this->host) {
                $this->ip = gethostbyname($url_pair['host']);
            }
            $this->host = $url_pair['host'];
            $this->path = isset($url_pair['path']) ? $url_pair['path'] : '';
            $this->query = isset($url_pair['query']) ? $url_pair['query'] : '';
            $this->scheme = isset($url_pair['scheme']) ? $url_pair['scheme'] : 'http';
            if (!empty($url_pair['port'])) {
                $this->port = $url_pair['port'];
            }

            if (!$this->connect()) {
                die($this->status);
            }
            $this->sendRequest();
            $this->disconnect();

            // Follow a redirect only when it stays on the same host.
            if ($this->redirect
                && preg_match("#^http://" . preg_quote($this->host, '#') . "#i", $this->redirect)
            ) {
                if (++$redirect_count > $this->max_redirects) {
                    $this->status = "too many redirects";
                    die($this->status);
                }
                $redirect_parts = parse_url($this->redirect);
                $redirect_path = isset($redirect_parts['path']) ? $redirect_parts['path'] : '/';
                $this->referer = $this->host . '/' . ltrim($redirect_path, '/');
                $url = $this->redirect;
                continue;
            }
            $redirect_count = 0;

            if (!$this->roll_link) {
                die('没有下一页');
            }

            $next_url = substr($this->url, 0, strripos($this->url, '/') + 1) . $this->roll_link;
            $current_page = strtolower(trim(basename($this->url, '.html')));
            $next_page = strtolower(trim(basename($next_url, '.html')));

            if ($current_page == $next_page) {
                // The "next page" link points at the current page: this group is done.
                $next_group = $this->getNextGroup($this->response_body);
                if (empty($next_group)) {
                    die('没有下一页');
                }
                echo "<font color='color'>即将采集下一组</font><br />";
                sleep(1);
                $url = $next_group;
            } else {
                $url = $next_url;
            }
        }
    }

    /**
     * Open the TCP connection to the cached IP.
     *
     * @return bool True on success; on failure $status describes the error.
     */
    private function connect()
    {
        $errno = 0;
        $errstr = '';
        $this->conn = fsockopen($this->ip, $this->port, $errno, $errstr, $this->timeout);

        if ($this->conn) {
            // Bound every later read so a stalled server cannot block the crawl forever.
            stream_set_timeout($this->conn, (int) $this->timeout);
            $this->status = '链接成功';
            return true;
        }

        switch ($errno) {
            case -3:
                $this->status = "创建socket链接失败";
                break;
            case -4:
                $this->status = "dns查询失败";
                break;
            case -5:
                $this->status = "链接被拒绝或超时";
                break;
            default:
                $this->status = "创建连接失败";
        }
        return false;
    }

    /** Close the current socket, if any. */
    private function disconnect()
    {
        if (is_resource($this->conn)) {
            fclose($this->conn);
        }
        $this->conn = null;
    }

    /**
     * Send the request for the current URL and process the response.
     *
     * Fills response_num, response_header, response_body_length, response_body,
     * redirect and roll_link. On a 200 response the decoded body is written to
     * ./download/<dir>/<basename of url>.
     */
    private function sendRequest()
    {
        // Reset all per-request state; stale values would otherwise leak into the next page.
        $this->header = '';
        $this->redirect = null;
        $this->is_gzip = false;
        $this->response_num = null;
        $this->response_header = array();
        $this->response_body_length = 0;
        $this->response_body = '';
        $this->roll_link = false;

        if (empty($this->path)) {
            $this->path = "/";
        }
        $target = $this->path;
        if ($this->query !== '') {
            $target .= '?' . $this->query;
        }
        $this->cmd_line = $this->http_method . " " . $target . " " . $this->http_version . "\r\n";

        if (!empty($this->host)) {
            $this->header .= "Host: " . $this->host . "\r\n";
        }
        if (!empty($this->agent)) {
            $this->header .= "User-Agent: " . $this->agent . "\r\n";
        }
        if (!empty($this->accept)) {
            $this->header .= "Accept: " . $this->accept . "\r\n";
        }
        if (!empty($this->gzip)) {
            if (function_exists("gzdecode")) {
                $this->header .= "Accept-encoding: gzip\r\n";
            } else {
                $this->status = "不支持压缩";
            }
        }
        if (!empty($this->referer)) {
            $this->header .= "Referer: " . $this->referer . "\r\n";
        }
        if (!empty($this->accept_language)) {
            $this->header .= "Accept-Language: " . $this->accept_language . "\r\n";
        }
        if (!empty($this->cookie)) {
            if (is_array($this->cookie)) {
                $pairs = array();
                foreach ($this->cookie as $key => $val) {
                    $pairs[] = $key . "=" . urlencode($val);
                }
                $this->header .= "Cookie: " . implode("; ", $pairs) . "\r\n";
            } else {
                // The header line must be terminated, otherwise it fuses with the next one.
                $this->header .= "Cookie: " . $this->cookie . "\r\n";
            }
        }
        if (!empty($this->submit_type)) {
            $this->header .= "Content-Type: " . $this->submit_type . "\r\n";
        }
        if (!empty($this->post_content)) {
            $this->header .= "Content-length: " . strlen($this->post_content) . "\r\n";
        }
        if (!empty($this->connection)) {
            $this->header .= "Connection: " . $this->connection . "\r\n";
        }
        $this->header .= "\r\n";

        // Send the request.
        $request = $this->cmd_line . $this->header . $this->post_content;
        $len = strlen($request);
        if ($len != fwrite($this->conn, $request, $len)) {
            $this->status = "发送请求failed";
            return;
        }

        // Read the response header line by line.
        $content_length = null;
        $is_chunked = false;
        while (($response_header = fgets($this->conn, 1024)) !== false) {
            // Status line: the number after the protocol version (200 means success).
            if (preg_match("#^HTTP/[^\s]*\s(.*?)\s#", $response_header, $status)) {
                $this->response_num = $status[1];
            }

            // Redirect target; detection and extraction share one case-insensitive pattern.
            if (preg_match("#^(Location:|URI:)\s*(.*)#i", trim($response_header), $matches)) {
                if (!preg_match("#\:\/\/#", $matches[2])) {
                    // Relative location: prepend scheme, host and port.
                    $this->redirect = "http://" . $this->host . ":" . $this->port;
                    if (!preg_match("|^/|", $matches[2])) {
                        $this->redirect .= "/" . $matches[2];
                    } else {
                        $this->redirect .= $matches[2];
                    }
                } else {
                    // Already an absolute URL.
                    $this->redirect = $matches[2];
                }
            }

            if (preg_match("#^Content-Encoding:\s*gzip#i", $response_header)) {
                $this->is_gzip = true;
            }
            if (preg_match('#^Content-Length:\s*(\d+)#i', $response_header, $length_match)) {
                $content_length = (int) $length_match[1];
                $this->response_body_length = $content_length;
            }
            if (preg_match('#^Transfer-Encoding:.*chunked#i', $response_header)) {
                $is_chunked = true;
            }

            // A blank line ends the header section.
            if (preg_match("/^\r?\n$/", $response_header)) {
                break;
            }
            $this->response_header[] = $response_header;
        }

        if ($this->response_num != 200) {
            return;
        }

        // Read the body according to how the server framed it.
        if ($is_chunked) {
            $body = $this->readChunkedBody();
        } elseif ($content_length !== null) {
            // Stop at Content-Length; reading past it would block on a keep-alive socket.
            $body = $this->readBytes($content_length);
        } else {
            $body = (string) stream_get_contents($this->conn);
        }
        if ($content_length === null) {
            $this->response_body_length = strlen($body);
        }

        if ($this->is_gzip && $body !== '') {
            // "Content-Encoding: gzip" is the gzip file format, which gzdecode() handles;
            // gzinflate() expects a raw deflate stream and fails on the gzip header.
            $decoded = gzdecode($body);
            if ($decoded !== false) {
                $body = $decoded;
            } else {
                $this->status = "failed to decode gzip body";
            }
        }
        $this->response_body = $body;

        // Store under ./download/<numeric directory from the URL>, or today's date.
        if (preg_match('#/(\d+)/#', $this->url, $sub_dir)) {
            $dirname = "./download/" . $sub_dir[1];
        } else {
            $dirname = "./download/" . date("Ymd");
        }
        if (!is_dir($dirname) && !mkdir($dirname, 0777, true) && !is_dir($dirname)) {
            $this->status = "failed to create directory " . $dirname;
        } else {
            // One write of the complete body; overwriting avoids duplicated content on re-runs.
            $filename = $dirname . '/' . basename($this->url);
            if (file_put_contents($filename, $body) === false) {
                $this->status = "failed to write " . $filename;
            }
        }

        // Padding so that browsers start rendering the progress output immediately.
        echo str_repeat(" ", 2048);
        echo "对链接" . $this->url . "发起请求<br />";

        $this->getRollLink($this->response_body);
    }

    /**
     * Read exactly $length bytes from the socket, or fewer on EOF/timeout.
     *
     * @param int $length Number of bytes to read.
     * @return string The bytes that were read.
     */
    private function readBytes($length)
    {
        $data = '';
        while ($length > 0 && !feof($this->conn)) {
            $chunk = fread($this->conn, min($length, 8192));
            if ($chunk === false || $chunk === '') {
                break;
            }
            $data .= $chunk;
            $length -= strlen($chunk);
        }
        return $data;
    }

    /**
     * Read a body sent with "Transfer-Encoding: chunked".
     *
     * @return string The reassembled body without chunk framing.
     */
    private function readChunkedBody()
    {
        $body = '';
        while (($line = fgets($this->conn, 1024)) !== false) {
            // The size line is "<hex size>[;extensions]".
            $size_field = explode(';', trim($line), 2);
            $hex = trim($size_field[0]);
            if ($hex === '' || !ctype_xdigit($hex)) {
                break;
            }
            $size = (int) hexdec($hex);
            if ($size === 0) {
                // Last chunk: skip optional trailer lines up to the terminating blank line.
                while (($trailer = fgets($this->conn, 1024)) !== false && trim($trailer) !== '') {
                    continue;
                }
                break;
            }
            $body .= $this->readBytes($size);
            // Each chunk's data is followed by CRLF.
            fgets($this->conn, 1024);
        }
        return $body;
    }

    /**
     * Return $html converted to UTF-8.
     *
     * When $encoding is empty it is detected once from the whole document and
     * cached. UTF-8 is tested first because GBK/BIG5 text is rarely valid UTF-8,
     * while pure ASCII (the typical start of an HTML page) is valid in every
     * candidate encoding.
     *
     * @param string $html Raw page content.
     * @return string UTF-8 content.
     */
    private function toUtf8($html)
    {
        if (empty($this->encoding)) {
            $detected = mb_detect_encoding(
                $html,
                array('UTF-8', 'GB2312', 'GBK', 'BIG5', 'LATIN1'),
                true
            );
            $this->encoding = $detected ? $detected : 'UTF-8';
        }
        if (strcasecmp($this->encoding, 'UTF-8') === 0) {
            return $html;
        }
        return mb_convert_encoding($html, 'UTF-8', $this->encoding);
    }

    /**
     * Extract the "下一页" (next page) link from the page and store it in
     * $roll_link; $roll_link is false when the page has no such link.
     *
     * @param string $html Raw page content.
     */
    private function getRollLink($html)
    {
        $content = $this->toUtf8($html);

        $this->roll_link = false;
        if (preg_match('#<ul\s+class="image"[^>]*?>.*?</ul>#is', $content, $match)
            && preg_match('#<a\s+href="([^"]+?)">下一页</a>#ui', $match[0], $next)
        ) {
            $this->roll_link = trim($next[1]);
        }
    }

    /**
     * Turn a "../xxx" link into an absolute URL below the first path segment
     * of the current URL.
     *
     * @param string $item Link taken from the page.
     * @return string|null Absolute URL, or null when the link does not start with "..".
     */
    private function resolveParentLink($item)
    {
        if (substr($item, 0, 2) != '..') {
            return null;
        }
        $link = substr($item, 2);
        $sub_path = explode('/', $this->path);
        $first_segment = isset($sub_path[1]) ? $sub_path[1] : '';
        return $this->scheme . '://' . $this->host . '/' . $first_segment . $link;
    }

    /**
     * Find the URL of the next group from the links inside <ul class="page">.
     *
     * With two links, the one whose numeric basename is smaller is chosen;
     * with a single link, that link is used.
     *
     * @param string $html Raw page content.
     * @return string|null Absolute URL of the next group, or null when none
     *                     was found ($status then says why where known).
     */
    private function getNextGroup($html)
    {
        $content = $this->toUtf8($html);

        if (!preg_match('#<ul\s+class="page"[^>]*?>.*?</ul>#is', $content, $match)) {
            $this->status = "failed to match class=page";
            return null;
        }
        if (!preg_match_all('#<a\s+href="([^"]*?)">.*?</a>#usi', $match[0], $next)) {
            $this->status = "failed to match href";
            return null;
        }

        $links = $next[1];
        if (count($links) == 2) {
            $first = basename($links[0], ".html");
            $second = basename($links[1], ".html");
            $choice = intval($first) < intval($second) ? $first : $second;

            foreach ($links as $item) {
                // Strict comparison: a match at position 0 is still a match.
                if ($choice !== '' && strripos($item, $choice) !== false) {
                    $url = $this->resolveParentLink($item);
                    if ($url !== null) {
                        return $url;
                    }
                }
            }
        } elseif (count($links) == 1) {
            return $this->resolveParentLink($links[0]);
        }

        return null;
    }
}

ob_implicit_flush(true);
set_time_limit(0);

$url = "http://www.mmkao.com/Beautyleg/201412/7066.html";
$http = new HttpWrap();
$http->cookie = "safedog-flow-item=41E2DBFEF121A8A2835ADB4476E5D3EC";
$http->referer = "www.mmkao.com";
$http->init($url);
?>
0 0
- php网页采集 想对高效版
- php网页采集 测试版
- php采集网页 alpha版
- php网页采集 修正版
- php网页采集 修改版
- php simple_html_dom网页采集
- 【PHP网页采集】 —— Snoopy
- PHP的curl_init采集网页数据 实例教程
- php获取网页内容方法 采集程序
- 五子棋对战PHP网页版
- php爬虫采集网页需求和原理分析-php采集网页-php爬虫视频教程2
- php多线程采集网页数据-php采集网页-php爬虫视频教程8
- 采集上万,百万的网页内容网址-php采集网页-php爬虫视频教程5
- 批量采集上百万网页内容-php采集网页-php爬虫视频教程6
- post方式采集网页数据-php采集网页-php爬虫视频教程7
- php采集环境wamp搭建-php采集网页-php爬虫视频教程1
- php采集文章内容列表链接-php采集网页-php爬虫视频教程3
- 采集网页
- 百度胖老师吧论坛视频管理员同志你是相信人民网新华网光明网中新网中经网中青网千龙网民主与法制网还是相信要求删除胖老师的救命帖子为了要封杀胖老师的嘴巴的上海宝钢集团出钱雇佣的删帖公司职业枪手.
- Android动画
- 百度胖老师吧论坛管理员同志核实相互转告要求删除胖老师的帖子就是上海宝钢集团雇佣的删贴公司网络危机公关公司所为
- Python error: Unable to find vcvarsall.bat
- 渍滋阻坠茁啄籽左祝最注拽坐住佐滋自谞
- php网页采集 想对高效版
- 纂撰字滓桩尊锥座抓醉纵佐奏揍醉啄棕捉
- 座族住祝酌做字抓撞妆专锥佐谆专籽琢追
- 最族撰着庄做子妆邹住卒嘴缀租谆桌棕灼
- 拙揍坐准奏赘仔酌渍自祝最酌走茁佐篆阻
- 安卓金币字符串转换成三位一个逗号的格式
- MFC消息映射机制概述
- 踪足谞琢祝庄砖谆尊做遵鬃拽缀姿酌灼淄
- 拙滋柞阻卒足嘴赘祝转咨自作做椎棕柞坐