获取前端网页 php爬虫 get_html.php
来源:互联网 发布:帝国cms小说 编辑:程序博客网 时间:2024/06/06 23:52
<!DOCTYPE html>
<html>
<head><title>spider</title></head>
<body>
<form method="get" action="get_html.php">crawl web html address:<input type="text" name="url" ><input type="submit" value="crawl"></form>
<?php
date_default_timezone_set('PRC');

/**
 * Debug helper: dumps a value inside a <pre> block, then terminates the
 * script, printing the current timestamp as the exit message.
 *
 * @param mixed $var Value to inspect.
 */
function dump($var)
{
    echo "<pre>";
    var_dump($var);
    echo "</pre>";
    exit(date("Y-m-d H:i:s", time()));
}

/**
 * Downloads a web page together with the stylesheets, scripts and images it
 * references, and stores everything in a directory named after the host,
 * located next to this file:
 *
 *   <host>/<page>.html
 *   <host>/style/css/*.css
 *   <host>/style/js/*.js
 *   <host>/style/img/*.(gif|png|jpg)
 *
 * NOTE(review): directories and files are created with mode 0777, as in the
 * original script. That is world-writable; tighten it if the deployment
 * does not need it.
 */
class spider
{
    /** @var string URL of the crawled page. */
    public $url;
    /** @var string URL scheme of the page, "http" or "https". */
    public $http;
    /** @var string Host name of the page (without port). */
    public $host;
    /** @var string Raw HTML of the page. */
    public $html;
    /** @var string Local directory that receives the downloaded files. */
    public $path;
    /** @var string Name of the output directory (the host name). */
    public $title;

    /** @var string scheme://host[:port] of the page, used for root-relative links. */
    private $origin;
    /** @var string Origin plus the directory part of the page path, used for relative links. */
    private $base;

    /**
     * Fetches the page and prepares the output directories.
     *
     * Terminates the script with a message when the URL is missing, is not a
     * plain http(s) URL, cannot be downloaded, or when the output directory
     * cannot be created.
     *
     * @param string $url        Absolute http(s) URL of the page to crawl.
     * @param string $imagesPath Unused; kept for backward compatibility.
     */
    public function __construct($url, $imagesPath = '')
    {
        set_time_limit(60);

        $url = is_string($url) ? trim($url) : '';
        if (!$url) {
            exit("Please input url!");
        }

        // Validate before touching the network: only http/https is accepted,
        // and the host doubles as a directory name, so it must be a plain
        // host name and must not be "." or "..".
        $parts  = parse_url($url);
        $scheme = (is_array($parts) && isset($parts['scheme'])) ? strtolower($parts['scheme']) : '';
        $host   = (is_array($parts) && isset($parts['host'])) ? $parts['host'] : '';
        if (($scheme !== 'http' && $scheme !== 'https')
            || !preg_match('#^[\w.-]+$#', $host)
            || trim($host, '.') === ''
        ) {
            exit("Please input a valid http(s) url!");
        }

        $this->http   = $scheme;
        $this->host   = $host;
        $this->origin = $scheme . '://' . $host . (isset($parts['port']) ? ':' . $parts['port'] : '');

        // Directory part of the page path (always ends with "/"), so that
        // relative asset links resolve against the page, not the site root.
        $pagePath   = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        $this->base = $this->origin . substr($pagePath, 0, strrpos($pagePath, '/') + 1);

        $res = $this->fetch($url);
        $this->html  = $res;
        $this->url   = $url;
        $this->title = $this->host;

        if (!$res) {
            exit('could not load html webpage.');
        }

        $this->path = dirname(__FILE__) . '/' . $this->title;
        if (!$this->ensure_dir($this->path) || !$this->ensure_dir($this->path . '/style')) {
            exit('could not create output directory.');
        }
    }

    /**
     * Downloads every URL in the list into the root of the output directory,
     * naming each file after the last path segment of its URL. URLs that
     * fail to download are skipped.
     *
     * @param array $url_array List of absolute URLs.
     */
    public function get_resource($url_array)
    {
        foreach ($url_array as $url) {
            $name = basename($url);
            if ($name === '') {
                continue;
            }
            $body = $this->fetch($url);
            if ($body === false) {
                continue;
            }
            $file = $this->path . '/' . $name;
            if (file_put_contents($file, $body) !== false) {
                chmod($file, 0777);
            }
        }
    }

    /**
     * Saves every gif/png/jpg referenced by an <img src> into style/img.
     */
    public function get_image()
    {
        $matches = array();
        preg_match_all("/<img.*?src=['\"](.*?\/[\w-]+\.(gif|png|jpg)).*?['\"]/i", $this->html, $matches);
        foreach ($matches[1] as $url) {
            $this->save_asset('img', $url);
        }
        echo "<br />get image over.";
    }

    /**
     * Saves every stylesheet referenced by a <link href> into style/css.
     */
    public function get_css()
    {
        $matches = array();
        preg_match_all("/<link.*?href=['\"](.*?\.css).*?>/i", $this->html, $matches);
        foreach ($matches[1] as $url) {
            $this->save_asset('css', $url);
        }
        echo "<br />get css over.";
    }

    /**
     * Saves every script referenced by a <script src> into style/js.
     */
    public function get_js()
    {
        $matches = array();
        // .js URLs often carry a query-string parameter so that browsers do
        // not serve a stale cached version; the capture stops at ".js" and
        // therefore drops that parameter.
        preg_match_all("/<script.*?src=['\"](.*?\.js).*?>/i", $this->html, $matches);
        foreach ($matches[1] as $url) {
            $this->save_asset('js', $url);
        }
        echo "<br />get js over.";
    }

    /**
     * Rewrites the css/js/img references in the page so they point at the
     * local ./style/... copies, then writes the page to "<name>.html" in the
     * output directory.
     *
     * <name> is the last segment of the URL path with its extension removed
     * (query string and fragment are ignored). When the path has no file
     * segment the host name is used instead.
     */
    public function formate_html()
    {
        $res = $this->html;
        $url = $this->url;

        // Point the resource links at the local copies.
        $res = preg_replace("/<link.*?href=['\"].*?\/([\w-]+\.css).*?>/i", '<link href="./style/css/$1" rel="stylesheet" type="text/css" />', $res);
        $res = preg_replace("/<script.*?src=['\"].*?\/([\w-\.]+\.js).*?>/i", '<script type="text/javascript" src="./style/js/$1">', $res);
        // For images only the src attribute is replaced, so the other
        // attributes of the <img> tag are kept.
        $res = preg_replace_callback(
            "/<img.*?src=['\"].*?\/([\w-]+\.(gif|png|jpg)).*?>/i",
            function ($m) {
                return preg_replace("/src=['\"].*?\/([\w-]+\.(gif|png|jpg)).*?['\"]/i", "src='./style/img/" . $m[1] . "'", $m[0]);
            },
            $res
        );

        // Build the file name from the URL path only, so a query string
        // cannot end up in the name.
        $base = basename((string) parse_url($url, PHP_URL_PATH));
        if ($base === '') {
            $base = $this->host;
        }
        $file_name = preg_replace('/\.\w+$/', '', $base);
        if ($file_name === '' || $file_name === null) {
            $file_name = 'index';
        }

        $file = $this->path . '/' . $file_name . '.html';
        if (file_put_contents($file, $res) !== false) {
            chmod($file, 0777);
        }
    }

    /**
     * Performs an HTTP(S) GET with a 5 second timeout.
     *
     * @param string $url Absolute URL.
     * @return string|false Response body, or false on failure.
     */
    private function fetch($url)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        // The URL is user-supplied: never let cURL use file://, gopher://, ...
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        $body = curl_exec($ch);
        curl_close($ch);

        return $body;
    }

    /**
     * Creates the directory with mode 0777 when it does not exist yet.
     *
     * @param string $dir Directory path.
     * @return bool True when the directory exists afterwards.
     */
    private function ensure_dir($dir)
    {
        if (!file_exists($dir)) {
            if (!mkdir($dir)) {
                return is_dir($dir);
            }
            chmod($dir, 0777);
        }

        return is_dir($dir);
    }

    /**
     * Turns a link found in the page into an absolute URL.
     *
     * @param string $url Link as written in the HTML.
     * @return string Absolute URL.
     */
    private function absolute_url($url)
    {
        if (strpos($url, '//') === 0) {
            // Protocol-relative link: reuse the page's scheme. This must be
            // tested before the single-slash case below.
            return $this->http . ':' . $url;
        }
        if (strpos($url, '/') === 0) {
            return $this->origin . $url;
        }
        if (preg_match('#^[a-z][a-z0-9+.-]*://#i', $url)) {
            return $url;
        }

        return $this->base . $url;
    }

    /**
     * Downloads one asset into style/<subdir>, named after the last path
     * segment of its URL. A failed download is skipped so that no empty
     * file is left behind.
     *
     * @param string $subdir "css", "js" or "img".
     * @param string $url    Link as written in the HTML.
     * @return bool True when the file was written.
     */
    private function save_asset($subdir, $url)
    {
        $url  = $this->absolute_url($url);
        $body = $this->fetch($url);
        if ($body === false) {
            return false;
        }

        $dir = $this->path . '/style/' . $subdir;
        if (!$this->ensure_dir($dir)) {
            return false;
        }

        $file = $dir . '/' . basename($url);
        if (file_put_contents($file, $body) === false) {
            return false;
        }
        chmod($file, 0777);

        return true;
    }
}

/**
 * Crawls one page: stylesheets, scripts and images first, then the page
 * itself with its links rewritten to the local copies.
 *
 * @param string $url Absolute http(s) URL of the page.
 */
function crawl($url)
{
    $spider = new spider($url);
    $spider->get_css();
    $spider->get_js();
    $spider->get_image();
    $spider->formate_html();
}

// Only crawl when the form submitted a plain string (not e.g. ?url[]=...).
if (!empty($_GET['url']) && is_string($_GET['url'])) {
    crawl($_GET['url']);
}
?>
</body>
</html>
0 0
- 获取前端网页 php爬虫 get_html.php
- php网页简单爬虫
- 【php网页爬虫】php抓取网页数据
- php 前端获取数据
- php获取网页内容
- PHP 获取网页title
- php获取网页内容
- php获取网页内容
- php获取网页源代码
- 【php】获取网页内容
- PHP获取网页内容
- php 获取网页内容
- php爬虫采集网页需求和原理分析-php采集网页-php爬虫视频教程2
- [PHP代码] PHP远程获取网页内容
- 使用php获取网页内容
- 使用php获取网页内容
- php获取当前网页名称
- php获取网页内容方法
- 移动互联网终端的touch事件,touchstart, touchend, touchmove
- Java学习总结第十五天Java多态性
- Linux常用命令-------软件包
- 大数据系列修炼-Scala课程69
- Java中只有值传递,而不存在引用传递
- 获取前端网页 php爬虫 get_html.php
- 企业内部信息安全管理——(一)风险识别和管控
- UOJ 184 [ZJOI2016]旅行者
- #软工的小小萌新
- 多线程学习总结(一)——认识多线程
- The server quit without updating PID file (/var/run/mysqld/mysqld.pid). ...
- POJ 2187 Beauty Contest (平面最远点对 凸包+旋转卡壳 推荐)
- JavaScript学习笔记(0)
- gnuplot根据csv的字符串字段值绘制不同颜色的点