获取前端网页 php爬虫 get_html.php

来源：互联网发布：帝国cms小说编辑：程序博客网时间：2024/06/06 23:52
<!DOCTYPE html>
<html>
<head><title>spider</title></head>
<body>
<form method="get" action="get_html.php">crawl web html address:<input type="text" name="url" ><input type="submit" value="crawl"></form>
<?php
date_default_timezone_set('PRC');

/**
 * Debug helper: var_dump a value inside a <pre> block, then terminate the
 * script, printing the current timestamp as the exit message.
 *
 * @param mixed $var Value to dump.
 */
function dump($var)
{
    echo "<pre>";
    var_dump($var);
    echo "</pre>";
    exit(date("Y-m-d H:i:s", time()));
}

/**
 * Downloads one web page and the css / js / image files it references into a
 * directory named after the page's host, located next to this script.
 *
 * Layout produced:
 *   <host>/<page>.html
 *   <host>/style/css/*, <host>/style/js/*, <host>/style/img/*
 */
class spider
{
    /** @var string Page URL as given by the caller (trimmed). */
    public $url;
    /** @var string URL scheme, "http" or "https". */
    public $http;
    /** @var string Host part of the page URL. */
    public $host;
    /** @var string Raw HTML of the downloaded page. */
    public $html;
    /** @var string Absolute path of the output directory. */
    public $path;
    /** @var string Output directory name (currently the host). */
    public $title;

    /**
     * Fetch the page and prepare the output directories.
     *
     * Terminates the script with a message when the URL is missing, is not a
     * plain http(s) URL, the page cannot be downloaded, or the output
     * directory cannot be created.
     *
     * @param string $url        Page to crawl.
     * @param string $imagesPath Unused; kept for backward compatibility.
     */
    public function __construct($url, $imagesPath = '')
    {
        set_time_limit(60);

        // $_GET values may be arrays; only a non-empty string is usable.
        if (!is_string($url) || !trim($url)) {
            exit("Please input url!");
        }
        $url = trim($url);

        // Validate before touching the network: only http/https, and the
        // host must be a plain name because it becomes a directory name
        // (a host such as ".." would otherwise escape the script directory).
        $parts = parse_url($url);
        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : '';
        $host = isset($parts['host']) ? $parts['host'] : '';
        if (($scheme !== 'http' && $scheme !== 'https')
            || !preg_match('/^[a-z0-9][\w.-]*\z/i', $host)
        ) {
            exit('Please input a valid http(s) url!');
        }
        $this->http = $scheme;
        $this->host = $host;

        $res = $this->fetch($url);
        $this->html = $res;
        $this->url = $url;
        $this->title = $this->host;
        if (!$res) {
            exit('could not load html webpage.');
        }

        $this->path = dirname(__FILE__) . '/' . $this->title;
        if (!$this->ensure_dir($this->path) || !$this->ensure_dir($this->path . '/style')) {
            exit('could not create output directory.');
        }
    }

    /**
     * Download each URL of the list into the output directory root.
     * URLs are used as given (no relative-URL resolution); failed downloads
     * are skipped.
     *
     * @param array $url_array List of absolute URLs.
     */
    public function get_resource($url_array)
    {
        foreach ($url_array as $url) {
            $this->save_resource($url, $this->path);
        }
    }

    /**
     * Download every gif/png/jpg referenced by an <img> tag into style/img.
     */
    public function get_image()
    {
        $this->download_matches("/<img.*?src=['\"](.*?\/[\w-]+\.(gif|png|jpg)).*?['\"]/i", 'img');
        echo "<br />get image over.";
    }

    /**
     * Download every .css file referenced by a <link> tag into style/css.
     */
    public function get_css()
    {
        $this->download_matches("/<link.*?href=['\"](.*?\.css).*?>/i", 'css');
        echo "<br />get css over.";
    }

    /**
     * Download every .js file referenced by a <script> tag into style/js.
     */
    public function get_js()
    {
        // Parameters after a .js file name are usually there to stop the
        // browser from serving a stale cached version of the script, so the
        // capture stops at ".js" and drops them.
        $this->download_matches("/<script.*?src=['\"](.*?\.js).*?>/i", 'js');
        echo "<br />get js over.";
    }

    /**
     * Rewrite css / js / image references in the page so they point at the
     * local ./style/... copies, then save the page as <name>.html in the
     * output directory.
     */
    public function formate_html()
    {
        $res = $this->html;
        $url = $this->url;

        // Process the source links. File-name classes are written [\w.-]
        // (hyphen last): the original [\w-\.] is rejected as an invalid
        // range by PCRE2, and dotted names such as "bootstrap.min.css" are
        // downloaded by get_css() and therefore must be rewritten too.
        $rewrites = array(
            "/<link.*?href=['\"].*?\/([\w.-]+\.css).*?>/i"
                => '<link href="./style/css/$1" rel="stylesheet" type="text/css" />',
            "/<script.*?src=['\"].*?\/([\w.-]+\.js).*?>/i"
                => '<script type="text/javascript" src="./style/js/$1">',
        );
        foreach ($rewrites as $pattern => $replacement) {
            $rewritten = preg_replace($pattern, $replacement, $res);
            // preg_replace returns null on a regex error; keep the previous
            // markup rather than saving an empty page.
            if ($rewritten !== null) {
                $res = $rewritten;
            }
        }

        // For images only the src attribute is replaced so the tag's other
        // attributes survive.
        $rewritten = preg_replace_callback(
            "/<img.*?src=['\"].*?\/([\w-]+\.(gif|png|jpg)).*?>/i",
            function ($tag) {
                return preg_replace(
                    "/src=['\"].*?\/([\w-]+\.(gif|png|jpg)).*?['\"]/i",
                    "src='./style/img/" . $tag[1] . "'",
                    $tag[0]
                );
            },
            $res
        );
        if ($rewritten !== null) {
            $res = $rewritten;
        }

        // Page name: last URL segment without its extension.
        $file_name = preg_replace('/\.\w+$/', '', basename($url));
        if ($file_name === null || $file_name === '') {
            $file_name = 'index';
        }
        $target = $this->path . '/' . $file_name . '.html';
        if (file_put_contents($target, $res) !== false) {
            chmod($target, 0777);
        }
    }

    /**
     * GET a URL with cURL (5 second timeout, http/https only).
     *
     * @param string $url
     * @return string|false Response body, or false on failure.
     */
    private function fetch($url)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        // Both the user-supplied URL and URLs found in the page are
        // untrusted: never let cURL read file://, ftp://, gopher:// etc.
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        $res = curl_exec($ch);
        curl_close($ch);

        return $res;
    }

    /**
     * Create a directory (mode 0777) unless it already exists.
     *
     * @param string $dir
     * @return bool True when the directory exists afterwards.
     */
    private function ensure_dir($dir)
    {
        if (is_dir($dir)) {
            return true;
        }
        if (!mkdir($dir) && !is_dir($dir)) {
            return false;
        }
        chmod($dir, 0777);

        return true;
    }

    /**
     * Turn a reference found in the page into an absolute URL.
     *
     * @param string $url Reference as written in the HTML.
     * @return string
     */
    private function resolve_url($url)
    {
        // Protocol-relative ("//cdn.example.com/a.css"): must be tested
        // before the single-slash case, which it would otherwise match.
        if (strpos($url, '//') === 0) {
            return $this->http . ':' . $url;
        }
        // Root-relative ("/a/b.css").
        if (strpos($url, '/') === 0) {
            return $this->http . "://" . $this->host . $url;
        }
        // Plain relative ("a/b.css"): resolved against the host root.
        if (strpos($url, '//') === false) {
            return $this->http . "://" . $this->host . '/' . $url;
        }

        // Already absolute.
        return $url;
    }

    /**
     * Download one URL and store it in $dir under the URL's basename.
     *
     * @param string $url Absolute URL.
     * @param string $dir Existing target directory.
     * @return bool False when the download or the write failed.
     */
    private function save_resource($url, $dir)
    {
        $name = basename($url);
        if ($name === '') {
            return false;
        }
        $res = $this->fetch($url);
        if ($res === false) {
            // Do not leave an empty file behind for a failed download.
            return false;
        }
        $file = $dir . '/' . $name;
        if (file_put_contents($file, $res) === false) {
            return false;
        }
        chmod($file, 0777);

        return true;
    }

    /**
     * Download every resource whose URL is captured by group 1 of $pattern
     * in the page HTML into style/<subdir>.
     *
     * @param string $pattern PCRE pattern applied to the page HTML.
     * @param string $subdir  Sub-directory of style/ ("css", "js", "img").
     */
    private function download_matches($pattern, $subdir)
    {
        $matches = array();
        preg_match_all($pattern, $this->html, $matches);
        if (empty($matches[1])) {
            return;
        }
        $dir = $this->path . '/style/' . $subdir;
        if (!$this->ensure_dir($dir)) {
            return;
        }
        foreach ($matches[1] as $url) {
            $this->save_resource($this->resolve_url($url), $dir);
        }
    }
}

/**
 * Crawl one page: download its css, js and images, then save the rewritten
 * html.
 *
 * @param string $url Page URL.
 */
function crawl($url)
{
    $spider = new spider($url);
    $spider->get_css();
    $spider->get_js();
    $spider->get_image();
    $spider->formate_html();
}

if (!empty($_GET['url'])) {
    crawl($_GET['url']);
}
?>
</body>
</html>
0 0