curl采集

来源:互联网 发布:淘宝模特拍摄动作 编辑:程序博客网 时间:2024/05/16 10:10
<?php

/**
 * Small cURL wrapper that imitates a desktop browser, followed by a demo script
 * that scrapes article thumbnails from a listing page and stores them on disk.
 */
class CurlImitate
{
    /**
     * Error message of the most recent curl() call, '' when it succeeded.
     *
     * @var string
     */
    private $lastError = '';

    /**
     * Perform an HTTP request and return the response body.
     *
     * For backward compatibility a failed transfer returns the cURL error
     * message as the result; use getLastError() to tell an error message
     * apart from a real response body.
     *
     * @param string       $url         URL to request.
     * @param array|string $data        Request body; only sent when $method is 'POST' (upper case).
     * @param string       $method      HTTP method handed to CURLOPT_CUSTOMREQUEST.
     * @param bool         $setcooke    True: store received cookies in $cookie_file.
     *                                  False: send the cookies read from $cookie_file.
     * @param string       $cookie_file Path of the cookie file.
     *
     * @return string Response body, or the cURL error message on failure.
     */
    public function curl($url, $data = '', $method = 'GET', $setcooke = false, $cookie_file = '')
    {
        $this->lastError = '';

        // Scraping can be slow; remove the script execution time limit.
        set_time_limit(0);

        $curl = curl_init();

        $options = array(
            CURLOPT_URL => $url,
            CURLOPT_CUSTOMREQUEST => $method,
            // NOTE(review): certificate and host verification are switched off,
            // which allows man-in-the-middle attacks. Kept because existing
            // callers rely on it; enable both for anything security-sensitive.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            // Sends "Accept-Encoding: gzip" and transparently decompresses the
            // response, so no separate Accept-Encoding header is needed.
            CURLOPT_ENCODING => 'gzip',
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0',
            // Follow "Location:" redirects and fill in the Referer header while doing so.
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_AUTOREFERER => 1,
            // Return the body from curl_exec() instead of printing it.
            CURLOPT_RETURNTRANSFER => 1,
        );

        if ($method === 'POST') {
            $options[CURLOPT_POSTFIELDS] = $data;
        }

        if ($setcooke) {
            // Write the cookies received during this request to the file.
            $options[CURLOPT_COOKIEJAR] = $cookie_file;
        } else {
            // Send the cookies previously stored in the file.
            $options[CURLOPT_COOKIEFILE] = $cookie_file;
        }

        // Options are applied one by one so that a rejected option does not
        // prevent the remaining ones from being set.
        foreach ($options as $option => $value) {
            curl_setopt($curl, $option, $value);
        }

        $body = curl_exec($curl);
        $failed = curl_errno($curl) !== 0;
        if ($failed) {
            $this->lastError = curl_error($curl);
        }

        // Always release the handle, including on the error path.
        curl_close($curl);

        return $failed ? $this->lastError : $body;
    }

    /**
     * Error message of the most recent curl() call.
     *
     * @return string '' when the last request succeeded.
     */
    public function getLastError()
    {
        return $this->lastError;
    }

    /**
     * Imitate a GET request after converting the URL between character sets.
     *
     * @param string $url URL to request.
     * @param string $utf Character set the URL is currently encoded in.
     * @param string $gbk Character set the URL should be converted to.
     *
     * @return string Response body, or the cURL error message on failure.
     */
    public function getImitate($url, $utf, $gbk)
    {
        $converted = iconv($utf, $gbk, $url);

        // iconv() returns false when the conversion fails; request the
        // unconverted URL rather than passing false on to cURL.
        if ($converted === false) {
            $converted = $url;
        }

        return $this->curl($converted);
    }

    /**
     * Imitate a login: POST the form data to $url1 while storing the session
     * cookies, then GET $url2 with those cookies.
     *
     * @param string $url1 URL the form data is posted to.
     * @param string $url2 URL fetched afterwards with the stored cookies.
     * @param array  $data Form fields to post; defaults to the sample credentials.
     *
     * @return string Response body of the GET request, or the cURL error message on failure.
     */
    public function postImitate($url1, $url2, $data = array('user_name' => 'manman', 'user_paw' => '8520'))
    {
        // tempnam() falls back to the system temp directory on its own, but
        // raises a notice when the requested directory does not exist.
        $cookieDir = is_dir('./temp') ? './temp' : sys_get_temp_dir();
        $cookie_file = tempnam($cookieDir, 'cookie');

        // Submit the form and store the cookies the server hands out.
        $this->curl($url1, $data, 'POST', true, $cookie_file);

        // Fetch the target page with the stored cookies.
        $str = $this->curl($url2, '', 'GET', false, $cookie_file);

        // The cookie file is only needed for this exchange.
        if (is_string($cookie_file) && is_file($cookie_file)) {
            unlink($cookie_file);
        }

        return $str;
    }
}

/*
 * Instantiate the client.
 */
$CurlImitate = new CurlImitate();

/*
 * Page to scrape.
 */
$url = 'https://www.huxiu.com/startups.html?f=index_nav_article';

/*
 * Fetch the page (the character-set conversion is a no-op for UTF-8 to UTF-8).
 */
$str = $CurlImitate->getImitate($url, 'UTF-8', 'UTF-8');

/*
 * First pass: cut out the article list section of the page.
 */
$preg = '#<div class="mod-b mod-art" data-aid="162628">(.*)<div class="get-mod-more js-get-mod-more-list transition" .*>#isU';
$listHtml = preg_match($preg, $str, $data) === 1 ? $data[0] : '';

/*
 * Second pass: capture four fields per article - the data-original image URL,
 * the alt text, the src of the image inside the link, and the "mob-sub" text.
 */
$itemPattern = '#<div class="mod-b mod-art" .*>.*<img class="lazy" data-original="(.*)" alt="(.*)">.*<a .*><img src="(.*)"></a>.*<div class="mob-sub">(.*)</div>.*#isU';

// One row per article: array(data-original URL, alt text, link image src, mob-sub text).
$now = array();
if (preg_match_all($itemPattern, $listHtml, $expression) && isset($expression[4])) {
    // array_map() with a null callback zips the per-field columns into per-article rows.
    $now = array_map(null, $expression[1], $expression[2], $expression[3], $expression[4]);
}

/*
 * Download each article image into ./image/.
 */
$imageDir = './image';
if (count($now) > 0 && (is_dir($imageDir) || mkdir($imageDir, 0777, true))) {
    foreach ($now as $v) {
        $img = $CurlImitate->curl($v[0]);

        // Do not store a cURL error message or an empty body as an image.
        if ($CurlImitate->getLastError() !== '' || $img === '') {
            continue;
        }

        // Name the file after the URL *path* only; the query string may
        // contain '/' and other characters that are invalid in a file name.
        $urlPath = parse_url($v[0], PHP_URL_PATH);
        $basename = is_string($urlPath) ? basename($urlPath) : '';
        if ($basename === '') {
            $basename = 'image';
        }

        // Timestamp plus random number keeps files with equal basenames apart.
        $filename = $imageDir . '/' . time() . rand(1000, 9999) . '.' . $basename;

        file_put_contents($filename, $img);
    }
}

/*
 * Author's note: tested against
 * https://www.huxiu.com/startups.html?f=index_nav_article and can be run
 * directly. Storing the scraped rows (for example via PDO) is left open.
 */

// The closing tag is required here because non-PHP page text follows this listing in the file.
?>

1 0
原创粉丝点击