CURL采集

来源:互联网 发布:ae数据模板 编辑:程序博客网 时间:2024/05/16 10:49
<?php
defined('BASEPATH') OR exit('No direct script access allowed');

/**
 * Demo controller around the `techweb` table.
 *
 *  - index():  scrapes article teasers from techweb.com.cn and stores them.
 *  - show():   lists the stored rows through a short-lived static HTML cache.
 *  - search(): paginated title search with an XML page cache and a
 *              Memcache-backed counter of search terms.
 *
 * All database access goes through the PDO handle CodeIgniter already holds
 * in $this->db->conn_id, always with bound parameters.
 */
class Welcome extends CI_Controller
{
    /** Lifetime, in seconds, of the generated cache files. */
    const CACHE_TTL = 60;

    /** Number of rows shown on one search result page. */
    const PAGE_SIZE = 3;

    /**
     * Scrape the techweb.com.cn front page and insert every matched teaser
     * (title, link, locally saved image, summary) into `techweb`.
     *
     * Echoes a redirect script to Welcome/show on success, or the failure
     * message when nothing was matched or the insert failed.
     */
    public function index()
    {
        $this->load->library('Curl');
        $url = "http://www.techweb.com.cn/";
        $preg = '#<div class="photo3">(.*)<a href="(.*)" target="_blank">(.*)<img (.*) src="(.*)">(.*)</a>(.*)</div>(.*)<h4><a title="(.*)" href="(.*)" target="_blank">(.*)</a></h4>(.*)<p>(.*)<a href="(.*)" target="_blank">(.*)</a>(.*)</p>#isU';
        $arr = $this->curl->get_info($url, $preg);
        if (!is_array($arr)) {
            $arr = array();
        }

        // Capture groups of $preg that carry the data we keep.
        $info = array(
            'title'   => isset($arr[9]) ? $arr[9] : array(),   // title
            'href'    => isset($arr[14]) ? $arr[14] : array(), // hyperlink
            'img'     => isset($arr[5]) ? $arr[5] : array(),   // image URL
            'content' => isset($arr[13]) ? $arr[13] : array(), // summary text
        );

        // Nothing matched (layout changed or download failed): an INSERT
        // without any VALUES tuple would be invalid SQL, so stop here.
        if (empty($info['title'])) {
            echo "添加失败";
            return;
        }

        // Download the images and replace the remote URLs by local paths.
        $path = './public/img/';
        $info['img'] = $this->curl->get_img($info['img'], $path);

        // One multi-row INSERT with bound parameters. Scraped text is
        // untrusted, and mysql_escape_string() no longer exists in PHP 7+.
        $placeholders = array();
        $params = array();
        foreach ($info['title'] as $k => $v) {
            $placeholders[] = '(?,?,?,?)';
            $params[] = $v;
            $params[] = isset($info['href'][$k]) ? $info['href'][$k] : '';
            $params[] = isset($info['img'][$k]) ? $info['img'][$k] : '';
            $params[] = isset($info['content'][$k]) ? $info['content'][$k] : '';
        }
        $sql = "insert into techweb (title,href,img,content) VALUES " . implode(',', $placeholders);
        $stmt = $this->db->conn_id->prepare($sql);
        $res = $stmt && $stmt->execute($params);

        if ($res) {
            echo "<script>alert('添加成功');location.href='" . site_url('Welcome/show') . "'</script>";
        } else {
            echo "添加失败";
        }
    }

    /**
     * List all stored rows.
     *
     * Serves the generated HTML file while it is younger than CACHE_TTL
     * seconds; otherwise renders the view, stores the markup as the new
     * static file and prints it.
     */
    public function show()
    {
        $file_name = "./application/views/show_cat.html";
        if (file_exists($file_name) && (time() - filemtime($file_name) < self::CACHE_TTL)) {
            echo "静态页面";
            echo file_get_contents($file_name);
            return;
        }

        $stmt = $this->db->conn_id->prepare("select * from techweb");
        $stmt->execute();
        $arr = $stmt->fetchAll(PDO::FETCH_ASSOC);

        // Capture the rendered view. ob_get_clean() returns the markup AND
        // closes the buffer; the previous ob_clean() only emptied it and
        // left the buffer level open.
        ob_start();
        $this->load->vars('arr', $arr);
        $this->load->view('show.html');
        $ob = ob_get_clean();

        // LOCK_EX keeps concurrent requests from interleaving their writes.
        file_put_contents($file_name, $ob, LOCK_EX);
        echo "动态读取";
        echo $ob;
    }

    /**
     * Paginated title search ("pagination plus pseudo-static page").
     *
     * Reads `search` and `page` from the query string, counts the search
     * term, loads the requested page either from the XML cache or from the
     * database, and renders search.html with: search_data, page, prev,
     * next, page_num, search and arr.
     */
    public function search()
    {
        $raw_search = $this->input->get('search');
        $search = is_string($raw_search) ? $raw_search : '';
        // Current page, forced to a positive integer.
        $page = max(1, (int) $this->input->get('page'));

        $search_data = $this->count_search($search);

        $file_name = "./application/views/page.xml";
        // The cache holds exactly one result page; the key says which one.
        $cache_key = md5($search . '|' . $page);

        $cached = $this->read_page_cache($file_name, $cache_key);
        if ($cached !== null) {
            $goods_data = $cached['rows'];
            $page_num = $cached['page_num'];
        } else {
            list($goods_data, $page_num) = $this->query_page($search, $page);
            $this->write_page_cache($file_name, $cache_key, $page_num, $goods_data);
        }

        // Previous / next page, never below page 1.
        $prev = max(1, $page - 1);
        $next = max(1, min($page + 1, $page_num));

        $this->load->vars('search_data', $search_data);
        $this->load->vars('page', $page);
        $this->load->vars('prev', $prev);
        $this->load->vars('next', $next);
        $this->load->vars('page_num', $page_num);
        $this->load->vars('search', $search);
        $this->load->vars('arr', $goods_data);
        $this->load->view('search.html');
    }

    /**
     * Increment the hit counter of a search term in Memcache.
     *
     * The counters live under the key "search" as an array term => hits.
     * Statistics are best-effort: without the extension or the server the
     * search itself must still work.
     *
     * @param string $search the term as typed by the user
     * @return array the current term => hits map (empty when unavailable)
     */
    private function count_search($search)
    {
        if (!class_exists('Memcache')) {
            return array();
        }
        $memcache = new Memcache();
        // connect() raises a warning on failure; the return value is what
        // we act on.
        if (!@$memcache->connect("127.0.0.1", 11211)) {
            return array();
        }

        $stats = $memcache->get('search');
        if (!is_array($stats)) {
            $stats = array();
        }
        $stats[$search] = isset($stats[$search]) ? $stats[$search] + 1 : 1;
        $memcache->set("search", $stats);

        return $stats;
    }

    /**
     * Fetch one result page and the total page count from the database.
     *
     * An empty search term matches every title. LIKE wildcards typed by the
     * user are escaped so they are searched literally.
     *
     * @param string $search title fragment to look for
     * @param int    $page   1-based page number
     * @return array array(rows of the page, total number of pages)
     */
    private function query_page($search, $page)
    {
        $pdo = $this->db->conn_id;
        $like = '%' . addcslashes($search, '%_\\') . '%';

        // Let the database count instead of fetching every row.
        $count_stmt = $pdo->prepare("select count(*) from techweb WHERE title LIKE ?");
        $count_stmt->execute(array($like));
        $count = (int) $count_stmt->fetchColumn();
        $page_num = (int) ceil($count / self::PAGE_SIZE);

        // LIMIT arguments must be bound as integers; bound as strings they
        // would be quoted and rejected by MySQL.
        $stmt = $pdo->prepare("select * from techweb WHERE title LIKE :title limit :offset, :size");
        $stmt->bindValue(':title', $like, PDO::PARAM_STR);
        $stmt->bindValue(':offset', (int) (($page - 1) * self::PAGE_SIZE), PDO::PARAM_INT);
        $stmt->bindValue(':size', self::PAGE_SIZE, PDO::PARAM_INT);
        $stmt->execute();

        return array($stmt->fetchAll(PDO::FETCH_ASSOC), $page_num);
    }

    /**
     * Load a result page from the XML cache.
     *
     * @param string $file_name path of the cache file
     * @param string $cache_key key of the requested search/page combination
     * @return array|null array('rows' => ..., 'page_num' => ...) on a hit;
     *                    null when the file is missing, older than
     *                    CACHE_TTL, unreadable, or holds another page
     */
    private function read_page_cache($file_name, $cache_key)
    {
        if (!is_file($file_name) || time() - filemtime($file_name) >= self::CACHE_TTL) {
            return null;
        }

        // A damaged cache file is a cache miss, not an error page.
        $previous = libxml_use_internal_errors(true);
        $xml = simplexml_load_file($file_name);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($xml === false || (string) $xml['key'] !== $cache_key) {
            return null;
        }

        $rows = array();
        foreach ($xml->message as $message) {
            $rows[] = array(
                'id'      => (string) $message['id'],
                'title'   => (string) $message->title,
                'href'    => (string) $message->href,
                'img'     => (string) $message->img,
                'content' => (string) $message->content,
            );
        }

        return array('rows' => $rows, 'page_num' => (int) $xml['page_num']);
    }

    /**
     * Store one result page as XML: a <Messages> root carrying the cache
     * key and page count, with one <message id="..."> element per row.
     *
     * @param string $file_name path of the cache file
     * @param string $cache_key key of the stored search/page combination
     * @param int    $page_num  total number of pages for this search
     * @param array  $rows      rows of the page (id, title, href, img, content)
     */
    private function write_page_cache($file_name, $cache_key, $page_num, $rows)
    {
        $xml = new SimpleXMLElement('<Messages></Messages>');
        $xml['key'] = $cache_key;
        $xml['page_num'] = (string) $page_num;

        foreach ($rows as $row) {
            $message = $xml->addChild('message');
            // The row id becomes an attribute of <message>.
            $message['id'] = (string) $row['id'];
            // Property assignment (unlike addChild with a value) escapes
            // "&" and "<" found in scraped text.
            $message->title = (string) $row['title'];
            $message->href = (string) $row['href'];
            $message->img = (string) $row['img'];
            $message->content = (string) $row['content'];
        }

        $xml->asXML($file_name);
    }
}
// The closing tag is needed here: the next line is the file-name label of
// the following listing, not PHP code.
?>
CURL.php
<?php
/**
 * Small scraping helper built on one reusable cURL handle.
 *
 *  - get_info():       GET a page, optionally extracting regex groups.
 *  - login_get_info(): POST a login form, keeping the session cookies.
 *  - get_img():        download remote images into a local directory.
 */
class Curl
{
    /** User agent sent with every request. */
    const USER_AGENT = 'Mozilla/5.0 (Windows NT 5.1; rv:9.0.1) Gecko/20100101 Firefox/9.0.1';

    /**
     * The shared cURL handle. Declared explicitly (it used to be created as
     * a dynamic property) and kept public so existing access keeps working.
     *
     * @var resource|object|null
     */
    public $curl;

    /**
     * Constructor: create the cURL handle shared by all requests.
     */
    public function __construct()
    {
        $this->curl = curl_init();
    }

    /**
     * Fetch a page with GET.
     *
     * @param string      $url  address to fetch
     * @param string|null $preg optional PCRE pattern applied to the body
     * @return mixed without $preg: the response body, or false when the
     *               transfer failed; with $preg: the preg_match_all() result
     *               without the full-match entry, i.e. capture group n is
     *               found at index n
     */
    public function get_info($url, $preg = null)
    {
        $params = $this->base_options($url);
        // Force GET. The handle is shared, so a previous login_get_info()
        // call would otherwise leave it in POST mode. (Setting
        // CURLOPT_POSTFIELDS, as before, turned even this request into an
        // empty POST.)
        $params[CURLOPT_HTTPGET] = true;
        //$this->cookies();
        curl_setopt_array($this->curl, $params);
        $content = curl_exec($this->curl);

        if (empty($preg)) {
            return $content;
        }

        $arr = array();
        // A failed transfer (false) is matched as an empty document.
        preg_match_all($preg, (string) $content, $arr);
        unset($arr[0]);
        return $arr;
    }

    /**
     * Submit a login form with POST and return the resulting page.
     *
     * @param array  $data form fields, name => value
     * @param string $url  address the form is submitted to
     * @return mixed the response body, or false when the transfer failed
     */
    public function login_get_info($data, $url)
    {
        $pairs = array();
        foreach ($data as $key => $value) {
            $pairs[] = urlencode((string) $key) . '=' . urlencode((string) $value);
        }

        $params = $this->base_options($url);
        $params[CURLOPT_POST] = true;
        $params[CURLOPT_POSTFIELDS] = implode('&', $pairs);

        $this->cookies();
        curl_setopt_array($this->curl, $params);
        return curl_exec($this->curl);
    }

    /**
     * Options common to every request.
     *
     * @param string $url address to request
     * @return array cURL option => value
     */
    private function base_options($url)
    {
        return array(
            CURLOPT_URL            => $url,
            CURLOPT_HEADER         => false,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT      => self::USER_AGENT,
        );
    }

    /**
     * Attach a cookie store to the handle.
     *
     * A jar named by the visitor's `cookie_jar` cookie is read back when it
     * exists; otherwise a new jar file is created, registered as the place
     * cURL writes cookies to, and remembered in the `cookie_jar` cookie.
     *
     * The options are applied to the handle directly; previously they were
     * collected in a local array that was thrown away.
     */
    private function cookies()
    {
        $cookie_path = './';

        if (isset($_COOKIE['cookie_jar']) && is_string($_COOKIE['cookie_jar']) && $_COOKIE['cookie_jar'] !== '') {
            // The cookie value comes from the client: use only its file
            // name, and only for a file that exists in the jar directory.
            $candidate = $cookie_path . basename($_COOKIE['cookie_jar']);
            if (is_file($candidate)) {
                curl_setopt($this->curl, CURLOPT_COOKIEFILE, $candidate);
                return;
            }
        }

        $cookie_jar = tempnam($cookie_path, 'cookie');
        if ($cookie_jar === false) {
            // No jar could be created; continue without stored cookies.
            return;
        }
        curl_setopt($this->curl, CURLOPT_COOKIEJAR, $cookie_jar);
        setcookie('cookie_jar', $cookie_jar);
    }

    /**
     * Download remote images.
     *
     * @param array  $img       image URLs (or paths)
     * @param string $save_path local directory the images are saved in
     * @return array the input array with every entry replaced by the local
     *               path of the saved file, or by '' when the download or
     *               the write failed; keys are preserved
     */
    public function get_img($img, $save_path)
    {
        // The URLs come from scraped markup. Only keep well-known image
        // extensions so a remote page cannot plant e.g. a .php file in a
        // public directory.
        $allowed = array('jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp', 'ico');

        $save_dir = $save_path === '' ? '' : rtrim($save_path, '/\\') . '/';
        if ($save_dir !== '' && !is_dir($save_dir)) {
            @mkdir($save_dir, 0755, true);
        }

        foreach ($img as $i => $url) {
            $url = (string) $url;
            // Warnings are silenced because failure is handled right below.
            $res = @file_get_contents($url);
            if ($res === false) {
                // Do not leave a zero-byte file behind for a failed download.
                $img[$i] = '';
                continue;
            }

            // Take the extension from the path component only, so dots in
            // the host or a directory name and query strings do not leak
            // into the file name.
            $url_path = (string) parse_url($url, PHP_URL_PATH);
            $extension = strtolower(pathinfo($url_path, PATHINFO_EXTENSION));
            $img_type = in_array($extension, $allowed, true) ? '.' . $extension : '';

            $path = $save_dir . time() . rand(1, 9999999) . mt_rand() . $img_type;
            $img[$i] = file_put_contents($path, $res) === false ? '' : $path;
        }

        return $img;
    }
}
 
0 0
原创粉丝点击