微信小说的爬取

来源:互联网 发布:热力地图制作软件 编辑:程序博客网 时间:2024/05/22 14:00

需求:输入小说名字,然后到微信端去爬取所有小说的内容,保存到自己的数据库中

第一步:需要制作一个表单提交小说名字

第二步:后台将提交过来的小说名字接收过来拼接url地址,然后再获取小说的内容,通过正则匹配或者字符串截取就可以拿到想要的东西了,最后插入数据库

个人技术难点:取得数据之后,要求50条一批插入数据库,结果我不会,值得反思

解决办法:先声明一个批量数组 $data。循环中每抓取一章,就用一个临时数组保存该章对应的数据表字段,再通过 $data[] = 临时数组 追加到 $data 中;当 $data 累计满 50 条时,调用 addAll($data) 一次性插入数据库,并清空 $data。循环结束后,再把 $data 中剩余不足 50 条的数据插入数据库。

具体代码:


/**
 * Build request headers that spoof the client IP address.
 *
 * The last octet of each address is randomised on every call so that
 * consecutive requests appear to come from different hosts.
 *
 * @return array List of three raw HTTP header lines: CLIENT-IP,
 *               X-FORWARDED-FOR and a browser-like Accept header.
 */
function fakeIp()
{
    // Address ranges the spoofed IPs are drawn from. The original author
    // notes these were taken from Baidu's ranges (123.125.68.*, 125.90.88.*).
    $cip = '14.111.58.' . mt_rand(0, 254);
    $xip = '125.90.88.' . mt_rand(0, 254);

    return array(
        'CLIENT-IP:' . $cip,
        'X-FORWARDED-FOR:' . $xip,
        // Standard browser Accept header. The previous literal had stray
        // non-ASCII text inside the "application/xml" media range, which
        // made the header malformed.
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    );
}

/*获取网页内容*/

/**
 * Fetch a URL with cURL, presenting the request as the WeChat desktop browser.
 *
 * @param string      $url      Address to request.
 * @param array|null  $header   Extra raw HTTP header lines; skipped when empty.
 * @param string|null $cookie   Cookie header value; skipped when empty.
 * @param int|bool    $is_https When truthy, peer and host certificate
 *                              verification are both switched off.
 *
 * @return string|bool Response body, or false when curl_exec() fails.
 */
function getUrlContent($url, $header = null, $cookie = null, $is_https = 0)
{
    // Collect the options first, in the order they must be applied.
    $options = array();
    $options[CURLOPT_URL] = (string) $url;

    if (!empty($header)) {
        $options[CURLOPT_HTTPHEADER] = $header;
    }

    $options[CURLOPT_REFERER] = "zsxs.shu22.cn";
    // Return the body from curl_exec() instead of printing it.
    $options[CURLOPT_RETURNTRANSFER] = 1;
    $options[CURLOPT_USERAGENT] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.493.400 QQBrowser/9.0.2524.400";
    $options[CURLOPT_ENCODING] = "gzip, deflate";

    if ($is_https) {
        $options[CURLOPT_SSL_VERIFYPEER] = FALSE;
        $options[CURLOPT_SSL_VERIFYHOST] = FALSE;
    }

    if (!empty($cookie)) {
        $options[CURLOPT_COOKIE] = $cookie;
    }

    $handle = curl_init();
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}


/*抓取小说*/

/**
 * Scrape one novel from liweih5.ikanshu.cn and store it in the local database.
 *
 * Reads `kw` (novel title), `class_id` and `other_class_id` from $_POST,
 * searches the remote site for the title, saves the novel's base record,
 * downloads every chapter linked from the catalogue page, stages the chapters
 * in `novel_chapter_add` in batches of 50 rows, then copies them into
 * `novel_chapter` and clears the staging rows.
 *
 * The outcome is reported through $this->error() / $this->success();
 * nothing is returned.
 * NOTE(review): error() presumably ends the request. The explicit `return`
 * after each call keeps the flow correct even if it does not. Confirm.
 */
public function getquanwen1()
{
    // Crawling a whole book can take a long time; lift the execution limit.
    set_time_limit(0);

    $keywords = (isset($_POST['kw']) && is_string($_POST['kw'])) ? $_POST['kw'] : '';

    // The ids were previously interpolated unquoted into SQL, which only works
    // for numeric values, so they are cast to integers. This also closes the
    // SQL injection through raw $_POST input.
    $class_id = (isset($_POST['class_id']) && is_scalar($_POST['class_id']))
        ? (int) $_POST['class_id'] : 0;
    $other_class_id = (isset($_POST['other_class_id']) && is_scalar($_POST['other_class_id']))
        ? (int) $_POST['other_class_id'] : 0;

    if (empty($class_id) || empty($other_class_id)) {
        $this->error('请选择小说分类');
        return;
    }

    // Refuse duplicates before any network work, so no cover image or chapter
    // is downloaded for a novel that is already stored.
    $novel_model = M('Novel');
    if ($novel_model->getByName($keywords) != null) {
        $this->error('您搜索的小说已经存在');
        return;
    }

    // Resolve the names of both selected categories.
    $class_model = M("NovelClass");
    $class_rows = $class_model->where(array('class_id' => $class_id))->select();
    $class_name = isset($class_rows[0]['class_name']) ? $class_rows[0]['class_name'] : null;
    $other_class_rows = $class_model->where(array('class_id' => $other_class_id))->select();
    $other_class_name = isset($other_class_rows[0]['class_name']) ? $other_class_rows[0]['class_name'] : null;

    $site = 'http://liweih5.ikanshu.cn';

    // The title is user input and usually non-ASCII, so it must be URL-encoded.
    $url = $site . '/searchbook.aspx?key=' . urlencode($keywords) . '&type=1';
    $header = fakeIp();

    // Session cookie of a real account (Wang Xi's) on the remote site.
    // NOTE(review): hard-coded credentials; should be moved to configuration.
    $cookie = "ikanshuuser=userid=141760945&username=170619092244708337966005&channelid=3725&sessionid=hxkb4mcm4tdpopcvdpi41v4g&checkcode=27ad789c2e14a1286c1b28282876e5aa&version=1;                    Hm_lpvt_d3a69bde0c513dc29f82d2f573aaceff=1497836439;                    Hm_lvt_d3a69bde0c513dc29f82d2f573aaceff=1497835352;                    ASP.NET_SessionId=hzppzb5m0ejj01dumpeqdvt2;                    ikscnid=rndid=170619092308513978&channelid=3603; ";

    // Search result page: take the book id from the first result link.
    $content = (string) getUrlContent($url, $header, $cookie, 1);
    $book_id = $this->extractBetween($content, '<a href="/book/', '.html" class="zw_box');
    if ($book_id === null || $book_id === '') {
        $this->error('您搜索的小说不存在');
        return;
    }

    $book_url = $site . '/book/' . $book_id . '.html';
    // Catalogue address, stored with the novel as its source link.
    $web_from = $site . '/bookcatalog/' . $book_id . '.html';
    $book_content = (string) getUrlContent($book_url, $header, $cookie, 1);

    // The found book must carry exactly the requested title.
    $novel_name = $this->extractBetween($book_content, '<h3>', '</h3>');
    if ($novel_name !== $keywords) {
        $this->error('您搜索的小说不存在');
        return;
    }

    // Author: text of the link pointing at authorinfo.aspx.
    $writer = '';
    if (preg_match('|<a href="\/authorinfo.aspx(.*?)>(.*?)<\/a>|i', $book_content, $author_match)) {
        $writer = $author_match[2];
    }

    // Word count: the label is located without its trailing colon, but the
    // colon's bytes are skipped as well, exactly as the original offsets did.
    $novel_zishu = $this->extractBetween($book_content, '<p>字数', '&nbsp;万字', strlen('<p>字数:'));
    if ($novel_zishu === null) {
        $novel_zishu = '';
    }

    // Completion flag: 1 when the status text right after the label is "完本".
    $is_end = 0;
    $status_pos = strpos($book_content, '<p>状态');
    if ($status_pos !== false) {
        $status = substr($book_content, $status_pos + strlen('<p>状态:'), strlen('完本'));
        if ($status === '完本') {
            $is_end = 1;
        }
    }

    // Cover image: download it through getWebImg() when the page has one.
    $cover_path = '';
    if (preg_match('|<img src="(.*?)\.jpg|i', $book_content, $img_match)) {
        $img = getWebImg($img_match[1] . '.jpg');
        $cover_path = $img["save_path"] . $img["file_name"];
    }

    // Synopsis, with a single space as the fallback when none is found.
    $novel_description = $this->extractBetween(
        $book_content,
        '<div class="subject-intro" id="uiMoreIntro">',
        '</div>'
    );

    $novel_base = array();
    $novel_base["img"] = $cover_path;
    $novel_base["mini_img"] = $cover_path;
    $novel_base['name'] = $keywords;
    $novel_base['description'] = ($novel_description !== null) ? $novel_description : ' ';
    $novel_base['class_id'] = $class_id;
    $novel_base['class_name'] = $class_name;
    $novel_base['other_class_name'] = $other_class_name;
    $novel_base['other_class_id'] = $other_class_id;
    $novel_base['publish_time'] = date("Y-m-d H:i:s");
    $novel_base['word_count'] = $novel_zishu . '万字';
    $novel_base['is_end'] = $is_end;
    $novel_base['writer'] = $writer;
    $novel_base['from'] = '全本小说';
    $novel_base['web_from'] = $web_from;
    $novel_base['is_shelf'] = 0;

    // Insert the base record; every chapter row references its id.
    $novel_id = $novel_model->data($novel_base)->add();
    if (empty($novel_id)) {
        $this->error('小说基本信息保存失败');
        return;
    }

    // Catalogue page: collect every chapter link (/book/<book>/<chapter>.html).
    $chapter_links = array();
    if (preg_match('|<a href="\/bookcatalog\/(.*?).html">|i', $book_content, $catelog)) {
        $catalog_url = $site . '/bookcatalog/' . $catelog[1] . '.html';
        $catelog_content = (string) getUrlContent($catalog_url, $header, $cookie, 1);
        if (preg_match_all('/\/book\/\d+\/\d+\.html/', $catelog_content, $lianjia)) {
            $chapter_links = $lianjia[0];
        }
    }

    $batch_size = 50;
    $data = array();

    foreach ($chapter_links as $item) {
        // The third path segment of the link is the chapter number.
        preg_match('|\/(.*?)\/(.*?)\/(.*?)\.|i', $item, $zhangjieshu);
        $chapter_no = isset($zhangjieshu[3]) ? '' . $zhangjieshu[3] : '';

        $chapter_content = (string) getUrlContent($site . $item, $header, $cookie, 1);

        // The account ran out of credit: discard everything stored for this
        // novel so that no half-scraped book is left behind.
        if (strpos($chapter_content, '您的账户余额不足,请进入充值或选择下面的快捷充值') !== false) {
            $novel_model->where(array('id' => $novel_id))->delete();
            D('novel_chapter_add')->where(array('novel_id' => $novel_id))->delete();
            $this->error('您的余额不足了,请及时充值');
            return;
        }

        // Chapter title.
        $title = $this->extractBetween(
            $chapter_content,
            '<div class="r-chaptername" id="lbChapterName">',
            '</div>'
        );
        if ($title === null) {
            $title = '';
        }
        // NOTE(review): the original tested strpos() with an empty needle, so
        // this branch never ran. The explode(' ') that followed suggests the
        // intent was to drop a leading "chapter N" token separated by a space.
        // Confirm this against real titles.
        if (strpos($title, ' ') !== false) {
            $title_parts = explode(' ', $title, 2);
            if ($title_parts[1] !== '') {
                $title = $title_parts[1];
            }
        }
        // Blank out non-standard titles such as "Chapter ...".
        if (strpos($title, 'hapter') !== false) {
            $title = ' ';
        }

        // Chapter body: line breaks become paragraph boundaries. The last
        // paragraph is left unclosed, matching the format already stored.
        $novel_content = $this->extractBetween(
            $chapter_content,
            '<div class="r-content" id="uiContent">',
            '</div>'
        );
        if ($novel_content === null) {
            $novel_content = '';
        }
        $novel_content = '<p>' . str_replace('<br/>', '</p><p>', $novel_content);

        // One row per chapter, keyed by the staging table's column names.
        $tmp_arr = array();
        $tmp_arr['title'] = $title;
        $tmp_arr['chapter'] = $chapter_no;
        $tmp_arr['content'] = $novel_content;
        $tmp_arr['order_set'] = $chapter_no;
        $tmp_arr["add_time"] = date("Y-m-d H:i:s");
        $tmp_arr['novel_name'] = $keywords;
        $tmp_arr['novel_id'] = $novel_id;
        $tmp_arr['novel_mini_img'] = $novel_base["img"];
        $data[] = $tmp_arr;

        // Flush a full batch. Counting the buffer itself keeps every batch at
        // exactly $batch_size rows.
        if (count($data) >= $batch_size) {
            D('novel_chapter_add')->addAll($data);
            $data = array();
        }
    }

    // Flush the final, partially filled batch.
    if (!empty($data)) {
        D("novel_chapter_add")->addAll($data);
    }

    // Copy the staged chapters into the main chapter table.
    $cong_data = D('novel_chapter_add')
        ->field('chapter,title,content,novel_name,novel_id,novel_mini_img,order_set,add_time')
        ->where(array('novel_id' => $novel_id))
        ->select();
    if (empty($cong_data)) {
        $this->error('同步失败');
        return;
    }

    $zhu_data = D('novel_chapter')->addAll($cong_data);
    if (empty($zhu_data)) {
        $this->error('同步失败');
        return;
    }

    // The staging rows are no longer needed once the copy has succeeded.
    D('novel_chapter_add')->where(array('novel_id' => $novel_id))->delete();

    $this->success('恭喜您,已经抓取完毕');
}

/**
 * Return the text found between two markers, or null when either is missing.
 *
 * The end marker is searched for only after the start position, so an earlier
 * occurrence of it cannot produce a negative-length slice.
 *
 * @param string   $haystack    Text to search.
 * @param string   $startMarker Marker that precedes the wanted text.
 * @param string   $endMarker   Marker that follows the wanted text.
 * @param int|null $skipLength  Bytes to skip from the start marker's position;
 *                              defaults to the start marker's own length.
 *
 * @return string|null Extracted text, possibly empty, or null if not found.
 */
private function extractBetween($haystack, $startMarker, $endMarker, $skipLength = null)
{
    $marker_pos = strpos($haystack, $startMarker);
    if ($marker_pos === false) {
        return null;
    }

    $from = $marker_pos + ($skipLength === null ? strlen($startMarker) : $skipLength);
    // An offset past the end of the string makes strpos() fail (it throws on PHP 8).
    if ($from > strlen($haystack)) {
        return null;
    }

    $to = strpos($haystack, $endMarker, $from);
    if ($to === false) {
        return null;
    }

    return (string) substr($haystack, $from, $to - $from);
}

原创粉丝点击