微信小说的爬取
来源:互联网 发布:热力地图制作软件 编辑:程序博客网 时间:2024/05/22 14:00
需求:输入小说名字,然后到微信端去爬取所有小说的内容,保存到自己的数据库中
第一步:需要制作一个表单提交小说名字
第二步:后台将提交过来的小说名字接收过来拼接url地址,然后再获取小说的内容,通过正则匹配或者字符串截取就可以拿到想要的东西了,最后插入数据库
个人技术难点:取得数据之后,要求50条一批插入数据库,结果我不会,值得反思
解决办法:在循环外先声明一个空数组 $data 作为缓冲区;每抓取一章,就用一个临时数组保存该章对应数据表各字段的值,再把临时数组追加到 $data 中($data[] = $tmp_arr);每攒满50条就调用 addAll($data) 批量插入数据库并清空 $data,循环结束后再把剩余不足50条的数据插入一次
具体代码:
/**
 * Build request headers that spoof the client IP address.
 *
 * Two independent random addresses are generated, one for CLIENT-IP and one
 * for X-FORWARDED-FOR. Only the last octet (0-254) is randomised; the
 * prefixes are fixed. The original author's notes list 123.125.68.* and
 * 125.90.88.* as candidate ranges and say they were taken from Baidu's
 * addresses.
 *
 * @return array List of raw header lines: CLIENT-IP, X-FORWARDED-FOR and a
 *               browser-like Accept header, in that order.
 */
function fakeIp()
{
    $cip = '14.111.58.' . mt_rand(0, 254);
    $xip = '125.90.88.' . mt_rand(0, 254);

    return array(
        'CLIENT-IP:' . $cip,
        'X-FORWARDED-FOR:' . $xip,
        // The pasted source had a stray "搜索" inside this value, which made
        // the media-range list malformed; this is the intended header.
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    );
}
/**
 * Fetch a URL with cURL and hand back the response body.
 *
 * Every request carries a fixed Referer and a fixed User-Agent string
 * (a Windows WeChat / QQBrowser build) and asks for gzip/deflate encoding.
 * No connect or transfer timeout is configured here.
 *
 * @param string      $url      Address to request.
 * @param array|null  $header   Raw header lines to send; skipped when empty.
 * @param string|null $cookie   Cookie header value to send; skipped when empty.
 * @param int         $is_https When truthy, TLS peer and host verification are switched off.
 *
 * @return string|bool The value returned by curl_exec(): the body, or false on failure.
 */
function getUrlContent($url, $header = null, $cookie = null, $is_https = 0)
{
    $handle = curl_init();

    // Options that are applied to every request.
    $fixedOptions = array(
        CURLOPT_URL            => (string) $url,
        CURLOPT_REFERER        => "zsxs.shu22.cn",
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.493.400 QQBrowser/9.0.2524.400",
        CURLOPT_ENCODING       => "gzip, deflate",
    );
    foreach ($fixedOptions as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    // Options that depend on the arguments.
    if (!empty($header)) {
        curl_setopt($handle, CURLOPT_HTTPHEADER, $header);
    }
    if ($is_https) {
        curl_setopt($handle, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($handle, CURLOPT_SSL_VERIFYHOST, FALSE);
    }
    if (!empty($cookie)) {
        curl_setopt($handle, CURLOPT_COOKIE, $cookie);
    }

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
/**
 * Scrape one novel (metadata plus every chapter) and store it in the database.
 *
 * Reads from $_POST:
 *   - kw             novel title to search for
 *   - class_id       id of the novel's category
 *   - other_class_id id of the secondary category
 *
 * Flow: search the remote site for the title, read the book page (title,
 * author, word count, status, cover, description), insert the Novel row,
 * then download every chapter listed in the catalog. Chapters are written
 * to the staging table (novel_chapter_add) in batches of 50, copied to the
 * main table (novel_chapter), and finally removed from the staging table.
 *
 * Outcomes are reported through $this->error() / $this->success().
 * NOTE(review): the code relies on $this->error() ending the request
 * (nothing after an error call is guarded) - confirm against the framework.
 *
 * NOTE(review): a logged-in session cookie is hard-coded below; it belongs
 * in configuration, not in source control.
 */
public function getquanwen1()
{
    // Downloading chapter by chapter can take long; lift the time limit.
    set_time_limit(0);

    // Read the submitted values. The ids are cast to int because they are
    // used in WHERE clauses; the original interpolated raw POST data into SQL.
    $keywords = isset($_POST['kw']) && is_string($_POST['kw']) ? $_POST['kw'] : '';
    $class_id = isset($_POST['class_id']) && is_scalar($_POST['class_id'])
        ? (int) $_POST['class_id']
        : 0;
    $other_class_id = isset($_POST['other_class_id']) && is_scalar($_POST['other_class_id'])
        ? (int) $_POST['other_class_id']
        : 0;

    if (empty($class_id) || empty($other_class_id)) {
        $this->error('请选择小说分类');
    }
    // An empty title can never match a book; fail early instead of scraping.
    if ($keywords === '') {
        $this->error('您搜索的小说不存在');
    }

    // Look up both category names (array conditions keep values out of raw SQL).
    $class_model = M("NovelClass");
    $class_name_arr = $class_model->where(array('class_id' => $class_id))->select();
    $class_name = $class_name_arr[0]['class_name'];
    $other_class_arr = $class_model->where(array('class_id' => $other_class_id))->select();
    $other_class_name = $other_class_arr[0]['class_name'];

    // Search the remote site. The title is user input, so it must be URL-encoded.
    $url = 'http://liweih5.ikanshu.cn/searchbook.aspx?key=' . urlencode($keywords) . '&type=1';
    $header = fakeIp();
    // Session cookie of a logged-in account on the remote site.
    $cookie = "ikanshuuser=userid=141760945&username=170619092244708337966005&channelid=3725&sessionid=hxkb4mcm4tdpopcvdpi41v4g&checkcode=27ad789c2e14a1286c1b28282876e5aa&version=1; Hm_lpvt_d3a69bde0c513dc29f82d2f573aaceff=1497836439; Hm_lvt_d3a69bde0c513dc29f82d2f573aaceff=1497835352; ASP.NET_SessionId=hzppzb5m0ejj01dumpeqdvt2; ikscnid=rndid=170619092308513978&channelid=3603; ";
    $content = getUrlContent($url, $header, $cookie, 1);

    // The book id sits between '<a href="/book/' and '.html" class="zw_box'.
    $start = strpos($content, '<a href="/book/') + strlen('<a href="/book/');
    $end = strpos($content, '.html" class="zw_box');
    $book_id = substr($content, $start, $end - $start);

    // Book detail page, and the catalog address recorded as the source link.
    $book_url = 'http://liweih5.ikanshu.cn/book/' . $book_id . '.html';
    $web_from = 'http://liweih5.ikanshu.cn/bookcatalog/' . $book_id . '.html';
    $book_content = getUrlContent($book_url, $header, $cookie, 1);

    // Title: text of the first <h3>.
    $name_start = strpos($book_content, '<h3>') + strlen('<h3>');
    $name_end = strpos($book_content, '</h3>');
    $novel_name = substr($book_content, $name_start, $name_end - $name_start);

    // Only accept a hit whose title equals what was searched for.
    if ($keywords != $novel_name) {
        $this->error('您搜索的小说不存在');
    }

    // Author: link text of the authorinfo.aspx anchor (capture group 2).
    preg_match('|<a href="\/authorinfo.aspx(.*?)>(.*?)<\/a>|i', $book_content, $novel_author);

    // Word count: text between the word-count label and ' 万字'.
    $zishu_start = strpos($book_content, '<p>字数') + strlen('<p>字数:');
    $zishu_end = strpos($book_content, ' 万字');
    $novel_zishu = substr($book_content, $zishu_start, $zishu_end - $zishu_start);

    // Completion flag: the 6 bytes after the status label, compared to '完本'.
    $end_start = strpos($book_content, '<p>状态') + strlen('<p>状态:');
    $is_end = substr($book_content, $end_start, 6);
    if ($is_end == '完本') {
        $is_end = 1;
    } else {
        $is_end = 0;
    }

    // Cover image: first <img> whose src ends in .jpg, saved through getWebImg().
    preg_match('|<img src="(.*?)\.jpg|i', $book_content, $novel_img);
    $img_url = $novel_img[1] . '.jpg';
    $img = getWebImg($img_url);
    $novel_base["img"] = $img["save_path"] . $img["file_name"];
    $novel_base["mini_img"] = $img["save_path"] . $img["file_name"];

    // Description: content of the uiMoreIntro <div>.
    $novel_description_start = strpos($book_content, '<div class="subject-intro" id="uiMoreIntro">')
        + strlen('<div class="subject-intro" id="uiMoreIntro">');
    $novel_description_end = strpos($book_content, '</div>', $novel_description_start);
    $novel_description = substr(
        $book_content,
        $novel_description_start,
        $novel_description_end - $novel_description_start
    );

    // Assemble the Novel row.
    $novel_base['name'] = $keywords;
    $novel_base['description'] = isset($novel_description) ? $novel_description : ' ';
    $novel_base['class_id'] = $class_id;
    $novel_base['class_name'] = $class_name;
    $novel_base['other_class_name'] = $other_class_name;
    $novel_base['other_class_id'] = $other_class_id;
    $novel_base['publish_time'] = date("Y-m-d H:i:s");
    $novel_base['word_count'] = $novel_zishu . '万字';
    $novel_base['is_end'] = $is_end;
    $novel_base['writer'] = $novel_author[2];
    $novel_base['from'] = '全本小说';
    $novel_base['web_from'] = $web_from;
    $novel_base['is_shelf'] = 0;

    // Refuse duplicates, then insert and keep the new id for the chapter rows.
    $novel_model = M('Novel');
    if ($novel_model->getByName($keywords) != null) {
        $this->error('您搜索的小说已经存在');
    }
    $novel_id = $novel_model->data($novel_base)->add();

    // Fetch the catalog page and collect every chapter link on it.
    preg_match('|<a href="\/bookcatalog\/(.*?).html">|i', $book_content, $catelog);
    $catalog_url = 'http://liweih5.ikanshu.cn/bookcatalog/' . $catelog[1] . '.html';
    $catelog_content = getUrlContent($catalog_url, $header, $cookie, 1);
    preg_match_all('/\/book\/\d+\/\d+\.html/', $catelog_content, $lianjia);

    // Rows waiting to be written to the staging table.
    $data = array();

    foreach ($lianjia[0] as $item) {
        // Link shape is /book/<book>/<chapter>.html; group 3 is the chapter number.
        preg_match('|\/(.*?)\/(.*?)\/(.*?)\.|i', $item, $zhangjieshu);

        $novel_address = 'http://liweih5.ikanshu.cn' . $item;
        $chapter_content = getUrlContent($novel_address, $header, $cookie, 1);

        // The remote account ran out of balance: undo everything stored so far.
        preg_match('|您的账户余额不足,请进入充值或选择下面的快捷充值|i', $chapter_content, $nomoney);
        if (!empty($nomoney)) {
            $novel_model->where('id=' . $novel_id)->delete();
            D('novel_chapter_add')->where('novel_id=' . $novel_id)->delete();
            $this->error('您的余额不足了,请及时充值');
        }

        // Chapter heading: content of the lbChapterName <div>.
        $title_start = strpos($chapter_content, '<div class="r-chaptername" id="lbChapterName">')
            + strlen('<div class="r-chaptername" id="lbChapterName">');
        $title_end = strpos($chapter_content, '</div>', $title_start);
        $title_count = substr($chapter_content, $title_start, $title_end - $title_start);

        // When the heading contains '章 ', keep the word after the first space.
        if (strpos($title_count, '章 ')) {
            $title_count_arr = explode(' ', $title_count);
            $title = $title_count_arr[1];
        } else {
            $title = $title_count;
        }
        // Headings containing "hapter" (e.g. "Chapter 12") are blanked out.
        if (strpos($title, 'hapter')) {
            $title = ' ';
        }

        // Chapter text: content of the uiContent <div>, <br/> turned into paragraphs.
        $novel_content_start = strpos($chapter_content, '<div class="r-content" id="uiContent">')
            + strlen('<div class="r-content" id="uiContent">');
        $novel_content_end = strpos($chapter_content, '</div>', $novel_content_start);
        $novel_content = substr(
            $chapter_content,
            $novel_content_start,
            $novel_content_end - $novel_content_start
        );
        $novel_content = '<p>' . str_replace('<br/>', '</p><p>', $novel_content);

        // One staging row per chapter.
        $tmp_arr = array();
        $tmp_arr['title'] = isset($title) ? $title : '';
        $tmp_arr['chapter'] = isset($zhangjieshu[3]) ? '第' . $zhangjieshu[3] . '章' : '';
        $tmp_arr['content'] = $novel_content;
        $tmp_arr['order_set'] = $zhangjieshu[3];
        $tmp_arr["add_time"] = date("Y-m-d H:i:s");
        $tmp_arr['novel_name'] = $keywords;
        $tmp_arr['novel_id'] = $novel_id;
        $tmp_arr['novel_mini_img'] = $novel_base["img"];
        $data[] = $tmp_arr;

        // Flush every 50 rows. Counting the buffer itself avoids the original
        // off-by-one, where the counter restarted at 1 and later batches held 49.
        if (count($data) >= 50) {
            D('novel_chapter_add')->addAll($data);
            $data = array();
        }
    }

    // Write whatever is left over from the last, partial batch.
    if (!empty($data)) {
        D("novel_chapter_add")->addAll($data);
    }

    // Copy this novel's rows from the staging table into the main table.
    $cong_data = D('novel_chapter_add')
        ->field('chapter,title,content,novel_name,novel_id,novel_mini_img,order_set,add_time')
        ->where('novel_id=' . $novel_id)
        ->select();
    $zhu_data = D('novel_chapter')->addAll($cong_data);
    if (empty($zhu_data)) {
        $this->error('同步失败');
    }

    // Staging rows are no longer needed once the copy succeeded.
    D('novel_chapter_add')->where('novel_id=' . $novel_id)->delete();

    $this->success('恭喜您,已经抓取完毕');
}
阅读全文
0 0
- 微信小说的爬取
- Python爬虫:爬取小说花千骨--转自微信
- Python爬取小说
- 爬取起点小说
- python爬取小说
- requests爬取小说
- Python 爬取小说(诛仙)
- Python3爬取小说《昆仑》
- 练:python爬取小说
- scrapy实例 ----- 爬取小说
- python爬取某一小说
- 使用Scrapy爬取顶点小说整个网站的小说,入库Mysql!
- 利用Python爬取妙笔阁小说网的《我是至尊》小说
- python3.5爬虫完成笔趣阁小说的爬取
- Scala语言编写的爬虫应用-爬取一部小说
- python爬取的小说存入mysql数据库
- 基于JSoup的网络爬虫爬取小说内容
- 爬取小说网站小说信息以及内容
- MSSQL:vb数据查询
- iOS 多线程下NSOperation、NSBlockOperation、NSInvocationOperation、NSOperationQueue的使用
- [Linux]运输层的端口
- nm命令的-C参数
- c# 经验谈:巧用Expression表达式 解决类似于sql中 select in 的查询
- 微信小说的爬取
- 微擎原生上传文件
- nginx服务器有什么作用以及什么是反向代理?
- 正态分布的最大似然估计
- SVN几个常用的命令(windows)
- iOS 文件目录及NSUserDefaults
- Entity Framework 动态构造select表达式
- 安卓开发进阶之文字上标(翻译)
- 如何快速转载CSDN中的博客