PHPCrawler抓取酷狗精选集歌单
来源:互联网 发布:fastadmin cms插件 编辑:程序博客网 时间:2024/05/16 07:25
一、PHPCrawler的介绍与安装
先了解一下什么是抓取?
抓取就是网络爬虫,也就是人们常说的网络蜘蛛(spider)。是搜索引擎的一个重要组成部分,按照一定的逻辑和算法抓取和下载互联网上的信息和网页。一般的爬虫从一个start url开始,按照一定的策略开始爬取,把爬取到的新的url放入爬取队列中,然后进行新一轮的爬取,直到抓取完毕为止。
PHPCrawler是一个国外开源的爬虫系统,它的源码托管在SourceForge上,下载地址:http://sourceforge.net/projects/phpcrawl/ 。请根据自己电脑里安装的PHP版本选择合适的版本下载。下载完毕之后,解压到服务器网站根目录下,复制一份example.php文件并重命名,之后的修改都在这个副本上进行。
二、完整源码
<?php
// It may take a while to crawl a site ...
set_time_limit(10000);

// Include the phpcrawl main class
include("libs/PHPCrawler.class.php");

/**
 * Crawler that walks the Kugou "special" (curated songlist) index pages,
 * parses every songlist detail page it encounters and stores the result
 * in the `songlist` / `song` MySQL tables.
 *
 * Extends PHPCrawler and overrides handleDocumentInfo(), which the crawler
 * invokes for each requested document.
 */
class MyCrawler extends PHPCrawler
{
    /**
     * Called for every document the crawler requested.
     *
     * Prints a short status report for the document and, if its URL is a
     * songlist detail page, hands it over to parseSonglistDetail().
     *
     * @param object $DocInfo Document info object supplied by PHPCrawler
     *                        (url, http_status_code, referer_url, received,
     *                        bytes_received and content are read here).
     */
    function handleDocumentInfo($DocInfo)
    {
        // Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br />").
        $lb = (PHP_SAPI == "cli") ? "\n" : "<br />";

        // Print the URL and the HTTP status code
        echo "Page requested: ".$DocInfo->url." (".$DocInfo->http_status_code.")".$lb;

        // Print the referring URL
        echo "Referer-page: ".$DocInfo->referer_url.$lb;

        // Print whether the content of the document was received or not
        if ($DocInfo->received == true) {
            echo "Content received: ".$DocInfo->bytes_received." bytes".$lb;
        } else {
            echo "Content not received".$lb;
        }

        // Only songlist detail pages carry data worth parsing.
        $detailPattern = '/http:\/\/www\.kugou\.com\/yy\/special\/single\/\d+\.html/';
        if (preg_match($detailPattern, $DocInfo->url) > 0) {
            $this->parseSonglistDetail($DocInfo);
        }

        flush();
    }

    /**
     * Extracts title, creator, date, description and song titles from a
     * songlist detail page, prints the result and persists it.
     *
     * Fields that cannot be found are stored as empty strings (or an empty
     * song array) and an "error:get ... fail" line is printed.
     *
     * @param object $DocInfo Document info object; url and content are read.
     */
    public function parseSonglistDetail($DocInfo)
    {
        $content = $DocInfo->content;

        $songlistArr = array();
        $songlistArr['raw_url'] = $DocInfo->url;

        // key => array(label in the page, terminating markup, may be empty)
        // Note: the value stored as create_date is taken from the
        // "更新时间" (update time) label of the page.
        $fields = array(
            'title'       => array('名称', '<br />', false),
            'creator'     => array('创建人', '<br />', false),
            'create_date' => array('更新时间', '<br />', false),
            'info'        => array('简介', '</p>', true),
        );

        foreach ($fields as $key => $spec) {
            $value = $this->extractField($content, $spec[0], $spec[1], $spec[2]);
            if ($value === null) {
                $songlistArr[$key] = "";
                print "error:get ".$key." fail<br/>";
            } else {
                $songlistArr[$key] = $value;
            }
        }

        // Songs: every anchor with a title attribute followed by hidefocus.
        // Always define 'songs' so saveSonglist() can iterate it safely.
        $songlistArr['songs'] = array();
        $matches = array();
        $found = preg_match_all('/<a title="([^"]+)" hidefocus="/', $content, $matches);
        if ($found > 0) {
            foreach ($matches[1] as $songTitle) {
                $songlistArr['songs'][] = array('title' => $songTitle);
            }
        } else {
            print "error:get song fail<br/>";
        }

        // The dump contains text taken from the crawled page, so escape it
        // when the output is rendered as HTML.
        $dump = print_r($songlistArr, true);
        echo "<pre>";
        echo (PHP_SAPI == "cli") ? $dump : htmlspecialchars($dump, ENT_QUOTES, 'UTF-8');
        echo "</pre>";

        $this->saveSonglist($songlistArr);
    }

    /**
     * Returns the text between "<span>LABEL:</span>" and $terminator.
     *
     * A lazy match is used so the value may contain any character; the
     * previous negated character class rejected values containing letters
     * such as "b", "r" or "p".
     *
     * @param string $content    Page source to search.
     * @param string $label      Label text inside the <span>, without colon.
     * @param string $terminator Literal markup that ends the value.
     * @param bool   $allowEmpty Whether an empty value counts as a match.
     *
     * @return string|null The captured value, or null when not found.
     */
    private function extractField($content, $label, $terminator, $allowEmpty = false)
    {
        // NOTE(review): both the ASCII and the full-width colon are accepted
        // because the exact character used by the live markup could not be
        // confirmed from here.
        $pattern = '/<span>'.preg_quote($label, '/').'(?::|:)<\/span>('
            .($allowEmpty ? '.*?' : '.+?').')'
            .preg_quote($terminator, '/').'/s';

        $m = array();
        if (preg_match($pattern, $content, $m) === 1) {
            return $m[1];
        }
        return null;
    }

    /**
     * Stores one songlist and its songs in the database.
     *
     * Uses mysqli prepared statements (the mysql_* functions were removed in
     * PHP 7.0 and the values originate from crawled, untrusted pages).
     * Database failures are reported and swallowed so that a single failing
     * page does not abort the crawl.
     *
     * @param array $songlistArr Keys read: title, create_date, creator,
     *                           raw_url, info, songs (list of arrays with a
     *                           'title' key).
     */
    public function saveSonglist($songlistArr)
    {
        // Make mysqli throw on errors regardless of the PHP version default.
        mysqli_report(MYSQLI_REPORT_ERROR | MYSQLI_REPORT_STRICT);

        $title      = isset($songlistArr['title']) ? $songlistArr['title'] : "";
        $createTime = isset($songlistArr['create_date']) ? $songlistArr['create_date'] : "";
        $creator    = isset($songlistArr['creator']) ? $songlistArr['creator'] : "";
        $rawUrl     = isset($songlistArr['raw_url']) ? $songlistArr['raw_url'] : "";
        $info       = isset($songlistArr['info']) ? $songlistArr['info'] : "";
        $songs      = (isset($songlistArr['songs']) && is_array($songlistArr['songs']))
            ? $songlistArr['songs']
            : array();

        $conn = null;
        try {
            // Connect to the database
            $conn = new mysqli("localhost", "root", "root", "songlist");
            $conn->set_charset("utf8");

            // NOTE(review): the column name "creat_time" is kept exactly as in
            // the original statement; confirm it against the table schema.
            $stmt = $conn->prepare(
                "insert into songlist set title=?, creat_time=?, creator=?, raw_url=?, info=?"
            );
            $stmt->bind_param("sssss", $title, $createTime, $creator, $rawUrl, $info);
            $stmt->execute();
            $songlistId = $conn->insert_id;
            $stmt->close();

            if (count($songs) > 0) {
                $songTitle = "";
                $songStmt = $conn->prepare("insert into song set title=?, songlist_id=?");
                // bind_param binds by reference, so updating $songTitle in the
                // loop is enough before each execute().
                $songStmt->bind_param("si", $songTitle, $songlistId);
                foreach ($songs as $song) {
                    $songTitle = $song['title'];
                    $songStmt->execute();
                }
                $songStmt->close();
            }
        } catch (mysqli_sql_exception $e) {
            print "error:save songlist fail: ".$e->getMessage()."<br/>";
        }

        if ($conn !== null) {
            $conn->close();
        }
    }
}

// Now, create an instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.

// Create the crawler
$crawler = new MyCrawler();

// URL to crawl (the start page)
$start_url = "www.kugou.com/yy/special/index/1-0-2.html";
$crawler->setURL($start_url);

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Follow the link of every single songlist ("i" = case-insensitive)
$crawler->addURLFollowRule('#http://www\.kugou\.com/yy/special/single/\d+\.html# i');

// Follow the pagination links of the songlist index ("next page")
$crawler->addURLFollowRule('#http://www\.kugou\.com/yy/special/index/\d+-0-2\.html# i');

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule('#\.(jpg|jpeg|gif|png)$# i');

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site); 0 means unlimited
$crawler->setTrafficLimit(1000 * 1024);

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();

$lb = (PHP_SAPI == "cli") ? "\n" : "<br />";

echo "Summary:".$lb;
echo "Links followed: ".$report->links_followed.$lb;
echo "Documents received: ".$report->files_received.$lb;
echo "Bytes received: ".$report->bytes_received." bytes".$lb;
echo "Process runtime: ".$report->process_runtime." sec".$lb;
?>
3 0
- PHPCrawler抓取酷狗精选集歌单
- 抓取搜狗图片
- 抓取
- 抓取
- [插件精选] 炫酷粒子特效
- [技术] [插件精选] 炫酷粒子特效(下)
- 搜狗抓取雅虎邮箱的个人信息
- 伍佰----精选!
- 网络服务精选
- xml精选
- 精选网站
- 视频精选
- 语句精选
- CSS--精选
- 电影精选***
- 段子精选
- 精选博客
- 配色精选
- Linux 下使用 Eclipse 调试 apache 模块
- 三.文件备份和压缩命令
- Grep 和正则表达式
- codeforces 650E
- spark内核架构
- PHPCrawler抓取酷狗精选集歌单
- 设计模式(11)--命令模式
- 七步从Angular.JS菜鸟到专家(2):Scopes
- 关于bootstrap中提示框插件modal无法显示bug
- Back键和Home键的屏蔽
- EXISTS --in
- iOS使用CAShapeLayer实现复杂的View遮罩效果
- Python使用taskkill 报错 系统找不到指定的路径
- Android开发 第三方库