爬虫豆瓣 -- v1.0

来源:互联网 发布:淘宝个性签名大全 编辑:程序博客网 时间:2024/04/29 20:40

     尚书有云:尔不与吾,吾自索之,今之宇内,物属我众,今吾提笔,享与众人!

     In this article, I want to illustrate how to crawl the Douban website. If you can't understand some of the details, just leave a message.

     今天我想谈谈如何爬取豆瓣的资源,首先声明,本来我不想爬取豆瓣,这样对豆瓣网站并不好,但是由于其提供的API有诸多限制,严重影响了我的想象力,所以我决定自己写程序提取它的数据。读者可以研读此文章和程序,但万不可去挑战豆瓣。

     近来需要开发一个视频网站系统,既是此类网站,那么就需要大量的视频资源数据,但是一般这类数据是不会有人双手送与你,所以你得自己准备好所有数据。今日在此文中,吾只谈数据资源的准备,不谈如何开发网站系统!有违令者,发配戍边!

       其实豆瓣向开发者提供了API接口,通过此API,开发者可以提取到有关电影视频的一些元信息,但是作为普通第三方开发者,此API返回的数据实在有限,不能满足一般需求。那么还有另一条路可走吗?当属爬虫了!爬虫其实就是自己写程序,此程序会遍历豆瓣的视频资源页面,然后从中提取你所需要的字段。众所周知的Google搜索引擎中,一个重要部分就是其Google-Spider,即谷歌蜘蛛,此蜘蛛时刻在互联网上行走遍历,每到一个页面或者网页,就提取一些关键性信息,谷歌搜索引擎就是根据这些信息来提供检索功能的。

     爬虫设计:此爬虫使用PHP编程语言,在Linux的LAMP环境下实现,其实PHP是跨平台的,读者可原封不动地将代码在Windows上运行。此爬虫主要是由类Videometadata实现,Videometadata类通过使用HTML的DOM(Document Object Model)和正则表达式来具体实现信息的提取。具体代码如下:

<?php
/*
 * Author: Archer
 * Date: 20/Jun/2013
 * File: videometadata.php
 * Des: this file is responsible for getting the video meta data from the
 *      video info page in the douban website.
 *
 * Note: the first version relied on the html dom alone, which was not
 *       robust, so most getters now use regular expressions on the markup
 *       of a dom node instead. -- 20/Jun/2013
 *
 * Produced By CSRG.
 */
require_once ('simple_html_dom.php');
require_once ('netlib.php');

class Videometadata {
    /*
     * Des: raw html of the video page as returned by get_data(); every
     *      getter treats a falsy value here as "page not available (404)".
     */
    public $html = FALSE;

    /*
     * Des: simple_html_dom document parsed from $html, or FALSE when the
     *      page could not be fetched or parsed.
     */
    public $htmldom = '';

    /*
     * Des: douban id of the video, taken from the url in the constructor.
     */
    public $doubanid = 0;

    /*
     * Des: fetch the page behind $url, parse it and remember the douban id.
     * @parm: string url, e.g. http://movie.douban.com/subject/123393/
     * Note: get_data() lives in netlib.php (not shown here); it is assumed
     *       to return the page body, or a falsy value on failure.
     */
    public function __construct ($url) {
        $this->html = get_data($url);

        // only parse when there is something to parse; str_get_html() itself
        // returns false for empty or oversized input
        $this->htmldom = ($this->html != FALSE) ? str_get_html($this->html) : FALSE;

        // for http://host/subject/<id>/ the id is the 5th '/'-separated part
        $parts = explode('/', $url);
        if (isset($parts[4])) {
            $this->doubanid = $parts[4];
        }
    }

    /*
     * Des: look up an element by id without assuming the document parsed.
     * @parm: string id
     * @return: the dom node, or NULL when there is no document or no match
     */
    private function elementById ($id) {
        if ( !is_object($this->htmldom)) {
            return NULL;
        }
        return $this->htmldom->getElementById($id);
    }

    /*
     * Des: run a css selector without assuming the document parsed.
     * @parm: string selector, int idx (position of the wanted match)
     * @return: the dom node, or NULL when there is no document or no match
     */
    private function findNode ($selector, $idx) {
        if ( !is_object($this->htmldom)) {
            return NULL;
        }
        return $this->htmldom->find($selector, $idx);
    }

    /*
     * Des: match $pattern against the element with the given id and return
     *      the first capture group.
     * @parm: string id, string pattern, bool asPlaintext (TRUE: match the
     *        text content, FALSE: match the html markup of the node)
     * @return: string of the first capture group, FALSE on any failure
     */
    private function matchInElement ($id, $pattern, $asPlaintext) {
        if ($this->html == FALSE) {
            // 404
            return FALSE;
        }
        $node = $this->elementById($id);
        if ( !is_object($node)) {
            return FALSE;
        }
        $subject = $asPlaintext ? $node->plaintext : $node->outertext;
        if (preg_match($pattern, $subject, $matches)) {
            return $matches[1];
        }
        return FALSE;
    }

    /*
     * Des: values such as countries/languages/aka are bare text between a
     *      label span and the next <br/>, so they are cut out of the markup
     *      of #info with a regular expression.
     * @parm: string pattern whose first group starts right after the label
     * @return: string up to the first <br/>, FALSE on any failure
     */
    private function infoTextBeforeBreak ($pattern) {
        $captured = $this->matchInElement('info', $pattern, FALSE);
        if ($captured === FALSE) {
            return FALSE;
        }
        // the greedy group may run over several fields; keep the first one
        $pieces = explode("<br/>", $captured, 2);
        return $pieces[0];
    }

    /*
     * Des: collect the links found in one child block of #info.
     * @parm: int childIndex (position of the block inside #info),
     *        string nameKey (key used for the link text in each entry)
     * @return: array of array(nameKey => text, 'link' => href), FALSE when
     *          the page or the block is missing
     */
    private function collectInfoLinks ($childIndex, $nameKey) {
        if ($this->html == FALSE) {
            // 404
            return FALSE;
        }
        $info = $this->elementById('info');
        if ( !is_object($info)) {
            return FALSE;
        }
        // sometimes old movies don't have much meta data, so the block
        // may not exist at all
        $block = $info->childNodes($childIndex);
        if ( !is_object($block)) {
            return FALSE;
        }
        $entries = array();
        foreach ($block->find('a') as $anchor) {
            $entries[] = array(
                $nameKey => $anchor->plaintext,
                'link' => $anchor->href,
            );
        }
        return $entries;
    }

    /*
     * Des: walk the siblings starting at the first span of #info until one
     *      starts with $label, then return the text of the node after it.
     * @parm: string label, int labelBytes (number of leading bytes of the
     *        label to compare; strncmp is byte-wise)
     * @return: string of the value, FALSE when it can not be found
     */
    private function infoValueAfterLabel ($label, $labelBytes) {
        if ($this->html == FALSE) {
            // 404
            return FALSE;
        }
        $info = $this->elementById('info');
        if ( !is_object($info)) {
            return FALSE;
        }
        $node = $info->getElementsByTagName('span', 0);
        // at most as many candidates as there are spans inside #info
        $remaining = count($info->getElementsByTagName('span'));

        while (is_object($node) && $remaining > 0) {
            // nodes that contain a <br> are skipped and do not count as a
            // candidate
            if ($node->getElementByTagName('br')) {
                $node = $node->next_sibling();
                continue;
            }
            // the dom method appends a blank space sometimes, so only the
            // leading bytes of the label are compared
            if (strncmp($node->plaintext, $label, $labelBytes) == 0) {
                $value = $node->next_sibling();
                return is_object($value) ? $value->plaintext : FALSE;
            }
            $node = $node->next_sibling();
            $remaining--;
        }
        return FALSE;
    }

    /*
     * Des: shared implementation of getRecommendId / getRecommendIda.
     * @parm: bool verbose (TRUE: echo a diagnostic line on failure)
     * @return: array of recommended douban ids, FALSE when none were found
     */
    private function collectRecommendIds ($verbose) {
        if ($this->html == FALSE) {
            // 404
            if ($verbose) {
                echo "Videometadata::getRecommendIda 404\n";
            }
            return FALSE;
        }
        $section = $this->elementById('recommendations');
        if ( !is_object($section)) {
            if ($verbose) {
                echo "Videometadata::getRecommendIda Not Object\n";
            }
            return FALSE;
        }
        $pattern = '/<dd>[ ]*<a href="(.*)\?from=subject-page" class="">/';
        $ids = array();
        foreach ($section->find('dd') as $dd) {
            if (preg_match($pattern, $dd->outertext, $matches)) {
                // same url layout as in the constructor: id is part 5
                $parts = explode('/', $matches[1]);
                if (isset($parts[4])) {
                    $ids[] = $parts[4];
                }
            }
        }
        // an empty list is reported as FALSE
        return count($ids) > 0 ? $ids : FALSE;
    }

    /*
     * Des: shared implementation of getReviewsBody / getReviews.
     * @parm: bool verbose (TRUE: echo a diagnostic line on failure)
     * @return: array of array('title' => ..., 'body' => ...), FALSE when
     *          no review was found
     */
    private function collectReviews ($verbose) {
        if ($this->html == FALSE) {
            // 404
            if ($verbose) {
                echo "Videometadata::getReviews 404\n";
            }
            return FALSE;
        }
        $section = $this->elementById('review_section');
        if ( !is_object($section)) {
            if ($verbose) {
                echo "Videometadata::getReviews Not Object\n";
            }
            return FALSE;
        }
        $reviews = array();
        $index = 0;
        foreach ($section->find('div') as $div) {
            if ($div->hasAttribute('class') && $div->getAttribute('class') == 'review-short') {
                // the n-th review body is paired with the n-th <h3> title
                $body = $div->firstChild();
                $reviews[$index] = array(
                    'title' => $this->getReviewTitle($index),
                    'body' => is_object($body) ? $body->plaintext : FALSE,
                );
                $index++;
            }
        }
        return $index > 0 ? $reviews : FALSE;
    }

    /*
     * Des: method getPoster is used to get the main poster from the html dom.
     * @parm: none
     * @return: url of the poster link, FALSE on failure
     * Note: there is a separate routine for all related photos, but it is
     *       not always reliable, hence this dedicated getter.
     */
    public function getPoster () {
        // regular expression on the markup, the html dom alone is not
        // reliable here
        $pattern = '/<img src="(.*)"[ ]*title=".*"[ ]*alt=".*"[ ]*rel=".*"[ ]*\/>/';
        return $this->matchInElement('mainpic', $pattern, FALSE);
    }

    /*
     * Des: method getTitle is used to get the title from the html dom.
     * @parm: none
     * @return: title string, FALSE on failure
     */
    public function getTitle () {
        if ($this->html == FALSE) {
            // 404
            return FALSE;
        }
        $heading = $this->findNode('h1', 0);
        if ( !is_object($heading)) {
            return FALSE;
        }
        return $heading->plaintext;
    }

    /*
     * Des: method getDirectors is used to get the directors from the html dom.
     * @parm: none
     * @return: array of the directors ('director') and their links ('link'),
     *          FALSE on failure
     */
    public function getDirectors () {
        return $this->collectInfoLinks(0, 'director');
    }

    /*
     * Des: method getWriters is used to get the writers of the video from the
     *      html dom.
     * @parm: none
     * @return: array of the writers ('writer') and their links ('link'),
     *          FALSE on failure
     */
    public function getWriters () {
        return $this->collectInfoLinks(2, 'writer');
    }

    /*
     * Des: method getActors is used to get the actors from the video html dom.
     * @parm: none
     * @return: array of the actors ('actor') and their links ('link'),
     *          FALSE on failure
     */
    public function getActors () {
        return $this->collectInfoLinks(4, 'actor');
    }

    /*
     * Des: method getGenres is used to get the genres from the video html dom.
     * @parm: none
     * @return: array of array('genre' => text), FALSE on failure
     */
    public function getGenres () {
        if ($this->html == FALSE) {
            // 404
            return FALSE;
        }
        $info = $this->elementById('info');
        if ( !is_object($info)) {
            return FALSE;
        }
        $node = $info->childNodes(6);
        if ( !is_object($node)) {
            return FALSE;
        }
        $genres = array();
        // generally speaking, there are at most 10 genres; stop early when
        // the sibling chain ends
        for ($i = 0; $i < 10 && is_object($node); $i++) {
            if ($node->getAttribute('property') == "v:genre") {
                $genres[] = array('genre' => $node->plaintext);
            }
            $node = $node->next_sibling();
        }
        return $genres;
    }

    /*
     * Des: method getGenresa is used to get the genres from the video html dom.
     * @parm: none
     * @return: array of the genre strings, FALSE on failure
     * Note: method getGenres is not robust by using the html dom, so this
     *       one uses a regular expression to do the dirty work.
     */
    public function getGenresa () {
        if ($this->html == FALSE) {
            // 404
            return FALSE;
        }
        $info = $this->elementById('info');
        if ( !is_object($info)) {
            return FALSE;
        }
        // the following pattern is used to match utf-8 chinese characters
        $pattern = '/<span property="v:genre">([\x{4e00}-\x{9fa5}]+)<\/span>/u';
        if (preg_match_all($pattern, $info->outertext, $matches)) {
            return $matches[1];
        }
        return FALSE;
    }

    /*
     * Des: method getOfficialLink is used to get the official link from the
     *      video html dom.
     * @parm: none
     * @return: string of the link, FALSE on failure
     * Note: uses a regular expression on the text of #info.
     */
    public function getOfficialLink () {
        return $this->matchInElement('info', '/官方网站:[ ]*(.*)/', TRUE);
    }

    /*
     * Des: method getCountries is used to get the countries from the video
     *      html dom.
     * @parm: none
     * @return: string of the countries, FALSE on failure
     * Note: countries are not included in any tag, so the html dom can not
     *       solve this problem; a regular expression is used instead.
     */
    public function getCountries () {
        return $this->infoTextBeforeBreak('/<span class="pl">制片国家\/地区:<\/span> (.*)<br\/>/');
    }

    /*
     * Des: method getLanguages is used to get the languages from the video
     *      html dom.
     * @parm: none
     * @return: string of the languages, FALSE on failure
     * Note: languages are not included in any tag, so the html dom can not
     *       solve this problem; a regular expression is used instead.
     */
    public function getLanguages () {
        return $this->infoTextBeforeBreak('/<span class="pl">语言:<\/span> (.*)<br\/>/');
    }

    /*
     * Des: method getPubDate is used to get the publish date from the video
     *      html dom.
     * @parm: none
     * @return: string of the pub date, FALSE on failure
     * Note: weird! not working on some tv series.
     */
    public function getPubDate () {
        // 12 = number of leading bytes compared (presumably the four
        // 3-byte utf-8 characters of the label, without ": ")
        return $this->infoValueAfterLabel("上映日期: ", 12);
    }

    /*
     * Des: method getPubDatea is used to get the pub date from the video
     *      html dom.
     * @parm: none
     * @return: string of the pub date, FALSE on failure
     * Note: method getPubDate is not robust by using the html dom, so this
     *       one uses a regular expression to do the dirty work.
     */
    public function getPubDatea () {
        $pattern = '/[ ]*<span property="v:initialReleaseDate" content=".*">([0-9]{4}-[0-9]{2}-[0-9]+\(.*\))<\/span><br\/>/';
        return $this->matchInElement('info', $pattern, FALSE);
    }

    /*
     * Des: method getLength is used to get the video length from the video
     *      html dom.
     * @parm: none
     * @return: string of the video length, FALSE on failure
     * Note: weird! not working on some tv series.
     */
    public function getLength () {
        // 6 = number of leading bytes compared (presumably the two 3-byte
        // utf-8 characters of the label, without ": ")
        return $this->infoValueAfterLabel("片长: ", 6);
    }

    /*
     * Des: method getLengtha is used to get the length from the video html dom.
     * @parm: none
     * @return: string of the length, FALSE on failure
     * Note: method getLength is not robust by using the html dom, so this
     *       one uses a regular expression to do the dirty work.
     */
    public function getLengtha () {
        $pattern = '/[ ]*<span property="v:runtime" content=".*">(.*)<\/span><br\/>/';
        return $this->matchInElement('info', $pattern, FALSE);
    }

    /*
     * Des: method getAka is used to get the aka from the video html dom.
     * @parm: none
     * @return: string of the aka, FALSE on failure
     * Note: aka are not included in any tag, so the html dom can not solve
     *       this problem; a regular expression is used instead.
     */
    public function getAka () {
        return $this->infoTextBeforeBreak('/<span class="pl">又名:<\/span> (.*)<br\/>/');
    }

    /*
     * Des: method getImdb is used to get the imdb id from the video html dom.
     * @parm: none
     * @return: string of the imdb id, FALSE on failure
     */
    public function getImdb () {
        return $this->matchInElement('info', '/IMDb链接:[ ]*(.*)/', TRUE);
    }

    /*
     * Des: getDoubanId is used to get the douban video id from the url. this
     *      task is already done in the constructor, so just return the value.
     * @parm: none
     * @return: the string of the douban id
     */
    public function getDoubanId () {
        return $this->doubanid;
    }

    /*
     * DES: method getRating is used to get the douban rating data from the
     *      original html file by using the html dom and regular expression.
     * @parm: none
     * @return: string of the rating, FALSE on failure
     */
    public function getRating () {
        return $this->matchInElement('interest_sectl', '/[ ]*([0-9]\.[0-9])[ ]*/', TRUE);
    }

    /*
     * DES: method getViewNum is used to get the view number data from the
     *      original html file by using the html dom and regular expression.
     * @parm: none
     * @return: string of the view number, FALSE on failure
     */
    public function getViewNum () {
        // the number follows the rating inside an opening parenthesis
        return $this->matchInElement('interest_sectl', '/[ ]*[0-9]\.[0-9][ ]*\(([0-9]*)/', TRUE);
    }

    /*
     * DES: method getDescription is used to get the description data from the
     *      original html file by using the html dom.
     * @parm: none
     * @return: string of the description, FALSE on failure
     */
    public function getDescription () {
        if ($this->html == FALSE) {
            // 404
            return FALSE;
        }
        $report = $this->elementById('link-report');
        if ( !is_object($report)) {
            return FALSE;
        }
        return $report->plaintext;
    }

    /*
     * Des: method getTraillerLink is used to get the link of the trailers.
     * @parm: none
     * @return: string of the link, FALSE on failure
     */
    public function getTraillerLink () {
        if ($this->html == FALSE) {
            // 404
            return FALSE;
        }
        $heading = $this->findNode('h2', 1);
        if ( !is_object($heading)) {
            return FALSE;
        }
        $pattern = '/<a href=(.*)>预告片[0-9]*<\/a>/';
        if (preg_match($pattern, $heading->outertext, $matches)) {
            // the group still carries the quotes of the href attribute
            return str_replace("\"", "", $matches[1]);
        }
        return FALSE;
    }

    /*
     * Des: method getPhotosLink is used to get the link of the photos page.
     * @parm: none
     * @return: string of the link, FALSE on failure
     * Note: echoes a diagnostic line on every failure path.
     */
    public function getPhotosLink () {
        if ($this->html == FALSE) {
            // 404
            echo "Videometadata::getPhotosLink 404\n";
            return FALSE;
        }
        $heading = $this->findNode('h2', 1);
        if ( !is_object($heading)) {
            echo "Videometadata::getPhotosLink Not Object\n";
            return FALSE;
        }
        $markup = $heading->outertext;
        $pattern = '/<a href=.*>预告片[0-9]*<\/a>.*<a href=(.*)>图片[0-9]*<\/a>/';
        $pattern2 = '/<a href="(.*)">全部[0-9]*<\/a>/';
        if (preg_match($pattern, $markup, $matches) || preg_match($pattern2, $markup, $matches)) {
            return str_replace("\"", "", $matches[1]);
        }
        echo "Videometadata::getPhotosLink NO Match\n";
        return FALSE;
    }

    /*
     * DES: method getPhotoLinks is used to get the image url links from a
     *      photos page. this method is different from getPhotosLink.
     * @parm: string photourl
     * @return: array of the url links, FALSE on failure
     */
    public function getPhotoLinks ($photourl) {
        if ( !$photourl) {
            return FALSE;
        }
        // first get the html file
        $html = get_data($photourl);
        if ($html == FALSE) {
            // can not get the all_photos page
            return FALSE;
        }
        $pattern = '/<img src="(.*)">/';
        if (preg_match_all($pattern, $html, $matches, PREG_PATTERN_ORDER)) {
            return $matches[1];
        }
        return FALSE;
    }

    /*
     * DES: method getRecommendId is used to get the recommendation videos
     *      from douban.
     * @parm: none
     * @return: array containing the recommended video ids, FALSE on failure
     */
    public function getRecommendId () {
        return $this->collectRecommendIds(FALSE);
    }

    /*
     * DES: method getRecommendIda is used to get the recommendation videos
     *      from douban.
     * @parm: none
     * @return: array containing the recommended video ids, FALSE on failure
     * Note: same as getRecommendId, but echoes a diagnostic line when the
     *       page or the recommendations block is missing.
     */
    public function getRecommendIda () {
        return $this->collectRecommendIds(TRUE);
    }

    /*
     * DES: method getReviewTitle is used to get the review title of the video
     *      from douban.
     * @parm: int index (position of the review, default 0)
     * @return: string of the title, FALSE on failure
     */
    public function getReviewTitle ($index = 0) {
        if ($this->html == FALSE) {
            // 404
            return FALSE;
        }
        $section = $this->elementById('review_section');
        if ( !is_object($section)) {
            return FALSE;
        }
        $heading = $section->find('h3', $index);
        if ( !is_object($heading)) {
            return FALSE;
        }
        // the title is taken from the last child of the <h3>
        $last = $heading->lastChild();
        if ( !is_object($last)) {
            return FALSE;
        }
        return $last->innertext;
    }

    /*
     * DES: method getReviewsBody is used to get the reviews of the video from
     *      douban.
     * @parm: none
     * @return: array of array('title' => ..., 'body' => ...), FALSE on failure
     */
    public function getReviewsBody () {
        return $this->collectReviews(FALSE);
    }

    /*
     * DES: method getReviews is used to get the review info of the video from
     *      douban.
     * @parm: none
     * @return: array of array('title' => ..., 'body' => ...), FALSE on failure
     * Note: same as getReviewsBody, but echoes a diagnostic line when the
     *       page or the review block is missing.
     */
    public function getReviews () {
        return $this->collectReviews(TRUE);
    }

    /*
     * Des: release the parsed document. simple_html_dom documents hold
     *      circular references, and its manual recommends calling clear()
     *      when many pages are parsed in one run.
     */
    public function __destruct () {
        if (is_object($this->htmldom)) {
            $this->htmldom->clear();
        }
    }
}
?>

    上述PHP类的使用方法如下:

<?php
/**
 * DES: an example to illustrate how to use the videometadata class.
 *
 * Produced By CSRG.
 **/
require_once ('videometadata.php');

// the video page url in douban
$url = "http://movie.douban.com/subject/123393/";

// to create an object from Videometadata, you need to provide the url
$videoobj = new Videometadata($url);

// print the title of the video
echo $videoobj->getTitle();
?>

   

    我知道将主要代码贴在文章中实在是不可取,一是因为没人会去仔细研读,二来,这样影响文章的阅读。我本是想将所有的项目代码放在附件中的,可琢磨了下,博客系统没提供这个功能,所以只能将就了。但愿诸位能理解,意见我也接受。博客系统的使用,我还不是很熟悉,望各位指导!