curl应用，中转搜索图书

来源：互联网发布：域名收费标准编辑：程序博客网时间：2024/06/05 08:38

<?php
/**
 * Scraper for the library catalogue CGI at 202.117.102.160.
 *
 * Pages are fetched over HTTP (cURL by default), parsed with regular
 * expressions, and every extracted value is converted from GB2312 to UTF-8.
 *
 * Listing methods (search_book / nextpage) return an array shaped as:
 *   [0]     => string  POST body that requests the following page
 *                      (pass it to nextpage() as $fields)
 *   [1..n]  => array   one entry per book with the keys
 *                      bookname, author, publish, page, price, booknum, detail
 * Index 0 is reserved for the paging string, so books start at index 1 and the
 * first hit of a page is no longer overwritten.
 */
class nwu
{
    /** Base URL of the catalogue's CGI directory. */
    const BASE_URL = 'http://202.117.102.160/cgi-bin/';

    /** User-Agent header sent with every request. */
    const USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36';

    /** Seconds allowed for establishing the connection. */
    const CONNECT_TIMEOUT = 10;

    /** Seconds allowed for the whole request. */
    const REQUEST_TIMEOUT = 30;

    /** @var callable|null Optional replacement for the cURL transport. */
    private $fetcher;

    /**
     * @param callable|null $fetcher Optional transport, called as
     *        $fetcher($url, $postFields) where $postFields is a string for
     *        POST requests and null for GET requests. It must return the raw
     *        response body; any non-string return is treated as a failed
     *        request. When omitted, cURL is used.
     */
    public function __construct($fetcher = null)
    {
        $this->fetcher = is_callable($fetcher) ? $fetcher : null;
    }

    /**
     * Fetches the first page of title-search results.
     *
     * NOTE(review): $keyword is inserted into the form body verbatim, exactly
     * as before. Confirm whether callers already URL-encode it before adding
     * encoding here.
     *
     * @param string $keyword Title keyword to search for.
     * @return array See the class comment for the listing shape.
     */
    public function search_book($keyword)
    {
        $fields = "v_index=TITLE&v_value=" . $keyword
            . "&FLD_DAT_BEG=&FLD_DAT_END=&v_pagenum=20&v_seldatabase=0&v_LogicSrch=0&submit=%B2%E9%26%23160%3B%D1%AF";

        return $this->fetchListing($keyword, $fields);
    }

    /**
     * Fetches the second or any later page of results.
     *
     * @param string $keyword Same keyword that was given to search_book().
     * @param string $fields  The [0] element returned by search_book() or by a
     *                        previous nextpage() call.
     * @return array See the class comment for the listing shape.
     */
    public function nextpage($keyword, $fields)
    {
        return $this->fetchListing($keyword, $fields);
    }

    /**
     * Fetches and parses the detail page of a single book.
     *
     * @param string $detail Relative link taken from a listing entry's
     *                       'detail' key; it is appended to the CGI base URL.
     * @return array Keys title, writer, publish, isbn, page, price, series,
     *               theme, askfor, classify, aboutauthor, infomation (strings,
     *               empty when the page does not contain the field) and, only
     *               when holdings rows are present, barcode, spot, circulate
     *               (parallel lists with one element per row).
     */
    public function bookdetail($detail)
    {
        $content = $this->fetch(self::BASE_URL . $detail);

        $titlePattern = '/<table border=0 bgcolor=#008080 width=661 cellspacing=1 cellpadding=4>[\d\D]*<tr bgcolor="#f8f8f8">[\d\D]*<td width="662" height="25">[\d\D]*<\/b><\/font>[\d\D]*<a href=[\d\D]*>([\d\D]*)<\/a>/Us';
        $fieldPattern = '/<td width="662" height="25"><font color="#3F67A3"><b>[\d\D]*<\/b><\/font>([\d\D]*)<\/td>/Us';
        $copyPattern = '/<TD WIDTH=13% bgcolor="#FFFFFF">([\d\D]*)<\/TD>[\d\D]*<TD WIDTH=25% bgcolor="#FFFFFF">([\d\D]*)<\/TD>[\d\D]*<TD WIDTH=8% bgcolor="#FFFFFF">([\d\D]*)<\/TD>/Us';

        preg_match_all($titlePattern, $content, $titleHits);
        preg_match_all($fieldPattern, $content, $fieldHits);

        $res = array();
        $res['title'] = $this->cleanDetailValue(isset($titleHits[1][0]) ? $titleHits[1][0] : '');

        // The labelled cells are consumed by position, in page order.
        $names = array(
            'writer', 'publish', 'isbn', 'page', 'price', 'series',
            'theme', 'askfor', 'classify', 'aboutauthor', 'infomation',
        );
        foreach ($names as $position => $name) {
            $raw = isset($fieldHits[1][$position]) ? $fieldHits[1][$position] : '';
            $res[$name] = $this->cleanDetailValue($raw);
        }

        // Holdings table: one barcode / location / circulation triple per row.
        if (preg_match_all($copyPattern, $content, $copies, PREG_SET_ORDER)) {
            foreach ($copies as $n => $copy) {
                $res['barcode'][$n] = $this->toUtf8($copy[1]);
                $res['spot'][$n] = $this->toUtf8($copy[2]);
                $res['circulate'][$n] = $this->toUtf8($copy[3]);
            }
        }

        return $res;
    }

    /**
     * Posts $fields to the search endpoint and parses the returned listing.
     */
    private function fetchListing($keyword, $fields)
    {
        $content = $this->fetch(self::BASE_URL . 'IlaswebBib', $fields);

        // A result row is six left-aligned cells followed by the detail link.
        $cell = '<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*';
        $rowPattern = '/<TR>' . str_repeat($cell, 6)
            . '<td align=center bgcolor="#FFFFFF"><a href="([\d\D]*)">/Us';
        $columns = array('bookname', 'author', 'publish', 'page', 'price', 'booknum', 'detail');

        // Reserve slot 0 for the paging string so that books start at index 1.
        $res = array(0 => '');
        if (preg_match_all($rowPattern, $content, $rows, PREG_SET_ORDER)) {
            foreach ($rows as $row) {
                $book = array();
                foreach ($columns as $i => $column) {
                    $book[$column] = $this->toUtf8($row[$i + 1]);
                }
                $res[] = $book;
            }
        }

        $res[0] = $this->buildNextPageFields($keyword, $content);

        return $res;
    }

    /**
     * Builds the POST body for the following page from the hidden form inputs
     * (v_count, v_curkey, v_addr, v_curscr) of the current page. Inputs that
     * cannot be found are sent as empty values.
     */
    private function buildNextPageFields($keyword, $content)
    {
        $hiddenPattern = '/<INPUT  TYPE=hidden NAME=v_count  value=\'([\d\D]*)\'>[\d\D]*<INPUT  TYPE=hidden  NAME=v_curkey  value=\'([\d\D]*)\'>[\d\D]*<INPUT  TYPE=hidden  NAME=v_addr  value=\'([\d\D]*)\'>[\d\D]*<INPUT  TYPE=hidden  NAME=v_curscr  value=([\d\D]*)>/Us';

        $hidden = array();
        preg_match($hiddenPattern, $content, $hidden);

        $count = isset($hidden[1]) ? $hidden[1] : '';
        $curkey = isset($hidden[2]) ? $hidden[2] : '';
        $addr = isset($hidden[3]) ? $hidden[3] : '';
        $curscr = isset($hidden[4]) ? $hidden[4] : '';

        return "v_index=TITLE&v_value=" . $keyword
            . "&v_pagenum=20&v_count=" . $count
            . "&FLD_DAT_BEG=&FLD_DAT_END=&v_LogicSrch=0&v_LogicKeyLen=0&v_seldatabase=0&v_LogicSrch=0&v_curkey=" . urldecode($curkey)
            . "&v_addr=" . $addr
            . "&v_curscr=" . $curscr
            . "&v_curdbno=0";
    }

    /**
     * Performs the HTTP request and returns the response body.
     *
     * @param string      $url        Absolute URL to request.
     * @param string|null $postFields Form body for a POST, or null for a GET.
     * @return string Response body, or '' when the request failed.
     */
    private function fetch($url, $postFields = null)
    {
        if ($this->fetcher !== null) {
            $body = call_user_func($this->fetcher, $url, $postFields);
            return is_string($body) ? $body : '';
        }

        $ch = curl_init();
        if ($ch === false) {
            return '';
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, self::USER_AGENT);
        // Without timeouts an unresponsive server would block the caller forever.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, self::CONNECT_TIMEOUT);
        curl_setopt($ch, CURLOPT_TIMEOUT, self::REQUEST_TIMEOUT);
        if ($postFields !== null) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $postFields);
        }

        $body = curl_exec($ch);
        curl_close($ch);

        return is_string($body) ? $body : '';
    }

    /**
     * Converts GB2312 bytes to UTF-8, dropping characters that cannot be
     * converted. Returns '' if iconv reports a failure.
     */
    private function toUtf8($raw)
    {
        $converted = iconv("GB2312", "UTF-8//IGNORE", (string) $raw);

        return $converted === false ? '' : $converted;
    }

    /**
     * Normalises one value from the detail page: converts it to UTF-8, decodes
     * HTML entities, trims surrounding whitespace, then replaces every HTML
     * tag with a single space (in that order, as before).
     */
    private function cleanDetailValue($raw)
    {
        $text = trim(html_entity_decode($this->toUtf8($raw), ENT_QUOTES, 'UTF-8'));

        // Strip HTML tags.
        return preg_replace("'<[\/\!]*?[^<>]*?>'si", ' ', $text);
    }
}
?>

<?php
// Demo page: searches the catalogue for "php", prints the listing, then
// prints the detail page of the entry stored at index 1 of the listing.
require('class.XBsearch.php');

// Optional IP restriction, currently disabled. When enabled, requests from
// any address other than the one below are rejected with a message.
/*if($_SERVER['REMOTE_ADDR'] != '116.255.173.155'){
    echo "非法请求";
    exit;
}*/

$lib = new nwu();

$test = $lib->search_book('php');
print_r($test);

// Index 0 of the listing holds the next-page POST string, not a book, and a
// search may return too few hits, so only follow the link when it exists.
if (isset($test[1]['detail'])) {
    $test1 = $lib->bookdetail($test[1]['detail']);
    print_r($test1);
}
?>

详细解释:
这是CURL的操作:模拟提交表单搜索图书,关键词为$keyword,请求地址为$url,返回的是搜索结果页.这个搜索表单是用POST提交的,所以要先知道表单提交了哪些参数.我用的是Chrome浏览器的开发者工具:按F12打开后点击Network,在对应请求的Headers里就能看到它提交的表单数据,分析之后就知道该怎样设置POST参数:
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, $fields);
然后
$pattern = '/<TR><td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=center bgcolor="#FFFFFF"><a href="([\d\D]*)">/Us';
preg_match_all($pattern, $content, $matches);
这是用正则抓取我们需要的图书信息.两个/之间是我们的正则表达式.修饰符U是关闭贪婪模式,否则会匹配过多内容.修饰符s(PCRE_DOTALL)让.也能匹配换行符(注意它不是多行模式,多行模式是修饰符m).
()里是我们需要的东西:如果匹配结果存入变量$matches,则$matches[1]为第一个()里的内容.抓网页时经常遇到HTML代码分行的情况,分行处要用[\d\D]*来对应(它可以跨行匹配),否则不能成功.任意匹配可用[\d\D]*,或者在加了s修饰符的前提下用.*
$res[$i]['bookname']=iconv("GB2312","UTF-8//IGNORE",$matches[1][$i]);
iconv用来转编码,我们要统一成UTF-8,所以要转换一下.

下面的方法重新抓下一页.可能你要问,为什么要重新写一个方法来抓下一页?
因为这个搜索表单还有隐藏项,你只有进入第一个页面之后,下一个页面所需的隐藏表单信息才能用chrome的工具看到,而且
如果你只改页码之类的个别参数是不行的:
它这个系统比较...没法猜,每一页都有好几项不一样,你只能重新抓了然后在进下一页的时候用.当然,可以把前两个方法合到一起.当时没想这么多,索性弄了两个...

第三个那个是抓取图书详细页的方法.
看一下应该不难.同理得嘛,将抓取目录页的图书详细页地址传进去.
preg_match_all($pattern2, $content, $matches2);
$res['title']=trim(html_entity_decode(iconv("GB2312","UTF-8//IGNORE",$matches1[1][0]),ENT_QUOTES, 'UTF-8'));
trim()用来除去字符串首尾的空白字符.html_entity_decode($string,ENT_QUOTES,'UTF-8')把HTML实体转换回对应的字符.
$search ="'<[\/\!]*?[^<>]*?>'si"; // 去掉 HTML 标记
$res['title']=preg_replace($search,' ',$res['title']);
再做一次替换,去除无用信息

至于调用页里的
/*if($_SERVER['REMOTE_ADDR'] != '116.255.173.155'){
echo "非法请求";
exit;
}*/
这是防止别人用,我们用那个地址的服务器访问这个页面,别的ip都会呈现:非法请求...

0 0