curl应用,中转搜索图书
来源:互联网 发布:域名收费标准 编辑:程序博客网 时间:2024/06/05 08:38
<?php
/**
 * Screen-scraper for the library catalogue served from 202.117.102.160.
 *
 * The catalogue is driven through its CGI search form with cURL; the
 * returned HTML is parsed with regular expressions and every extracted
 * value is converted from GB2312 to UTF-8.
 *
 * Result layout shared by search_book() and nextpage():
 *   - index 0     : the POST body (string) to hand to nextpage() in order to
 *                   fetch the following result page;
 *   - index 1..n  : one associative array per book with the keys
 *                   bookname, author, publish, page, price, booknum, detail.
 *
 * Earlier revisions stored the books from index 0 and then overwrote
 * index 0 with the paging string, silently dropping the first hit of every
 * page. Books are now appended after the paging string so none is lost.
 */
class nwu
{
    /** Base address of the catalogue's CGI directory. */
    private static $baseUrl = 'http://202.117.102.160/cgi-bin/';

    /** Browser user-agent string sent with every request. */
    private static $userAgent = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36';

    /**
     * Fetch the first page of the result list for a title search.
     *
     * NOTE(review): $keyword is inserted into the form body verbatim, exactly
     * as before. Callers presumably have to pass it already URL-encoded in
     * the charset the site expects (the hard-coded submit value is
     * percent-encoded GB2312) - confirm before adding encoding here, since
     * doing so would double-encode for callers that already encode.
     *
     * @param string $keyword Title keyword to search for.
     * @return array See the class comment for the layout.
     */
    public function search_book($keyword)
    {
        $fields = "v_index=TITLE&v_value=" . $keyword
            . "&FLD_DAT_BEG=&FLD_DAT_END=&v_pagenum=20&v_seldatabase=0&v_LogicSrch=0&submit=%B2%E9%26%23160%3B%D1%AF";

        $html = $this->fetch(self::$baseUrl . 'IlaswebBib', $fields);

        return $this->parseListing($html, $keyword);
    }

    /**
     * Fetch the second or any later page of the result list.
     *
     * @param string $keyword The keyword used for the original search; it is
     *                        only used to build the paging string returned
     *                        at index 0.
     * @param string $fields  The paging string found at index 0 of the array
     *                        returned by search_book() or by a previous call
     *                        to this method.
     * @return array See the class comment for the layout.
     */
    public function nextpage($keyword, $fields)
    {
        $html = $this->fetch(self::$baseUrl . 'IlaswebBib', $fields);

        return $this->parseListing($html, $keyword);
    }

    /**
     * Fetch and parse the detail page of a single book.
     *
     * @param string $detail Relative address of the detail page, i.e. the
     *                       'detail' value of a book returned by
     *                       search_book() / nextpage().
     * @return array Keys title, writer, publish, isbn, page, price, series,
     *               theme, askfor, classify, aboutauthor, infomation (all
     *               strings, '' when the page does not contain the field).
     *               When the page lists copies, the keys barcode, spot and
     *               circulate are added, each a list with one entry per copy.
     */
    public function bookdetail($detail)
    {
        $html = $this->fetch(self::$baseUrl . $detail);

        return $this->parseDetail($html);
    }

    /**
     * Perform one HTTP request and return the response body.
     *
     * A failed transfer raises an E_USER_WARNING and yields an empty string,
     * so the public methods keep returning their usual array shape instead
     * of running the parsers on a boolean.
     *
     * @param string      $url        Absolute URL to request.
     * @param string|null $postFields Form body for a POST; null sends a GET.
     * @return string Response body, or '' on failure.
     */
    private function fetch($url, $postFields = null)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, self::$userAgent);
        // Bound the request so an unreachable catalogue cannot hang the caller.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        if ($postFields !== null) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $postFields);
        }

        $body = curl_exec($ch);
        if ($body === false) {
            trigger_error('nwu: request to ' . $url . ' failed: ' . curl_error($ch), E_USER_WARNING);
            $body = '';
        }
        curl_close($ch);

        return $body;
    }

    /**
     * Turn a result-list page into the array returned by the public methods.
     *
     * @param string $html    Raw (GB2312) HTML of a result-list page.
     * @param string $keyword Keyword to embed in the next-page POST body.
     * @return array Index 0 is the next-page POST body, books follow from 1.
     */
    private function parseListing($html, $keyword)
    {
        // One result row is six left-aligned data cells followed by a centred
        // cell holding the link to the detail page.
        $cell = '<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*';
        $rowPattern = '/<TR>' . str_repeat($cell, 6)
            . '<td align=center bgcolor="#FFFFFF"><a href="([\d\D]*)">/Us';

        $res = array($this->buildNextPageFields($html, $keyword));

        if (preg_match_all($rowPattern, $html, $rows, PREG_SET_ORDER)) {
            foreach ($rows as $row) {
                $res[] = array(
                    'bookname' => $this->toUtf8($row[1]),
                    'author'   => $this->toUtf8($row[2]),
                    'publish'  => $this->toUtf8($row[3]),
                    'page'     => $this->toUtf8($row[4]),
                    'price'    => $this->toUtf8($row[5]),
                    'booknum'  => $this->toUtf8($row[6]),
                    'detail'   => $this->toUtf8($row[7]),
                );
            }
        }

        return $res;
    }

    /**
     * Build the POST body that requests the page after the one in $html.
     *
     * The values come from the hidden inputs v_count, v_curkey, v_addr and
     * v_curscr of the current page. When they cannot be found (for example
     * after a failed request) the corresponding values are left empty.
     *
     * @param string $html    Raw HTML of the current result-list page.
     * @param string $keyword Keyword of the running search.
     * @return string URL-style form body for nextpage().
     */
    private function buildNextPageFields($html, $keyword)
    {
        $pattern = '/<INPUT TYPE=hidden NAME=v_count value=\'([\d\D]*)\'>[\d\D]*'
            . '<INPUT TYPE=hidden NAME=v_curkey value=\'([\d\D]*)\'>[\d\D]*'
            . '<INPUT TYPE=hidden NAME=v_addr value=\'([\d\D]*)\'>[\d\D]*'
            . '<INPUT TYPE=hidden NAME=v_curscr value=([\d\D]*)>/Us';

        $hidden = array();
        if (!preg_match($pattern, $html, $hidden)) {
            $hidden = array();
        }
        $count  = isset($hidden[1]) ? $hidden[1] : '';
        $curkey = isset($hidden[2]) ? $hidden[2] : '';
        $addr   = isset($hidden[3]) ? $hidden[3] : '';
        $curscr = isset($hidden[4]) ? $hidden[4] : '';

        // v_curkey is URL-decoded before being sent back, as the original
        // implementation did. NOTE(review): presumably the page emits it
        // percent-encoded - confirm against a live response.
        return "v_index=TITLE&v_value=" . $keyword
            . "&v_pagenum=20&v_count=" . $count
            . "&FLD_DAT_BEG=&FLD_DAT_END=&v_LogicSrch=0&v_LogicKeyLen=0&v_seldatabase=0&v_LogicSrch=0&v_curkey=" . urldecode($curkey)
            . "&v_addr=" . $addr
            . "&v_curscr=" . $curscr
            . "&v_curdbno=0";
    }

    /**
     * Extract the bibliographic fields and the list of copies from a
     * detail page.
     *
     * @param string $html Raw (GB2312) HTML of a detail page.
     * @return array See bookdetail().
     */
    private function parseDetail($html)
    {
        $titlePattern = '/<table border=0 bgcolor=#008080 width=661 cellspacing=1 cellpadding=4>[\d\D]*<tr bgcolor="#f8f8f8">[\d\D]*<td width="662" height="25">[\d\D]*<\/b><\/font>[\d\D]*<a href=[\d\D]*>([\d\D]*)<\/a>/Us';
        $fieldPattern = '/<td width="662" height="25"><font color="#3F67A3"><b>[\d\D]*<\/b><\/font>([\d\D]*)<\/td>/Us';
        $copyPattern  = '/<TD WIDTH=13% bgcolor="#FFFFFF">([\d\D]*)<\/TD>[\d\D]*<TD WIDTH=25% bgcolor="#FFFFFF">([\d\D]*)<\/TD>[\d\D]*<TD WIDTH=8% bgcolor="#FFFFFF">([\d\D]*)<\/TD>/Us';

        preg_match_all($titlePattern, $html, $titleHits);
        preg_match_all($fieldPattern, $html, $fieldHits);

        $res = array(
            'title' => $this->cleanField(isset($titleHits[1][0]) ? $titleHits[1][0] : ''),
        );

        // The labelled cells are consumed in the order they appear on the page.
        $keys = array(
            'writer', 'publish', 'isbn', 'page', 'price', 'series',
            'theme', 'askfor', 'classify', 'aboutauthor', 'infomation',
        );
        foreach ($keys as $position => $key) {
            $raw = isset($fieldHits[1][$position]) ? $fieldHits[1][$position] : '';
            $res[$key] = $this->cleanField($raw);
        }

        if (preg_match_all($copyPattern, $html, $copies, PREG_SET_ORDER)) {
            foreach ($copies as $copy) {
                $res['barcode'][]   = $this->toUtf8($copy[1]);
                $res['spot'][]      = $this->toUtf8($copy[2]);
                $res['circulate'][] = $this->toUtf8($copy[3]);
            }
        }

        return $res;
    }

    /**
     * Convert one detail-page cell to plain UTF-8 text: transcode, decode
     * HTML entities, trim, then replace every remaining HTML tag by a space.
     *
     * @param string $raw Cell content as captured from the page.
     * @return string
     */
    private function cleanField($raw)
    {
        $text = trim(html_entity_decode($this->toUtf8($raw), ENT_QUOTES, 'UTF-8'));

        // Strip HTML tags.
        return preg_replace("'<[\/\!]*?[^<>]*?>'si", ' ', $text);
    }

    /**
     * Convert GB2312 bytes to UTF-8, skipping bytes that cannot be converted.
     *
     * @param string $text
     * @return string Converted text, or '' when iconv() reports failure.
     */
    private function toUtf8($text)
    {
        $converted = iconv("GB2312", "UTF-8//IGNORE", $text);

        return $converted === false ? '' : $converted;
    }
}
?>
<?php
// Demo page: runs a title search for "php", prints the result list, then
// fetches and prints the detail page of the book stored at index 1.
require('class.XBsearch.php');

/*
 * Optional access restriction (disabled): when enabled, only requests coming
 * from the listed address are served and every other client gets a refusal
 * message.
 *
if($_SERVER['REMOTE_ADDR'] != '116.255.173.155'){
    echo "非法请求";
    exit;
}
*/

$lib = new nwu();

$test = $lib->search_book('php');
print_r($test);

// Index 0 holds the paging string, so books start at index 1. Only follow the
// detail link when the search actually returned a book; otherwise
// bookdetail() would be called with null.
if (isset($test[1]['detail'])) {
    $test1 = $lib->bookdetail($test[1]['detail']);
    print_r($test1);
}
?>
详细解释:
这是 cURL 的操作:模拟提交搜索表单,搜索的图书关键字为 $keyword,提交地址为 $url,返回的是搜索结果页。这个搜索表单使用 POST 提交,所以要先知道表单提交了哪些参数。我用的是 Chrome 浏览器的开发者工具(按 F12 打开):打开后点击 Network,选中该请求并查看 Headers,就能看到它提交的表单数据。分析后可知,我们需要用下面两行来设置 POST 及其参数:
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, $fields);
然后
$pattern = '/<TR><td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=center bgcolor="#FFFFFF"><a href="([\d\D]*)">/Us';
preg_match_all($pattern, $content, $matches);
这是用正则抓取我们需要的图书信息。两个 / 之间是我们的正则表达式。U 修饰符关闭贪婪模式(量词默认变为非贪婪),否则会匹配过多内容。s 修饰符(PCRE_DOTALL)让 . 也能匹配换行符;它并不是多行模式(多行模式是 m 修饰符)。
()里是我们需要捕获的内容:如果匹配结果保存在变量 $matches 中,则 $matches[1] 为第一个 () 捕获的内容。抓网页时,HTML 源码经常跨多行,跨行的部分要用 [\d\D]* 来匹配,否则不能成功。匹配任意内容可用 [\d\D]*,或者在带 s 修饰符时用 .*(不带 s 修饰符时 . 不匹配换行符)。
$res[$i]['bookname']=iconv("GB2312","UTF-8//IGNORE",$matches[1][$i]);
iconv用来转编码,我们要统一成UTF-8,所以要转换一下.
下面的方法重新抓下一页.可能你要问,为什么要重新写一个方法来抓下一页?
因为这个搜索表单还有隐藏项:你只有先进入第一个页面,才能用 Chrome 的工具看到翻到下一个页面所需的隐藏表单信息,而且
如果你只改
它这个系统比较...没法猜,每一页都有好几项不一样,你只能重新抓了然后在进下一页的时候用.当然,可以把前两个方法合到一起.当时没想这么多,索性弄了两个...
第三个那个是抓取图书详细页的方法.
看一下应该不难.同理得嘛,将抓取目录页的图书详细页地址传进去.
preg_match_all($pattern2, $content, $matches2);
$res['title']=trim(html_entity_decode(iconv("GB2312","UTF-8//IGNORE",$matches1[1][0]),ENT_QUOTES, 'UTF-8'));
trim() 用来去除字符串首尾的空白字符。html_entity_decode($string, ENT_QUOTES, 'UTF-8') 把 HTML 实体转换回对应的字符。
$search ="'<[\/\!]*?[^<>]*?>'si"; // 去掉 HTML 标记
$res['title']=preg_replace($search,' ',$res['title']);
再做一次替换,去除无用信息
至于调用页里的
/*if($_SERVER['REMOTE_ADDR'] != '116.255.173.155'){
echo "非法请求";
exit;
}*/
这是为了防止别人调用这个页面:只允许该 IP 地址的服务器访问,其他 IP 访问时都会显示"非法请求"。(示例代码中这段检查被注释掉了,需要时去掉注释即可。)
这是 cURL 的操作:模拟提交搜索表单,搜索的图书关键字为 $keyword,提交地址为 $url,返回的是搜索结果页。这个搜索表单使用 POST 提交,所以要先知道表单提交了哪些参数。我用的是 Chrome 浏览器的开发者工具(按 F12 打开):打开后点击 Network,选中该请求并查看 Headers,就能看到它提交的表单数据。分析后可知,我们需要用下面两行来设置 POST 及其参数:
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, $fields);
然后
$pattern = '/<TR><td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=left height=25 bgcolor="#FFFFFF">([\d\D]*)<\/td>[\d\D]*<td align=center bgcolor="#FFFFFF"><a href="([\d\D]*)">/Us';
preg_match_all($pattern, $content, $matches);
这是用正则抓取我们需要的图书信息。两个 / 之间是我们的正则表达式。U 修饰符关闭贪婪模式(量词默认变为非贪婪),否则会匹配过多内容。s 修饰符(PCRE_DOTALL)让 . 也能匹配换行符;它并不是多行模式(多行模式是 m 修饰符)。
()里是我们需要捕获的内容:如果匹配结果保存在变量 $matches 中,则 $matches[1] 为第一个 () 捕获的内容。抓网页时,HTML 源码经常跨多行,跨行的部分要用 [\d\D]* 来匹配,否则不能成功。匹配任意内容可用 [\d\D]*,或者在带 s 修饰符时用 .*(不带 s 修饰符时 . 不匹配换行符)。
$res[$i]['bookname']=iconv("GB2312","UTF-8//IGNORE",$matches[1][$i]);
iconv用来转编码,我们要统一成UTF-8,所以要转换一下.
下面的方法重新抓下一页.可能你要问,为什么要重新写一个方法来抓下一页?
因为这个搜索表单还有隐藏项:你只有先进入第一个页面,才能用 Chrome 的工具看到翻到下一个页面所需的隐藏表单信息,而且
如果你只改
它这个系统比较...没法猜,每一页都有好几项不一样,你只能重新抓了然后在进下一页的时候用.当然,可以把前两个方法合到一起.当时没想这么多,索性弄了两个...
第三个那个是抓取图书详细页的方法.
看一下应该不难.同理得嘛,将抓取目录页的图书详细页地址传进去.
preg_match_all($pattern2, $content, $matches2);
$res['title']=trim(html_entity_decode(iconv("GB2312","UTF-8//IGNORE",$matches1[1][0]),ENT_QUOTES, 'UTF-8'));
trim() 用来去除字符串首尾的空白字符。html_entity_decode($string, ENT_QUOTES, 'UTF-8') 把 HTML 实体转换回对应的字符。
$search ="'<[\/\!]*?[^<>]*?>'si"; // 去掉 HTML 标记
$res['title']=preg_replace($search,' ',$res['title']);
再做一次替换,去除无用信息
至于调用页里的
/*if($_SERVER['REMOTE_ADDR'] != '116.255.173.155'){
echo "非法请求";
exit;
}*/
这是为了防止别人调用这个页面:只允许该 IP 地址的服务器访问,其他 IP 访问时都会显示"非法请求"。(示例代码中这段检查被注释掉了,需要时去掉注释即可。)
0 0
- curl应用,中转搜索图书
- 平院微信图书馆——中转图书查询
- 搜索技术图书攻略
- 图书管理搜索部份
- GOOGLE图书搜索
- 20160111 图书搜索
- curl 应用
- 中转
- 插件79:搜索Google图书
- 指南、图书搜索第一版优化
- 豆瓣图书搜索系统实验
- curl 获取google搜索结果
- 转:curl 应用小结
- curl应用总结
- curl应用总结
- curl应用总结
- cURL(学习、安装、应用)
- curl应用总结
- libevent 多线程
- /dev/null 2>&1 详解
- [DP]HOJ 1288 Bridging Signals
- Afaria 支持supervised mode for iOS设备
- 子图
- curl应用,中转搜索图书
- fluentd
- linux启动全过程分析
- 关于cocos2dx的AnchorPoint
- Fragment和Activity的交互
- [LeetCode]86.Partition List
- Linux NFS服务器的安装与配置
- 算法导论第九章课后答案
- 紫金VLAN ID 为101/0