抓取“华强电子网”供应商程序

来源:互联网 发布:企业大数据运营平台 编辑:程序博客网 时间:2024/04/30 20:27
<?php
/**
 * Main program for scraping supplier contact data from "华强电子网" (hqew.com).
 *
 * Usage: 1. instantiate the class; 2. call go($param), where $param is a part number.
 * Approach: use the site's IC search to look up a part number, then walk every
 * result page and scrape the contact page of each supplier listed there.
 *
 * author Lee.
 * Last modify $Date: 2012-2-2 12:55:35 $
 */
require_once './config.inc.php';

class huaqiang {
    /**
     * Safety cap on the number of result pages probed for one part number, so a
     * markup change or a failed download cannot make the page counter loop forever.
     * NOTE(review): 500 is an arbitrary ceiling - adjust if real searches exceed it.
     */
    const MAX_PAGES = 500;

    private $key;                 // part number currently being searched
    private $pageNum;             // number of result pages for that part number
    private $pageCache = array(); // page number => already downloaded result page HTML

    /**
     * Entry point: scrape every supplier found for one part number.
     *
     * @param string $key part number to search for
     */
    public function go($key) {
        $this->key = $key;
        $this->pageCache = array(); // pages of a previous part number are useless here
        if ($this->checkIsExistsData()) {
            $this->pageNum = $this->getPageNum();
            $this->getInfo();
        }
    }

    /**
     * Fetch one search result page for the current part number.
     *
     * Successful downloads are cached so that the page-counting pass and the
     * scraping pass do not download the same page twice.
     *
     * @param int $page page number, starting at 1
     * @return string page HTML, or '' when the download failed
     */
    private function getContent($page=1) {
        if (!isset($this->pageCache[$page])) {
            $html = $this->getUrlInfo($this->getUrl($this->key, $page));
            if ($html === '') {
                return ''; // do not cache failures, a later call may succeed
            }
            $this->pageCache[$page] = $html;
        }
        return $this->pageCache[$page];
    }

    /**
     * Check whether the first result page contains any data.
     *
     * @return bool true when the "current page = 1" pager marker is present
     */
    private function checkIsExistsData() {
        return stristr($this->getContent(), '<span class="s_curr g_vm">1</span>') !== false;
    }

    /**
     * Walk all result pages, scrape every supplier contact page and store it.
     *
     * A single loop covers both the one-page and the multi-page case. Suppliers
     * whose company name could not be extracted are skipped instead of being
     * stored as empty rows.
     */
    private function getInfo() {
        for ($i = 1; $i <= $this->pageNum; $i++) {
            $shopUrls = $this->shopAddContact($this->shopUrlMatchReArr($this->getContent($i)));
            unset($this->pageCache[$i]); // page is fully processed, release the memory

            foreach ($shopUrls as $shopUrl) {
                $infoArr = $this->getInfoByShopUrl($shopUrl);
                if ($infoArr['company'] !== '' && $this->execAdd($infoArr)) echo 'Add Success!!';
                $this->sleep();
            }
            $this->sleep();
        }
    }

    /**
     * Insert one supplier into the `huaqiang` table unless the company is already stored.
     *
     * @param array $infoArr supplier record as built by getInfoByShopUrl()
     * @return mixed result of Model::insert(), or 0 when the company already exists
     */
    private function execAdd($infoArr) {
        $m = new Model();
        $num = 0; // defined up front so the "already exists" path returns a value

        // The company name is scraped (untrusted) text placed inside a SQL string
        // literal, so it has to be escaped.
        // NOTE(review): addslashes() is a stop-gap; prefer the escaping or
        // parameter binding offered by Model if it has one. Whether
        // Model::insert() escapes its values cannot be seen from this file - confirm.
        $company = addslashes($infoArr['company']);
        if (!$m->isExists('huaqiang', "company='{$company}'")) {
            $fields = array('company','mobile','phone','fax','region','address','website','zip','email','qq','msn','market','shopUrl');
            $values = array();
            foreach ($fields as $field) {
                $values[] = $infoArr[$field];
            }
            $num = $m->insert('huaqiang', $fields, $values);
        }
        return $num;
    }

    /**
     * Scrape one supplier contact page.
     *
     * Company, phone, region and address are taken from a single combined match
     * (which also requires a QQ row to be present); every other field is matched
     * on its own. Fields that cannot be found come back as ''.
     *
     * @param string $url supplier contact page URL
     * @return array supplier record keyed by column name
     */
    private function getInfoByShopUrl($url) {
        $html = $this->getUrlInfo($url);

        // One "<label><value>" row per mandatory field, joined by ".+" so the
        // rows must appear in this order on the page.
        $rows = array();
        foreach (array('公司名称:', '电话:', '所在地区:', '详细地址:', 'QQ:') as $label) {
            $rows[] = '<li class="g_fl tit">' . $label . '<\/li><li class="g_fl cont">(.+)<\/li>';
        }
        preg_match_all('/' . implode('.+', $rows) . '/Usi', $html, $shopArr);

        $website = $this->matchField($html, '网址:');

        return array(
            'company' => trim($this->firstCapture($shopArr, 1)),
            'mobile'  => $this->matchField($html, '手机:'),
            'phone'   => $this->stripPhoneTags(trim($this->firstCapture($shopArr, 2))),
            'fax'     => $this->matchField($html, '传真:'),
            'region'  => trim($this->firstCapture($shopArr, 3)),
            'address' => trim($this->firstCapture($shopArr, 4)),
            'zip'     => $this->matchField($html, '邮政编码:'),
            'email'   => $this->matchField($html, '电子邮箱:', 'g_fl cont cor'),
            'qq'      => $this->matchField($html, 'QQ:'),
            'msn'     => $this->matchField($html, 'MSN:'),
            'market'  => $this->matchField($html, '所属电子市场:'),
            'website' => $website === '' ? '' : $this->stripATags($website),
            'shopUrl' => $url,
        );
    }

    /**
     * Return the first value captured for one group of a preg_match_all() result.
     *
     * @param array $matches result array filled by preg_match_all()
     * @param int $group capture group number
     * @return string captured text, or '' when nothing matched
     */
    private function firstCapture($matches, $group) {
        return isset($matches[$group][0]) ? $matches[$group][0] : '';
    }

    /**
     * Extract the value cell that follows a given label cell on a contact page.
     *
     * @param string $html contact page HTML
     * @param string $label label text exactly as it appears on the page
     * @param string $contClass class attribute of the value <li>
     * @return string raw cell content, or '' when the label is not present
     */
    private function matchField($html, $label, $contClass='g_fl cont') {
        $pattern = '/<li class="g_fl tit">' . preg_quote($label, '/')
                 . '<\/li><li class="' . preg_quote($contClass, '/') . '">(.*)<\/li>/Usi';
        if (preg_match($pattern, $html, $m)) {
            return $m[1];
        }
        return '';
    }

    /**
     * Append contact.html to every supplier shop link.
     *
     * @param array $arr shop base URLs
     * @return array contact page URLs, keys preserved
     */
    private function shopAddContact($arr) {
        foreach ($arr as $k=>$v) {
            $arr[$k] = $v . '/contact.html';
        }
        return $arr;
    }

    /**
     * Strip the surrounding <a> tag from a website value, keeping the link text.
     *
     * @param string $site
     * @return string
     */
    private function stripATags($site) {
        $site = preg_replace('/<a.+>(.+)<\/a>/', '\1', $site);
        return $site;
    }

    /**
     * Remove the extra markup found inside the phone cell.
     *
     * @param string $phone
     * @return string
     */
    private function stripPhoneTags($phone) {
        $phone = str_replace('<span>', '', $phone);
        $phone = str_replace('</span>', ' ', $phone);
        $phone = str_replace('<br />', '', $phone);
        return $phone;
    }

    /**
     * Collect the distinct supplier shop URLs listed on one result page.
     *
     * @param string $re result page HTML
     * @return array shop URLs (duplicates removed, original match keys kept)
     */
    private function shopUrlMatchReArr($re) {
        preg_match_all('/<li class="col3"><a class="company" target="_blank" href="(.+)" value=".+">.+<\/a>/Usi', $re, $arr);
        $arr = array_unique($arr[1]);
        return $arr;
    }

    /**
     * Count the result pages by probing page after page until one carries the
     * "next page" span with the marker classes below, which this code treats as
     * the sign of the last page.
     *
     * Stops early when a page cannot be downloaded or MAX_PAGES is reached, so
     * the loop always terminates.
     *
     * @return int number of the last usable result page (at least 1)
     */
    private function getPageNum() {
        $i = 1;
        while ($i < self::MAX_PAGES) {
            $re = $this->getContent($i);
            if ($re === '') {
                // Page $i is unavailable: the last page known to be good is the previous one.
                return max(1, $i - 1);
            }
            if (stristr($re, '<span class="g_vm s_f0f s_f0f1"  title="下一页">') !== false) break;
            $i++;
            $this->sleep();
        }
        return $i;
    }

    /**
     * Build the search URL for a part number and page.
     *
     * NOTE(review): $str is inserted verbatim; part numbers containing '/'
     * (e.g. 'OM8373PS/N3/A') change the URL path - confirm what the site expects.
     *
     * @param string $str part number
     * @param int $page page number
     * @return string
     */
    private function getUrl($str, $page=1) {
        return "http://www.hqew.com/ic/{$str}_____0_00_0_{$page}.html";
    }

    /**
     * Download a URL.
     *
     * @param string $url
     * @return string response body, or '' when the download failed
     */
    private function getUrlInfo($url) {
        $re = file_get_contents($url);
        return $re === false ? '' : $re;
    }

    /**
     * Pause between requests; 5 seconds by default.
     *
     * @param int $seconds
     */
    private function sleep($seconds=5) {
        sleep($seconds);
    }
}

/**
 * Database schema
 * CREATE TABLE `huaqiang` (
 *   `id` mediumint(8) unsigned NOT NULL auto_increment,
 *   `company` varchar(500) NOT NULL,
 *   `mobile` varchar(500) NOT NULL,
 *   `phone` varchar(500) NOT NULL,
 *   `fax` varchar(500) NOT NULL,
 *   `region` varchar(500) NOT NULL,
 *   `address` varchar(500) NOT NULL,
 *   `website` varchar(200) NOT NULL,
 *   `zip` varchar(100) NOT NULL,
 *   `email` varchar(500) NOT NULL,
 *   `qq` varchar(200) NOT NULL,
 *   `msn` varchar(200) NOT NULL,
 *   `market` varchar(500) NOT NULL,
 *   `shopUrl` varchar(200) NOT NULL,
 *   PRIMARY KEY  (`id`)
 * ) ENGINE=InnoDB DEFAULT CHARSET=utf8
 */

// Driver: scrape suppliers for each distinct part number in the list.
$c = new huaqiang();
$arr = array_unique(array('MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA', 'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E', 'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N', 'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232', 'STM32F103', 'LM358'));
foreach ($arr as $v) {
    $c->go($v);
}
?>


原创粉丝点击