抓取“中国 IC 网”供应商程序

来源:互联网 发布:自经于沟渎而莫之知也 编辑:程序博客网 时间:2024/04/30 11:06
<?php
/**
 * Main program for scraping supplier data from "China IC Net" (http://www.ic37.com).
 *
 * author Lee.
 * Last modify $Date: 2012-2-9 9:32:21 $
 *
 * Note: this program is meant to run with GB2312 encoding, because the
 * "China IC Net" site is GB2312-encoded; the database has to use the same
 * encoding. The Chinese literals inside the regular expressions below are
 * matched against the raw page bytes, so this file must be saved as GB2312.
 */
class ic37
{
    /** Part number currently being searched. */
    private $key;

    /** Number of result pages reported by the search for the current part number. */
    private $pageNum;

    /** Lazily created mysqli connection, reused for every query (null until first use). */
    private $db;

    /**
     * Entry point: search the site for one part number and store every
     * supplier found on every result page.
     *
     * @param string $key Part number to search for.
     */
    public function go($key)
    {
        $this->key = $key;
        $this->pageNum = $this->getPageNum();
        $this->getInfo();
    }

    /**
     * Walk every result page, collect the supplier shop URLs on it and
     * store the contact details of each shop.
     *
     * A single loop covers both the one-page and the multi-page case;
     * when no page count could be read (0) nothing is fetched.
     */
    private function getInfo()
    {
        for ($page = 1; $page <= $this->pageNum; $page++) {
            $shopUrls = $this->shopAddContact(
                $this->shopUrlMatchReArr($this->getContent($page))
            );
            $this->isAddSuccess($shopUrls);
        }
    }

    /**
     * Scrape and store each shop in the list, printing the outcome per shop.
     *
     * @param array $arr Shop contact-page URLs.
     */
    private function isAddSuccess($arr)
    {
        foreach ($arr as $shopUrl) {
            if ($this->execAdd($this->getInfoByShopUrl($shopUrl))) {
                echo 'Add Success!!';
            } else {
                echo 'Add Faild!!';
            }
        }
    }

    /**
     * Insert one supplier record into the `ic37` table.
     *
     * The values come from scraped HTML, so they are bound through a
     * prepared statement instead of being interpolated into the SQL text.
     *
     * @param array $infoArr Supplier fields as produced by getInfoByShopUrl().
     * @return bool True when the row was inserted; false when the company
     *              name is empty, the company already exists, the database
     *              is unreachable or the insert failed.
     */
    private function execAdd($infoArr)
    {
        if (empty($infoArr['company'])) {
            return false;
        }

        $mysqli = $this->getDb();
        if ($mysqli === null) {
            return false;
        }

        if ($this->isExists($mysqli, $infoArr)) {
            return false; // the record is already stored
        }

        $columns = array(
            'company', 'person', 'phone', 'mobile', 'qq', 'msn', 'fax',
            'email', 'address', 'country', 'region', 'zip', 'web', 'shopUrl',
        );

        $values = array();
        foreach ($columns as $column) {
            $values[] = isset($infoArr[$column]) ? (string) $infoArr[$column] : '';
        }

        $stmt = $mysqli->prepare(
            'INSERT INTO ic37(' . implode(',', $columns) . ') VALUES ('
            . implode(',', array_fill(0, count($columns), '?')) . ')'
        );
        if ($stmt === false) {
            return false;
        }

        // bind_param() takes its arguments by reference, so hand it an
        // array of references to the value slots.
        $params = array(str_repeat('s', count($values)));
        foreach (array_keys($values) as $i) {
            $params[] = &$values[$i];
        }
        call_user_func_array(array($stmt, 'bind_param'), $params);

        $inserted = $stmt->execute();
        $stmt->close();

        return $inserted;
    }

    /**
     * Normalise a scraped value: trim it, remove spaces and remove the
     * "==联系我们" marker text.
     *
     * @param string $str Raw captured text.
     * @return string Cleaned text.
     */
    private function formatStr($str)
    {
        $str = trim((string) $str);
        $str = str_replace(' ', '', $str);
        $str = str_replace('==联系我们', '', $str);

        return $str;
    }

    /**
     * Return the shared database connection, opening it on first use.
     *
     * @return mysqli|null The connection, or null when connecting failed.
     */
    private function getDb()
    {
        if ($this->db instanceof mysqli) {
            return $this->db;
        }

        // The rest of this class checks return values, so make mysqli
        // report failures that way on every PHP version instead of throwing.
        mysqli_report(MYSQLI_REPORT_OFF);

        // NOTE(review): credentials are hard-coded; consider moving them to configuration.
        $mysqli = new mysqli('localhost', 'root', '1715544', 'weiku');
        if ($mysqli->connect_errno) {
            return null;
        }

        // set_charset() also informs the client library of the encoding,
        // which a plain "SET NAMES" query does not.
        $mysqli->set_charset('gb2312');

        $this->db = $mysqli;

        return $mysqli;
    }

    /**
     * Check whether a company is already stored in the `ic37` table.
     *
     * @param mysqli $mysqli  Open database connection.
     * @param array  $infoArr Supplier fields; only 'company' is read.
     * @return bool True when a row with the same company name exists.
     */
    private function isExists($mysqli, $infoArr)
    {
        $stmt = $mysqli->prepare('SELECT 1 FROM ic37 WHERE company = ? LIMIT 1');
        if ($stmt === false) {
            return false;
        }

        $company = (string) $infoArr['company'];
        $stmt->bind_param('s', $company);

        $exists = false;
        if ($stmt->execute()) {
            $stmt->store_result();
            $exists = $stmt->num_rows > 0;
        }
        $stmt->close();

        return $exists;
    }

    /**
     * Download a shop's contact page and extract the supplier fields.
     *
     * @param string $url Contact-page URL of the shop.
     * @return array Associative array with the keys company, person, phone,
     *               mobile, qq, msn, fax, email, address, country, region,
     *               zip, web and shopUrl. Fields not found on the page are ''.
     */
    private function getInfoByShopUrl($url)
    {
        // Drop the highlighting <font> wrappers and unwrap anchors so the
        // field patterns see plain cell text.
        $html = str_replace('<font color="#000099">', '', $this->getUrlInfo($url));
        $html = str_replace('</font>', '', $html);
        $html = (string) preg_replace('/<a.+>(.*)<\/a>/', '\1', $html);

        $patterns = array(
            'company' => '/<title>(.*)<\/title>/Usi',
            'person'  => '/<strong>联系人:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
            'phone'   => '/<strong>电话:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
            'mobile'  => '/<strong>手机:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
            'qq'      => '/<strong>QQ:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
            'msn'     => '/<strong>MSN:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
            'fax'     => '/<strong>传真:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
            'email'   => '/<strong>EMail:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
            'address' => '/司地址[:]*[<\/strong>]*[<strong>]*[:]*[<\/strong>]*<\/td>\s*<td.*>(.*)<\/td>/Usi',
            'country' => '/<strong>国家[:]*<\/strong>[<strong>]*[:]*[<\/strong>]*<\/td>\s*<td.*>(.*)<\/td>/Usi',
            'region'  => '/<strong>地区:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
            'zip'     => '/<strong>邮政编码:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
            'web'     => '/<strong>\s*网址[1]*:<\/strong><\/td>\s*<td.*>(.*)<\/td>/Usi',
        );

        $infoArr = array();
        foreach ($patterns as $field => $pattern) {
            $infoArr[$field] = $this->matchField($pattern, $html);
        }
        $infoArr['shopUrl'] = $url;

        return $infoArr;
    }

    /**
     * Return the cleaned first capture of a pattern, or '' when it does not match.
     *
     * @param string $pattern Regular expression with one capture group.
     * @param string $subject Text to search.
     * @return string
     */
    private function matchField($pattern, $subject)
    {
        if (preg_match($pattern, $subject, $match) && isset($match[1])) {
            return $this->formatStr($match[1]);
        }

        return '';
    }

    /**
     * Extract the distinct supplier shop URLs from a search result page.
     *
     * @param string $re HTML of a search result page.
     * @return array Shop URLs containing "http://" (keys are not re-indexed).
     */
    private function shopUrlMatchReArr($re)
    {
        preg_match_all(
            '/<p class="Company"><a.* href=\"(.+)\".*>[<font color="#FF0000">]*.*[<\/font>]*<\/a>\s*<\/p>/Usi',
            (string) $re,
            $matches
        );

        return $this->formatUrlArr(array_unique($matches[1]));
    }

    /**
     * Keep only the entries that contain "http://", preserving their keys.
     *
     * @param array $arr Candidate URLs.
     * @return array
     */
    private function formatUrlArr($arr)
    {
        $newArr = array();
        foreach ($arr as $key => $value) {
            if ($this->isExistsHttp($value)) {
                $newArr[$key] = $value;
            }
        }

        return $newArr;
    }

    /**
     * Pull QQ (or MSN) account names out of alt="QQ:..." attributes.
     *
     * @param string $str HTML fragment.
     * @param string $e   Attribute prefix to look for ('QQ' by default).
     * @return string The accounts found, separated by single spaces.
     */
    private function formatQqMsn($str, $e = 'QQ')
    {
        if (empty($str)) {
            return '';
        }

        preg_match_all('/alt="' . $e . '\:(.+)"/Usi', $str, $arr);
        if (count($arr[1]) == 1) {
            return $arr[1][0];
        }

        $joined = '';
        foreach ($arr[1] as $value) {
            $joined .= $value . ' ';
        }

        return rtrim($joined, ' ');
    }

    /**
     * Make every shop URL point at the shop's contact.asp page.
     *
     * @param array $arr Shop URLs.
     * @return array The same keys, each URL ending in contact.asp
     *               (an empty array when no URLs were given).
     */
    private function shopAddContact($arr)
    {
        $newArr = array();
        foreach ($arr as $k => $v) {
            if (stristr($v, 'contact.asp') === false) {
                $newArr[$k] = $this->addContact($v);
            } else {
                $newArr[$k] = $v;
            }
        }

        return $newArr;
    }

    /**
     * Append "/contact.asp" to a URL.
     *
     * @param string $str Shop URL.
     * @return string
     */
    private function addContact($str)
    {
        return $str . '/contact.asp';
    }

    /**
     * Replace an anchor element by its inner text.
     *
     * @param string $site HTML fragment.
     * @return string
     */
    private function stripATags($site)
    {
        $site = preg_replace('/<a.+>(.+)<\/a>/', '\1', $site);

        return $site;
    }

    /**
     * Check whether a URL contains "http://" (case-insensitive).
     *
     * @param string $url
     * @return bool
     */
    private function isExistsHttp($url)
    {
        return stripos($url, 'http://') !== false;
    }

    /**
     * Download one search result page for the current part number.
     *
     * @param int $page Result page number (1-based).
     * @return string Page HTML, or '' when the download failed.
     */
    private function getContent($page = 1)
    {
        return $this->getUrlInfo($this->getUrl($this->key, $page));
    }

    /**
     * Read the number of result pages from the first search result page.
     *
     * @return int Page count, or 0 when the pager text was not found.
     */
    private function getPageNum()
    {
        if (preg_match('/共.*条记录分(.*)页显示/Usi', $this->getContent(), $match)) {
            // The capture may carry markup or whitespace around the number.
            return (int) trim(strip_tags($match[1]));
        }

        return 0;
    }

    /**
     * Build the search URL for a part number and page.
     *
     * @param string $str  Part number.
     * @param int    $page Result page number.
     * @return string
     */
    private function getUrl($str, $page = 1)
    {
        // Part numbers can contain characters such as "/" or "&", so encode them.
        $keyword = urlencode($str);

        return "http://www.ic37.com/sell/search.asp?keyword={$keyword}&x=86&y=22&page={$page}";
    }

    /**
     * Download a URL.
     *
     * @param string $url
     * @return string Response body, or '' when the download failed.
     */
    private function getUrlInfo($url)
    {
        $re = file_get_contents($url);

        return $re === false ? '' : $re;
    }
}

/*
How the program works: it uses the IC search of "China IC Net" to search for
a part number and then scrapes the supplier information of every result.

Database structure:

CREATE TABLE `ic37` (
  `id` mediumint(8) unsigned NOT NULL auto_increment,
  `company` varchar(500) default NULL,
  `person` varchar(500) default NULL,
  `phone` varchar(500) default NULL,
  `mobile` varchar(500) default NULL,
  `qq` varchar(500) default NULL,
  `msn` varchar(500) default NULL,
  `fax` varchar(500) default NULL,
  `email` varchar(500) default NULL,
  `address` varchar(1000) default NULL,
  `country` varchar(500) default NULL,
  `region` varchar(500) default NULL,
  `zip` varchar(500) default NULL,
  `web` varchar(500) default NULL,
  `shopUrl` varchar(500) default NULL,
  PRIMARY KEY  (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=gb2312
*/

$k = new ic37();

// Part numbers to search; array_unique() drops the repeated entries.
$arr = array_unique(array(
    'MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA',
    'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E',
    'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N',
    'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232',
    'STM32F103', 'LM358', 'NE555', '78L05', 'LM324', 'TL431', 'PC817', '7805',
    'LM339', 'LM317', '46A-3GRI', 'MODEL', '78L05', '93C46-3GRI', '8050',
    'DS18B20', 'TDA2030', 'LM393', '74HC595', '6N137', 'SN75176BDR',
));

foreach ($arr as $v) {
    $k->go($v);
}
?>


原创粉丝点击