采集某个网站下的列表url及url的size

来源：互联网　发布：单片机与plc的区别知乎　编辑：程序博客网　时间：2024/06/01 09:40

<?php
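/**
   Purpose: collect the list URLs found on a website together with each URL's size.
   Usage: download Http.class.php from the lib directory of canphp 2.0, and download simple_html_dom.php.
   error.txt   stores the URLs whose fetch failed (e.g. timed out)
   listurl.txt stores each crawled list URL and its size
   weburl.txt  stores every distinct URL discovered so far

*/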

include ('Http.class.php');
include ('simple_html_dom.php');
// Crawl step for one URL per request: fetch the URL selected by ?page=N,
// harvest its "/suppliers/" links into weburl.txt, record the URL and its
// size in listurl.txt, then redirect the browser to page N+1.

// Page counter. Cast to int (minimum 1) because the value is used as an array
// index and is echoed into the inline redirect script below.
$page = isset ( $_GET ['page'] ) ? max ( 1, ( int ) $_GET ['page'] ) : 1;

// Every URL collected so far (one per line in weburl.txt).
$pieces = getTextArray ();
if ($page == 1) {
   // The first page always starts from the seed URL.
   $weburl = "http://product.cn.china.cn/suppliers/%B9%E3%D6%DD/";
} else {
   // Past the end of the list there is no URL left to crawl.
   $weburl = isset ( $pieces [$page - 1] ) ? $pieces [$page - 1] : "";
}

echo $weburl . "<br><br>";

// Stop here when the list is exhausted; without this guard the empty URL
// would hit the failure branch below and redirect to itself forever.
if (empty ( $weburl )) {
   exit ();
}

$lista = $pieces;

// Fetch the current URL.
$htmlcontent = Http::doGet ( $weburl );
if (empty ( $htmlcontent )) {
   // Log the failed URL and reload the same page to retry it. Execution must
   // end here: nothing below can work without a fetched document.
   // NOTE(review): a URL that fails permanently is retried indefinitely.
   $errorfile = "error.txt";
   $error_str = $weburl . "\n";
   file_put_contents ( $errorfile, $error_str, FILE_APPEND );
   echo "<script language='javascript'>window.location.href='list_url_size_collect.php?page=" . $page . "'</script>";
   exit ();
}

$listaSize = "";
// str_get_html() can return false (e.g. for oversized input); in that case the
// page is treated as having no links and no size.
$html = str_get_html ( $htmlcontent );
if ($html) {
   // Collect every link that contains "/suppliers/" and does not contain the
   // current URL itself.
   foreach ( $html->find ( 'a' ) as $e ) {
      $href = ( string ) $e->href;
      // Compare against false explicitly: strpos() returns 0 (falsy) when the
      // href starts with "/suppliers/".
      if (strpos ( $href, "/suppliers/" ) !== false && strpos ( $href, $weburl ) === false) {
         $lista [] = $href;
      }
   }

   // The size is the first run of digits inside span.pageNum (last span wins).
   foreach ( $html->find ( 'span.pageNum' ) as $e ) {
      if (preg_match ( "/(\d+)/", $e->outertext, $matches )) {
         $listaSize = $matches [0];
      }
   }
}

// Cap the size at 100; an unknown size stays empty.
if ($listaSize !== "" && ( int ) $listaSize > 100) {
   $listaSize = 100;
}

// Append the current URL and its size to listurl.txt.
$listfile = "listurl.txt";
$listfile_str = $weburl . "@@@@" . $listaSize . "\n";
file_put_contents ( $listfile, $listfile_str, FILE_APPEND );

// Rewrite weburl.txt with the de-duplicated URL list. array_unique() keeps the
// first occurrence of each URL, so existing line positions stay stable.
$webfile = "weburl.txt";
$webfile_str = "";
foreach ( array_unique ( $lista ) as $v ) {
   if (! empty ( $v )) {
      $webfile_str .= $v . "\n";
   }
}
file_put_contents ( $webfile, $webfile_str );

// Move on to the next URL in the list.
$page ++;
echo "<script language='javascript'>window.location.href='list_url_size_collect.php?page=" . $page . "'</script>";
/**
 * Read a text file that holds one URL per line and return the lines as a list.
 *
 * Empty lines are skipped, so a missing trailing newline no longer costs the
 * last entry, and both "\n" and "\r\n" line endings are accepted.
 *
 * @param string $filename Path of the file to read; defaults to "weburl.txt".
 * @return array Non-empty lines in file order; an empty array when the file
 *               does not exist or cannot be read.
 */
function getTextArray($filename = "weburl.txt") {
   if (! is_file ( $filename ) || ! is_readable ( $filename )) {
      return array ();
   }
   $urlcontent = file_get_contents ( $filename );
   if ($urlcontent === false) {
      return array ();
   }
   $pieces = array ();
   foreach ( preg_split ( '/\r\n|\r|\n/', $urlcontent ) as $line ) {
      if ($line !== "") {
         $pieces [] = $line;
      }
   }
   return $pieces;
}