用php读取xxx.com.cn整个页面源码

来源：互联网　发布：java三大框架教材　编辑：程序博客网　时间：2024/05/16 07:23

问题描述：
读取它首页的完整源码，用了各种方式也搞不定它。
file_get_contents
fopen
fsockopen
wget
curl
都搞不定。都是读取了16k左右的时候就终止了。
哪位大侠对http1.1比较熟悉，看看可能是什么问题？
第一个提出有效建议的人会得到10Z币的报酬
-----------------------------------------------
解决方法：
具体原因还没搞明白，下面是几条线索。
http头里面
Transfer-Encoding: chunked
内容分段发送
Content-Encoding: gzip
gzip压缩内容
在易铎同学的帮助下，得到一个解决方法：
<?php
// Build the HTTP/1.1 request head. Every line must end with CRLF and the head
// must be terminated by one empty line; without that terminator the server
// keeps waiting for more headers and never answers.
$header = implode("\r\n", array(
    'GET / HTTP/1.1',
    'Accept: */*',
    'Accept-Language: zh-cn',
    'UA-CPU: x86',
    'Accept-Encoding: gzip, deflate',
    'User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; GoogleT5; SLCC1; .NET CLR 2.0.50727; CIBA)',
    'Host: xxx.com.cn',
    'Connection: close',
)) . "\r\n\r\n";

// Open the connection with a 10 second connect timeout and stop early if it fails,
// instead of writing to an invalid handle.
$fp = fsockopen('xxx.com.cn',80,$errno,$errstr,10);
if(!$fp)
{ die("Connect failed: $errstr ($errno)"); }
fwrite($fp,$header);

// Response head first, then the chunked + gzipped body.
echo 'Reading Header ... ';
$head = read_header($fp);
echo $head;
echo 'Reading contents ... ';
$content = read_content($fp);
echo $content;
/**
 * Collect the response head from an open stream.
 *
 * Lines are read (at most 1023 bytes per read) until a line that is empty after
 * trimming is met, or the stream is exhausted. That terminating line is consumed
 * but not included in the result.
 *
 * @param resource $fp Open stream positioned at the status line.
 * @return string The header lines exactly as read, line endings included.
 */
function read_header($fp)
{
    $lines = array();
    for (;;) {
        $row = fgets($fp, 1024);
        if (trim($row) == "") {
            break;
        }
        $lines[] = $row;
    }
    return implode('', $lines);
}
/**
 * Read a "Transfer-Encoding: chunked" body from $fp and gunzip it if needed.
 *
 * The stream must be positioned right after the empty line that ends the
 * response head. The stream is closed before the function returns.
 *
 * @param resource $fp Open stream positioned at the first chunk-size line.
 * @return string The reassembled body. It is gunzipped when it starts with the
 *                gzip magic bytes; otherwise (or if decoding fails) it is
 *                returned as received.
 */
function read_content($fp)
{
    $content = '';
    while (!feof($fp)) {
        $size_line = fgets($fp, 1024);
        if ($size_line === false) {
            break;
        }
        // Cut off an optional ";extension" so its letters are not parsed as hex digits.
        $parts = explode(';', $size_line, 2);
        $chunk_size = (int) hexdec(trim($parts[0]));
        if ($chunk_size == 0) {
            break; // terminating zero-size chunk
        }
        // fread() may return fewer bytes than requested on a socket, so keep
        // asking until the whole chunk has arrived.
        $remaining = $chunk_size;
        while ($remaining > 0) {
            $piece = fread($fp, $remaining);
            if ($piece === false || $piece === '') {
                break 2; // stream ended in the middle of a chunk
            }
            $content .= $piece;
            $remaining -= strlen($piece);
        }
        fgets($fp, 1024); // skip the CRLF that follows the chunk data
    }
    fclose($fp);

    // readgzfile() passes non-gzip data through untouched; keep that behaviour.
    if (strncmp($content, "\x1f\x8b", 2) !== 0) {
        return $content;
    }
    if (function_exists('gzdecode')) {
        // In-memory gunzip; no temporary file required.
        $plain = gzdecode($content);
        return $plain === false ? $content : $plain;
    }
    // No gzdecode() available: fall back to the temp-file round trip.
    $tmpfile = tempnam(sys_get_temp_dir(), 'webcache');
    file_put_contents($tmpfile, $content);
    ob_start();
    readgzfile($tmpfile);
    $plain = ob_get_contents();
    ob_end_clean();
    unlink($tmpfile);
    return $plain;
}
// Final status message, printed after the header and body have been echoed.
echo ' done!';
?>
严重感谢易铎同学提供的线索。
同时也感谢晶楠，王涛等同学的关注和支持。
此问题暂时告一段落。具体原因虽然不清楚，但是针对此站的问题已经解决了。
报酬已经发给易铎同学。
改写了部分代码
<?php
/**
 * Fetch $url with a hand-built HTTP/1.1 GET request.
 *
 * The response head is read with read_header() and the body with read_content().
 *
 * @param string $url Absolute http URL; the port defaults to 80 and the path to "/".
 * @return string|false The value returned by read_content(), or false when the
 *                      URL has no host or the connection cannot be opened.
 */
function http_request($url)
{
    $urlinfo = parse_url($url);
    if (!is_array($urlinfo) || empty($urlinfo['host'])) {
        return false;
    }
    // parse_url() omits 'path' for "http://host", so fall back to the root.
    $path = (isset($urlinfo['path']) && $urlinfo['path'] != '') ? $urlinfo['path'] : '/';
    if (isset($urlinfo['query'])) {
        $path .= '?' . $urlinfo['query'];
    }
    $port = isset($urlinfo['port']) ? $urlinfo['port'] : 80;
    // The Host header must name the requested host, plus the port when it is not 80.
    $host = $urlinfo['host'] . ($port == 80 ? '' : ':' . $port);

    // Each header line ends with CRLF; one empty line terminates the head.
    $header = "GET {$path} HTTP/1.1\r\n";
    $header.= "Accept: */*\r\n";
    $header.= "Accept-Language: zh-cn\r\n";
    $header.= "UA-CPU: x86\r\n";
    $header.= "Accept-Encoding: gzip, deflate\r\n";
    $header.= "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; .NET CLR 2.0.50727; CIBA)\r\n";
    $header.= "Host: {$host}\r\n";
    $header.= "Connection: close\r\n";
    $header.= "\r\n";

    $fp = fsockopen($urlinfo['host'],$port,$errno,$errstr,10);
    if(!$fp)
    { return false; }
    fwrite($fp,$header);
    $head = read_header($fp);
    $content = read_content($fp);
    return $content;
}
/**
 * Read response header lines from a stream up to the first blank line.
 *
 * Reading stops at a line that trims to the empty string (or when nothing more
 * can be read); that line is consumed and left out of the result.
 *
 * @param resource $fp Open stream positioned at the start of the response.
 * @return string Concatenated header lines, untouched.
 */
function read_header($fp)
{
    $collected = '';
    $row = fgets($fp, 1024);
    while (trim($row) != "") {
        $collected .= $row;
        $row = fgets($fp, 1024);
    }
    return $collected;
}
/**
 * Read a chunked response body from $fp and gunzip it when it is gzip data.
 *
 * The stream must be positioned right after the empty line that ends the
 * response head. The stream is closed before the function returns.
 *
 * @param resource $fp Open stream positioned at the first chunk-size line.
 * @return string The reassembled body; decoded when it starts with the gzip
 *                magic bytes, otherwise (or if decoding fails) as received.
 */
function read_content($fp)
{
    $content = '';
    while (!feof($fp)) {
        $size_line = fgets($fp, 1024);
        if ($size_line === false) {
            break;
        }
        // Ignore an optional ";extension" after the hexadecimal size.
        $parts = explode(';', $size_line, 2);
        $chunk_size = (int) hexdec(trim($parts[0]));
        if ($chunk_size == 0) {
            break; // last chunk
        }
        // A single fread() on a socket can come back short, so loop until the
        // chunk is complete.
        $remaining = $chunk_size;
        while ($remaining > 0) {
            $piece = fread($fp, $remaining);
            if ($piece === false || $piece === '') {
                break 2; // truncated stream
            }
            $content .= $piece;
            $remaining -= strlen($piece);
        }
        fgets($fp, 1024); // CRLF after the chunk data
    }
    fclose($fp);

    // Data that is not gzip is returned as-is, as readgzfile() would do.
    if (strncmp($content, "\x1f\x8b", 2) !== 0) {
        return $content;
    }
    if (function_exists('gzdecode')) {
        $plain = gzdecode($content);
        return $plain === false ? $content : $plain;
    }
    // Fallback when gzdecode() is unavailable: decode through a temp file.
    $tmpfile = tempnam(sys_get_temp_dir(), 'webcache');
    file_put_contents($tmpfile, $content);
    ob_start();
    readgzfile($tmpfile);
    $plain = ob_get_contents();
    ob_end_clean();
    unlink($tmpfile);
    return $plain;
}
// Fetch the page and print whatever http_request() returns for it.
echo http_request("http://nbbbs.enet.com.cn/");
?>
最终完结版！

<?php
/**
 * Simulate an HTTP request; supports gzip and chunked responses.
 *
 * Sends a hand-built HTTP/1.1 GET, reads the response head with read_header()
 * and hands the rest of the stream to read_content().
 *
 * @param string $url Absolute http URL; the port defaults to 80 and the path to "/".
 * @return string|false The value returned by read_content(), or false when the
 *                      URL has no host or the connection fails (in which case
 *                      the error number and message are echoed).
 */
function http_request($url)
{
    $urlinfo = parse_url($url);
    if (!is_array($urlinfo) || empty($urlinfo['host'])) {
        return false;
    }
    // 'path' is absent for "http://host"; request the root in that case.
    $path = (isset($urlinfo['path']) && $urlinfo['path'] != '') ? $urlinfo['path'] : '/';
    if (isset($urlinfo['query'])) {
        $path .= '?' . $urlinfo['query'];
    }
    $port = isset($urlinfo['port']) ? $urlinfo['port'] : 80;
    $host = $urlinfo['host'] . ($port == 80 ? '' : ':' . $port);

    // Header lines end with CRLF; exactly one empty line terminates the head.
    $header = "GET {$path} HTTP/1.1\r\n";
    $header.= "Accept: */*\r\n";
    $header.= "Accept-Language: zh-cn\r\n";
    $header.= "UA-CPU: x86\r\n";
    $header.= "Accept-Encoding: gzip, deflate\r\n";
    $header.= "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; .NET CLR 2.0.50727; CIBA)\r\n";
    $header.= "Host: {$host}\r\n";
    $header.= "Connection: close\r\n";
    $header.= "\r\n";

    $fp = fsockopen($urlinfo['host'],$port,$errno,$errstr,10);
    if(!$fp)
    {
        echo $errno.$errstr;
        return false;
    }
    fwrite($fp,$header);
    $head = read_header($fp);
    $content = read_content($fp,$head);
    return $content;
}
/**
 * Pull the response head off a stream.
 *
 * Consumes lines until one is blank after trimming (or no more data can be
 * read) and returns everything read before that line.
 *
 * @param resource $fp Open stream positioned at the start of the response.
 * @return string Raw header lines including their line endings.
 */
function read_header($fp)
{
    $head = '';
    while (true) {
        $current = fgets($fp, 1024);
        if (trim($current) == "") {
            return $head;
        }
        $head .= $current;
    }
}
/**
 * Read the response body from $fp, handling chunked transfer and gzip.
 *
 * The stream is closed once the body has been read. When the head does not
 * contain "200 OK" the stream is left untouched and open.
 *
 * @param resource $fp   Open stream positioned right after the response head.
 * @param string   $head Response head as returned by read_header().
 * @return string|false  false when $head does not contain "200 OK"; otherwise
 *                       the body, gunzipped when it starts with the gzip magic
 *                       bytes and returned as received when it does not (or
 *                       when decoding fails).
 */
function read_content($fp,$head='')
{
    // Compare against false explicitly: a match at offset 0 is still a match.
    if (strpos($head,'200 OK') === false) {
        return false;
    }
    $content = '';
    if (stripos($head,'chunk') !== false) {
        // Only chunked responses need to be reassembled piece by piece.
        while (!feof($fp)) {
            $size_line = fgets($fp, 1024);
            if ($size_line === false) {
                break;
            }
            // Ignore an optional ";extension" after the hexadecimal size.
            $parts = explode(';', $size_line, 2);
            $chunk_size = (int) hexdec(trim($parts[0]));
            if ($chunk_size == 0) {
                break; // last chunk
            }
            // fread() can return less than requested on a socket; loop until
            // the whole chunk is in.
            $remaining = $chunk_size;
            while ($remaining > 0) {
                $piece = fread($fp, $remaining);
                if ($piece === false || $piece === '') {
                    break 2; // truncated stream
                }
                $content .= $piece;
                $remaining -= strlen($piece);
            }
            // Skip the CRLF after the chunk data; it must not reach the output.
            fgets($fp, 1024);
        }
    } else {
        // Plain body: read everything up to end of stream.
        $rest = stream_get_contents($fp);
        $content = ($rest === false) ? '' : $rest;
    }
    fclose($fp);

    // Data that is not gzip is returned as-is, as readgzfile() would do.
    if (strncmp($content, "\x1f\x8b", 2) !== 0) {
        return $content;
    }
    if (function_exists('gzdecode')) {
        $plain = gzdecode($content);
        return $plain === false ? $content : $plain;
    }
    // Fallback when gzdecode() is unavailable: decode through a temp file.
    $tmpfile = tempnam(sys_get_temp_dir(), 'webcache');
    file_put_contents($tmpfile, $content);
    ob_start();
    readgzfile($tmpfile);
    $plain = ob_get_contents();
    ob_end_clean();
    unlink($tmpfile);
    return $plain;
}
?>
=========================================
网页读取

<?php
/**
* 抓新浪新闻
*
* @file $Source$
* @package snatch
* @author superspice
* @version $Id$
*/

//-----------------------------------------------------------------------------
// 新浪新闻小偷 By superspice
//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// 欢迎您指出存在的 b u g 和问题 , 更欢迎再加工
//-----------------------------------------------------------------------------
// 功能：1，自定义下载方式，适用windows和linux下最稳定运行
// 功能：2，自定义关键词选择性偷取新闻
// 功能：3，不提供成品，只生成抓取到内容，生成数组，可以生成页面，也可以进数据库
// 功能：4，去除新闻的广告
// 功能：5，抓取新闻的图片
// 抓到的图片还没加水印，需要加水印的自己加吧
//-----------------------------------------------------------------------------
//
//以下是返回数组的格式
/*
Array
(
 [0] => Array
 (
 [Url] => http://sports.sina.com.cn/golf/p/2004-08-0...061048574.shtml
 [Title] => 标题
 [Html] => Array
 (
 [Image] => Array
 (
 [0] => /News/Images/U364P6T12D1048574F44DT20040806180644.jpg
 )
 [Text] => 第一段正文
 第二段正文
 第三段正文
 )
 )
)
*/
//Download method: either 'wget' or 'php'
define('METHOD','wget');
//Whether wget downloads go through a proxy.
define('PROXY','on');
//Directory where wget stores the downloaded pages
define('WGET_DIR','/home/user1/snatchsina/WgetFiles');
//Web path (as seen by the site) of the image files
define('IMAGE_HTTP_FOLDER','/News/Images');
//Server-side filesystem path where image files are stored
define('IMAGE_FOLDER','/home/www/Html/News/Images');
//Keyword filter switch, 'yes' or 'no'; with 'no' every news item is fetched.
define('SET_KEYWORDS','yes');
//Keyword list; more entries may be appended
$Keywords = array('美女','网恋','情杀','赵忠祥','赵薇');

/**
* Decide whether a headline passes the keyword filter.
*
* When SET_KEYWORDS is "no" every headline passes. Otherwise each entry of the
* global $Keywords list is used as a regular expression and the headline
* passes as soon as one of them matches. ereg() no longer exists since PHP 7,
* so matching is done with preg_match().
*
* @author superspice
* @param string $Sentence headline to test
* @return boolean true when the headline should be kept, false otherwise
*/
function CheckKeywords($Sentence)
{
 if(SET_KEYWORDS == "no")
 { return true; }

 global $Keywords;
 foreach((array)$Keywords as $V)
 {
  // An empty pattern never counted as a hit with ereg(); keep it that way.
  if($V === '')
  { continue; }
  // "/" is the delimiter here, so escape it inside the keyword.
  if(preg_match('/'.str_replace('/', '\/', $V).'/', $Sentence))
  { return true; }
 }
 return false;
}
/**
* Download a page over a raw socket using an HTTP/1.0 GET request.
*
* The URL is split into host, optional port (default 80) and path (default
* "/"). The request is retried, up to three attempts in total, when the socket
* cannot be opened or when the received body length differs from the
* Content-Length header. A progress line is printed before returning.
*
* @author superspice
* @param string $location URL to download
* @return string downloaded body; empty string when the URL cannot be parsed
*                or no attempt could connect
*/
 function readAll($location){
 $MAX_RETRY_NUM = 3;
 // Defined up front so the final print/return never see an undefined variable.
 $body = "";
 if(preg_match('/(http:\/\/)?([^\/|\:]+)(:\d{1,5})?(\/.+)?/', $location, $url)){
  // Trailing optional groups are missing from $url when they did not match.
  $host = $url[2];
  $port = isset($url[3]) ? preg_replace("/:/", "", $url[3]) : "";
  $port = strlen($port) ? (int)$port : 80;
  $path = (isset($url[4]) && strlen($url[4])) ? $url[4] : "/";
  $request = "GET $path HTTP/1.0\r\n";
  $request .= "Host: $host\r\n";
  $request .= "Accept: */*\r\n";
  $request .= "Connection: close\r\n";
  $request .= "User-Agent: superspice's BOT\r\n\r\n";
  $retry = 0;
  do{
   $do = false;
   $header = "";
   $body = "";
   // fsockopen() fills these by reference: error number first, then message.
   $socket = fsockopen($host, $port, $errCode, $errMsg);
   if(!$socket){
    // Could not connect: count the attempt and try again.
    $do = true;
    $retry ++;
    continue;
   }
   fwrite($socket, $request);
   $flag = true; // true while still reading header lines
   while(!feof($socket)){
    $line = fgets($socket, 128);
    if($line === false){
     break;
    }
    if($flag){
     if(trim(chop($line)) == ""){
      $flag = false;
     }else{
      $header .= $line;
     }
    }else{
     $body .= $line;
    }
   }
   // A short body means the transfer was cut off; ask for another attempt.
   if(preg_match('/Content-Length: (\d+)/', $header, $out)){
    $do = ($out[1] != strlen($body));
   }
   fclose($socket);
   $retry ++;
  }while($do && $retry<$MAX_RETRY_NUM);
 }
 print("Fetching " . $location. " content-length: ". strlen($body));
 return $body;
 }

/**
* Download a page by shelling out to wget.
*
* The page is saved under WGET_DIR and then read back. The URL and the target
* path are shell-escaped because the URL may come from scraped pages. The file
* name avoids ":" and carries a unique suffix so that two downloads in the
* same second do not overwrite each other.
*
* @author superspice
* @param string $url URL to download
* @return string downloaded content; empty string when nothing was saved
*/
 function wget($url)
 {
 $Random = WGET_DIR."/".date("Y-m-d-H-i-s")."-".uniqid().".html";
 system("wget ".escapeshellarg($url)." -O ".escapeshellarg($Random)." --proxy=".PROXY);
 if(!is_file($Random))
 {
  return "";
 }
 // file_get_contents() also copes with a zero-length file.
 $Content = file_get_contents($Random);
 return $Content === false ? "" : $Content;
 }
/**
* Fetch a page with the requested backend.
*
* "php" delegates to readAll(), "wget" delegates to wget(); for any other
* method nothing is returned.
*
* @author superspice
* @param string $Url URL to download
* @param string $Method backend name, "php" or "wget"
* @return string downloaded content
*/
function GetContent($Url,$Method)
{
    switch ($Method) {
        case "php":
            return readAll($Url);
        case "wget":
            return wget($Url);
    }
}
/**
* Extract the article text and images from a news page.
*
* NOTE(review): two patterns below appear to have lost their HTML-tag literals
* when this code was published; they are kept as found and must be restored
* from the original source before the extraction can be trusted.
*
* @author superspice
* @param string $Url URL of the article
* @param string $Method download method, passed on to GetContent()/getImage()
* @return array array with keys 'Image' (list of image locations returned by
*               getImage()) and 'Text' (extracted paragraphs, one per line)
*/
function getHtml($Url, $Method)
{
 $Content = GetContent($Url, $Method);

 // NOTE(review): pattern looks truncated (closing part missing) - confirm.
 $Body = '';
 if(preg_match("/<html>(.+?) /is", $Content, $Out))
 {
  $Body = $Out[1];
 }

 // Defaults to an empty list so the loop below is safe when nothing matches.
 $Paragraphs = array();
 if(preg_match_all('/(.+?)<\/p>/is', $Body, $Out))
 {
  $Paragraphs = $Out[1];
 }

 $Text = '';
 foreach($Paragraphs as $V)
 {
  // NOTE(review): the tag(s) this was meant to strip are missing from the
  // pattern - confirm against the original.
  $Text .= "".preg_replace("/(.*?)/si","",$V)."\n";
 }

 $ImageArray = array();
 if(preg_match_all("/<center><img src=(.*?) border=1> /is", $Content, $Image))
 {
  foreach($Image[1] as $V)
  {
   $ImageArray[] = getImage($V, $Method);
  }
 }

 // Quoted keys: bare words would be looked up as constants.
 $Return = array();
 $Return['Image'] = $ImageArray;
 $Return['Text'] = $Text;
 return $Return;
}
/**
* Download an image and return the location to use for it.
*
* With the "wget" method the image is saved into IMAGE_FOLDER (created when
* missing) and its path below IMAGE_HTTP_FOLDER is returned. With the "php"
* method nothing is downloaded and the original URL is returned. Any other
* method returns nothing.
*
* @author superspice
* @param string $Url image URL
* @param string $Method download method, "wget" or "php"
* @return string location of the image
*/
function getImage($Url, $Method)
{
 $FileName = parserUrl($Url);

 if($Method == "wget")
 {
  if(!file_exists(IMAGE_FOLDER))
  {
   mkdir(IMAGE_FOLDER, 0777, true);
  }
  $Target = IMAGE_FOLDER."/".$FileName;
  // The URL comes from scraped HTML, so escape everything passed to the shell.
  system("wget ".escapeshellarg($Url)." -O ".escapeshellarg($Target)." --proxy=".PROXY);
  return IMAGE_HTTP_FOLDER."/".$FileName;
 }

 if($Method == "php")
 {
  return $Url;
 }
}
/**
* Derive a file name from a URL.
*
* Every "http://" occurrence is removed and the text after the last "/" is
* returned (the whole remainder when there is no "/").
*
* @author superspice
* @param string $Url URL to analyse
* @return string last path segment
*/
function parserUrl($Url)
{
    $Segments = explode("/", str_replace("http://", "", $Url));
    return end($Segments);
}
/**
* Scrape a news index page.
*
* Every <li> item of the page is stripped of "[...]" segments, checked with
* CheckKeywords() and, when it contains a link, turned into one result entry
* whose article is fetched with getHtml().
*
* @author superspice
* @param string $Url URL of the index page
* @return array list of entries, each with keys 'Url', 'Title' and 'Html'
*/
function snatch($Url)
{
 $Content = GetContent($Url,METHOD);

 // Defaults to an empty list so the loop is safe when the page has no items.
 $Items = array();
 if(preg_match_all('/<li>(.+?)<\/li>/is', $Content, $Out))
 {
  $Items = $Out[1];
 }

 $Content1 = array();
 foreach($Items as $Val)
 {
  // Drop "[...]" segments before the keyword test.
  $Val = preg_replace('/\[(.*?)\]/si',"",$Val);
  if(CheckKeywords($Val) == true)
  {
   // NOTE(review): this pattern is empty as published; the tag it was meant
   // to strip appears to have been lost - confirm against the original.
   $Val = preg_replace("//si", "", $Val);
   if(preg_match('/<a href=(.+?) target=_blank>(.+?)<\/a>/is', $Val, $URL))
   {
    // Quoted keys: bare words would be looked up as constants.
    $Content1[] = array(
     'Url' => $URL[1],
     'Title' => $URL[2],
     'Html' => getHtml($URL[1], METHOD),
    );
   }
  }
 }
 return $Content1;
}
//Run the scraper on the news index page
$ReturnShow = snatch("http://news.sina.com.cn/news1000/index.shtml");
//Dump the resulting array to inspect it
print_r($ReturnShow);
?>