毕设写的简单spider类[正则等内容]

来源:互联网 发布:淘宝双11怎么抢先付款 编辑:程序博客网 时间:2024/06/03 19:35

<?php
class Spider {
 
 // Database settings. No method visible in this class reads these four
 // properties; gatherInfo() passes literal connection values instead.
 var $mysql_host;
 var $mysql_name;
 var $mysql_pwd;
 var $mysql_db;

 // Crawl state. None of the methods visible in this class reads or writes these.
 var $parentUrl;   // URL where the search starts
 var $searchNum;   // number of levels to search
 var $url;
 var $db;

 //数据库连接函数
 function connect_to_db($mysql_host,$mysql_name,$mysql_pwd){
  $db=mysql_connect($mysql_host,$mysql_name,$mysql_pwd);
  return $db;
 }
 

 //处理url,以符合标准
 function dealUrl($url){
  if(strstr($url,"http://")){
   
   }else{
    $url="http://".$url;
   }
   if(strrpos($url,'/')==strlen($url)-1){
    $url = substr($url,0,-1);
   }
   return $url;
 }

 //取一个链接下的所有链接
 function getUrl($url){
  $fcontents = file($url);
  $nextUrl = "succeed";
  while(list(,$line)=each($fcontents)){

   //while(eregi('(href[[:space:]]*=[[:space:]]*"?[[:alnum:]:@/._-]+"?)(.*)',$line,$regs)){
 while(eregi('(href[[:space:]]*=[[:space:]]*"?[[:alnum:]:@/._-]+[^([:space:]|/>|")]*)(.*)',$line,$regs)){
    $regs[1] =
    eregi_replace('(href[[:space:]]*=[[:space:]]*"?)([[:alnum:]:@/._-]+)("?)',"//2",$regs[1]);
    
    if(strstr($regs[1],"http://")){
    }else{
     $regs[1]=$url."/".$regs[1];
    }
    
    //echo "&nbsp;&nbsp;&nbsp;&nbsp;$regs[1]<br>";
    $line = $regs[2];
    if(strstr($nextUrl,$regs[1])){
     
    }else{
if(strstr($regs[1],".php")||strstr($regs[1],".asp")||strstr($regs[1],".jsp")||strstr($regs[1],".htm")||strstr($regs[1],".com")||strstr($regs[1],".cn")||strstr($regs[1],".net")||strstr($regs[1],".org")){
      if(strstr($regs[1],"_bak")){
      
      }else{
       $nextUrl=$nextUrl.",".$regs[1];      
      }

     }
    }
   }
  
  }
  return $nextUrl;
 }
 
 
 //查询该URL是否需要重新搜索
 function queryUrl($url,$contentDesc,$db){
  mysql_select_db("SearchEngine");
  $sql="select * from visited where visitedUrl='".$url."' and contentDesc='".$contentDesc."'";
  $rs=mysql_query($sql,$db);
  if(mysql_fetch_row($rs)){
   return false;
  }else{
   return true;
  }
 }

 
 //得到图片大小
 function getImgLength($url){
  $info = @file($url);
  if($info){
   $info = implode("",$info);
   return strlen($info)." 字节";
  }else{
   return 0;
  }
 }

 //取得该url内的图片地址,及网页内容,保存日期,所在服务器IP,文件大小
 //存入数据库
 function gatherInfo($url){
  $url=$this->dealUrl($url);
  

  $content=$this->getUrlContent($url);
  if($content!=""&&strstr($this->getUrlResponse($url),"200")){
   //print_r($bodyInfo);
   $Ip=$this->getUrlIP($url);
   $Date=$this->getUrlDate($url);
   $imgInfo=$this->tags($url,"img");
   $url1=$url;
   for($i=0;$i<count($imgInfo);$i++){
    $imgSrc[$i]=$imgInfo[$i]['Attrs']['SRC'];
    $imgSize[$i]=$imgInfo[$i]['Attrs']['HEIGHT']."*".$imgInfo[$i]['Attrs']['WIDTH'];
    $theImgUrl[$i]=$imgSrc[$i];

    if(strstr($imgSrc[$i],"http://")){
    
    }else{
     if(strstr($url,".php")||strstr($url,".asp")||strstr($url,".jsp")||strstr($url,".htm")){
     $str=strrchr($url,'/');
      $url=str_replace($str,"",$url);
     }
     //if(strstr($url,".com")||strstr($url,".cn")||strstr($url,".net")||strstr($url,".org")){
      if(strrpos($url,'/')==strlen($url)-1){
       $url = substr($url,0,-1);
      }
     //}
     if(strpos($imgSrc[$i],'/')==0){
      $imgSrc[$i]=$url.$imgSrc[$i];
     }else{
      $imgSrc[$i]=$url."/".$imgSrc[$i];
     }
    }
    
   }
   for($i=0;$i<count($theImgUrl);$i++){
    $content=str_replace($theImgUrl[$i],$imgSrc[$i],$content);
   }
   for($i=0;$i<count($imgInfo);$i++){
    /*$imgSrc[$i]=$imgInfo[$i]['Attrs']['SRC'];
    $imgSize[$i]=$imgInfo[$i]['Attrs']['HEIGHT']."*".$imgInfo[$i]['Attrs']['WIDTH'];
    $theImgUrl[$i]=$imgSrc[$i];

    if(strstr($imgSrc[$i],"http://")){
    
    }else{
     if(strstr($url,".php")||strstr($url,".asp")||strstr($url,".jsp")||strstr($url,".htm")){
     $str=strrchr($url,'/');
      $url=str_replace($str,"",$url);
     }
     //if(strstr($url,".com")||strstr($url,".cn")||strstr($url,".net")||strstr($url,".org")){
      if(strrpos($url,'/')==strlen($url)-1){
       $url = substr($url,0,-1);
      }
     //}
     if(strpos($imgSrc[$i],'/')==0){
      $imgSrc[$i]=$url.$imgSrc[$i];
     }else{
      $imgSrc[$i]=$url."/".$imgSrc[$i];
     }
    }*/
    $length[$i]=$this->getImgLength($imgSrc[$i]);
    $db=$this->connect_to_db("localhost","root","");
    $contentDesc=strip_tags($content);
    mysql_select_db("SearchEngine");
    $sql="insert into contentgather set url='".$url1."', date='".$Date."', ip='".$Ip."', length='".$length[$i]."', imgurl='".$imgSrc[$i]."', size='".$imgSize[$i]."', content='".$content."'";
    //echo $sql;
    mysql_query($sql,$db);
    $contentGather_Id=mysql_insert_id();
    $contentDesc=strip_tags($content);
    $title=$this->getUrlTitle($url1);
    $imgName=$imgSrc[$i];
    //echo $contentGather_Id."<br>";
    //echo $contentDesc."<br>";
    //echo $title;
    $sql="insert into suoyin set url='".$url1."', imgName='".$imgName."', title='".$title."', contentDesc='".$contentDesc."', contentGather_Id=".$contentGather_Id;
    //echo $sql;
    mysql_query($sql,$db);

    //echo $imgSrc[$i]."chicun:".$imgSize[$i]."daxiao:".$length[$i]."<br>";
   }
   echo "完成搜集:".$url1."<br>";
  }
  
 }


 function getUrlResponse($url){
  $fp = @fopen($url,"r");
  return $http_response_header[0];
 }

 //取文件保存日期
 function getUrlDate($url){
  $fp = @fopen($url,"r");
  return $http_response_header[1];
 }

 //取某网址对应的IP
 function getUrlIP($url){
  $url_stuff = parse_url($url);
  return gethostbyname($url_stuff['host']);
 }


 //网页标签提取函数
 function tags($filename,$tag) {
   $buffer = @join("",file($filename));
   $buffer = eregi_replace("/r/n","",$buffer);
   $tagkey = sql_regcase($tag);
   $buffer = eregi_replace("<$tagkey ","/n<$tag ",$buffer);
   $ar = split("/n",$buffer);

   foreach($ar as $v) {
  if(! eregi("<$tagkey ",$v)) continue;
  eregi("<$tagkey ([^>]*)((.*)</$tagkey)?",$v,$regs);
  $p[tagName] = strtoupper($tag);
  if($regs[3])
    $p[Text] = $regs[3];
  $s = trim(eregi_replace("[ /t]+"," ",$regs[1]))." ";
  $s = eregi_replace(" *= *","=",$s);

  $a = split(" ",$s);
  for($i=0;$i<count($a);$i++) {
    $ch = array();
    if(eregi("=[/"']",$a[$i])) {
   $j = $i+1;
   while(!eregi("[/"']$",$a[$i])) {
     $a[$i] .= " ".$a[$j];
     unset($a[$j]);
   }
    }
  }
  foreach($a as $k) {
    $name = strtoupper(strtok($k,"="));
    $value = strtok("/0");
    if(eregi("^[/"']",$value))
   $value = substr($value,1,-1);
    if($name)
   $p[Attrs][$name] = $value;
  }
  $pp[] = $p;
   }
   return $pp;
 }


 //取网页内容
 function getUrlContent($url){
  
  /*$fcontents = file($url);
  while(list(,$line)=each($fcontents)){

   //while(eregi('(href[[:space:]]*=[[:space:]]*"?[[:alnum:]:@/._-]+"?)(.*)',$line,$regs)){
 while(eregi('(href[[:space:]]*=[[:space:]]*"?[[:alnum:]:@/._-]+[^([:space:]|/>|")]*)(.*)',$line,$regs)){
    $tmp =
    eregi_replace('(href[[:space:]]*=[[:space:]]*"?)([[:alnum:]:@/._-]+)("?)',"//2",$regs[1]);
    
    if(strstr($tmp,"http://")){
    }else{
     $tmp=$url."/".$tmp;
    }
    
    $line=eregi_replace($regs[1],$tmp,$line);
    $content=$content.$line;
    //echo "&nbsp;&nbsp;&nbsp;&nbsp;$line<br>";
    $line = $regs[2];
   }
   
  }
  echo $content;*/
 if(strstr($this->getUrlResponse($url),"200")){

   $buffer = @join("",file($url));
   $buffer = eregi_replace("/r/n","",$buffer);
   if(eregi('(href[[:space:]]*=[[:space:]]*"?[[:alnum:]:@/._-]+[^([:space:]|/>|")]*)(.*)',$buffer,$regs)){
     $tmp =
     eregi_replace('(href[[:space:]]*=[[:space:]]*"?)([[:alnum:]:@/._-]+)("?)',"//2",$regs[1]);
      
     if(strstr($tmp,"http://")){
     }else{
      $tmp=$url."/".$tmp;
     }
      
     $buffer=eregi_replace($regs[1],$tmp,$buffer);
     $buffer=eregi_replace("/'","/"",$buffer);
     //echo "&nbsp;&nbsp;&nbsp;&nbsp;$regs[1]<br>";
   }
   if($buffer){
    return $buffer;
   }
 }
 }
 
 function nextUrl($url){
  $nextUrl=split(",",$this->getUrl($url));
  for($i=2;$i<count($nextUrl);$i++)
   {
if(strstr($nextUrl[$i],".php")||strstr($nextUrl[$i],".asp")||strstr($nextUrl[$i],".jsp")||strstr($nextUrl[$i],".htm")||strstr($nextUrl[$i],".com")||strstr($nextUrl[$i],".cn")||strstr($nextUrl[$i],".net")||strstr($nextUrl[$i],".org")){
    $theUrl=$nextUrl[$i];
    break;
   }
  }
  return $theUrl;
 }

 

 function mainSpider($url){
  
  for($i=0;$i<1;$i++){
   $this->gatherInfo($url);
   $nextUrl=split(",",$this->getUrl($url));
   for($j=1;$j<count($nextUrl)+1;$j++){
    //
    echo $nextUrl[$j]."<br>";
    $this->gatherInfo($nextUrl[$j]);
   }
   $url=$this->nextUrl($url);
  }
 }

  
 //提取网页标题函数
 function getUrlTitle($url){
  $buffer = @join("",file($url));
  $buffer = eregi_replace("/r/n","",$buffer);
  $num = strlen(strstr($buffer,"<title>"))-strlen(strstr($buffer,"</title>"));
  return $this->substr_for_gb2312(strstr($buffer,"<title>"),8,$num-8);
 }

 

 //无乱码截取中文字符串的函数
 function substr_for_gb2312($str,$start,$len=null)
 {
   $totlelength = strlen($str);

   //特例情况
   if ($len == null) $len = $totlelength;
   if ($len ==0) return "";
   if ($len >= $totlelength && $start == 0 ) return $str;
   if ($start > $totlelength) return "";

   //分析$start
   if ($start < 0 )  //$start<0时,转化为$start>0时的定位.
   {
    if ( abs($start) >= $totlelength ) $start = 0;
    else $start = $totlelength - abs($start);
   }
  
   //确定起始位置,当起始位拆分某汉字时,返回值包含此汉字.
   if ($start > 0)
   {
    $i = $start-1;
    $flag = -1;
    while ($i >= 0)
    {
   if ( ord(substr($str,$i,1)) > 160)
   {
    $flag = -1*$flag;
   }
   else break;
   $i--;
    }
    if($flag==1)
    {
   $start = $start - 1;
   $len++;        //保证不位移.
    }
  }
  $str = substr($str,$start);//截除字符串$str的$start位前的字符
  $totlelength = strlen($str);

  //确定结束位置,当结束位拆分某汉字时,返回值不包含此汉字.
  if ($len<0) $len = $totlelength - abs($len);
  if ($len <= 0) return "";
  $i=min($len,$totlelength);
  $i--;
  $flag = -1;
  while ($i >= 0)
  {
    if (ord(substr($str,$i,1))>160)
    {
     $flag=-1*$flag;
    }
    else break;
    $i--;
  }
  if($flag == 1)
    $len=$len-1;
  $subit=substr($str,0,$len);
  return $subit;
 }


}
?>