blog search in Perl 量化的角度看blog

来源:互联网 发布:免费发货单打印软件 编辑:程序博客网 时间:2024/09/21 09:23


###################################################################
#
#
#          this perl script is written by Zui Tao (wineinceramic@163.com)
#          It is used to claw blog page url from the internet and store them into a local
#          MS access database. though this script is simple and strait forward,
#          blog itself as linked by the owner's favor may indecate a kind of
#          relationship that help to form a map and show us a whole view of ciber community
#          farther research would be using some data mining algorithm such as apriori....
#
#
#
#          Though it is succeed in clawing a minimun of blog links , it should be upgraded to
#          be a serious research。This script could  be copied and rewirte at anyone's  will.
#      
#
#          无论如何,初始化第一个开始的URL是比较重要的一个线索
#          这里本人选择在sina的blog里人气比较热的http://blog.sina.com.cn/m/xiaojingzi来作为初始化URL
#          目的在于研究blog作为一个媒介的联系关系,研究其间人际的网络是如何铺展的

#
#           3-15-2006
#
###################################################################


#read start index and total number and search ranks of links from config file

$index;
$totalNumber;
$ranks;
$iHASH = 'NUN';

use FileHandle;
$fh = new FileHandle "config.txt", "r";
if(defined $fh)
{
   $s = $fh->getline();
   $index = 0 + $s;
   $s = $fh->getline();
   $totalNumber = 0 + $s;
   $s = $fh->getline();
   $ranks = 0 + $s;
   undef $fh;
}

#store total number to current number
$currentTotalNumber = $totalNumber;

#open database first
use Win32::ODBC;
$dsn = "testDB";

$table = "DATASTORE";
# ====== Check if the database can be opened correctly
if (!($db = new Win32::ODBC($dsn))) {
 print "Error in opening DSN /"$dsn/"!/n";
 print "Error: " . Win32::ODBC::Error(). "/n";
 exit;
}

 

#begin search loop
#if search ranks exist and total - pre number > search ranks than quit
while(($currentTotalNumber - $totalNumber) < $ranks)
{

      # ====== Select all fields from the given table
      $sql = "select * from $table where INDEXNUM = $index;";

      #read url from database by the index
      if ($db->Sql($sql)) {
   print "Error in SQL query: /"$sql/"!/n";
   print "Error: " . $db->Error() . "/n";
   $db->Close();
   exit;
      }
      while($db->FetchRow()) {
     ($HASH, $URL) = $db->Data("HASHCODE", "URL");
      }
      print $URL ,"/n";


      #begin to get each link
      use LWP::UserAgent;
      use FileHandle;
      $ua = LWP::UserAgent->new;
      my $req = HTTP::Request->new(GET => $URL);
      $res = $ua->request($req, "tempHTMLStoreFile.txt");
      if ($res->is_success)
      {
          print "ok/n";
          use HTML::TreeBuilder;
          my $tree = HTML::TreeBuilder->new();
          $tree->parse_file("tempHTMLStoreFile.txt");
          foreach my $a ($tree->look_down('_tag', 'a',))
          {
           #$tag = $a->tag;
           if(defined $a)
           {
             

              $hrefvalue = $a->attr('href');
              if(defined $hrefvalue)
              {
                 $hrefvalue = lc $hrefvalue;
                 #pre check if this link is a blog link simply to see if it contain the word 'blog'
                
                 ###################################################################
                 #
                 #this section is used for filtering some names that supposed not
                 #to be blog urls ,some may be image links
                 #
                 ####################################################################
                 $_ = $hrefvalue;
                 /blog/;                
                 if(!defined($&))
                 {
                     next; #only the url contain blog can go farther
                 }
                
                 $_ = $hrefvalue;
                 /image/;                
                 if($& eq 'image')
                 {
                     next; #only the url contain blog can go farther
                 }
                
                 $_ = $hrefvalue;
                 //'/;                
                 if($& eq '/'')
                 {
                     next; #only the url contain blog can go farther
                 }
                
                 #pre check if this link is a blog link simply to see if it contain the word 'http://'
                 $_ = $hrefvalue;
                 /http://///;
                
                 ######################################################################
                 if($& eq 'http://')
                 {
                    #try to store this url here and increment the currentTotalNumber

                    #.................................
                    #.................................
                    print $hrefvalue,"/n";
                   
                    use Digest::MD5 qw(md5_hex);
                    $hashcode = md5_hex($hrefvalue);
                    $sqlLink = "select * from $table where HASHCODE = '$hashcode';";
                    if ($db->Sql($sqlLink))
                    {
                 print "Error in SQL query: /"$sql/"!/n";
                 print "Error: " . $db->Error() . "/n";
                 $db->Close();
                 exit;
                    }
                    if(defined $iHASH)
                    {
                       $iHASH = 'NUN';
                    }
                    while($db->FetchRow())
                    {
                   ($iHASH,$iLINK) = $db->Data("HASHCODE","LINKNO");
                    }
                   
                    print $iHASH ,"/n";
                   
                    #if links exist add the index to its link field
                    #else store the links and total number increment  1
                    if($iHASH eq 'NUN')#store the href
                    {    
                       $iStr = sprintf("%d",$index);
                       $currentTotalNumber = $currentTotalNumber + 1;
                       $sqlinsert = "INSERT INTO DATASTORE VALUES ($currentTotalNumber,'$hashcode','$hrefvalue', '$iStr')";
                       $rc = $db->Sql($sqlinsert);
                       die qq(SQL 失败 "$sqlinsert": ), $db->Error(), qq(n) if $rc;
                    }
                    else                 #save the index
                    {                     

                       $iStr = sprintf("%d",$index);
                       $iLINK = join('',$iLINK,", ",$iStr);
                       $sqlupdate = "UPDATE DATASTORE SET LINKNO = '$iLINK' WHERE HASHCODE = '$iHASH'";
                       $rc = $db->Sql($sqlupdate);
                       die qq(SQL 失败 "$sqlupdate": ), $db->Error(), qq(n) if $rc;
                    }                              
                  }#if(defined($&))
               }#if(defined $hrefvalue)
            }#if(defined $a)
           

           }#foreach my $a ($tree->look_down('_tag', 'a',))
           $tree->delete;
       }#if ($res->is_success)
       else
       {
        print $res->status_line, "/n";
       }


      #end getting each links

      #start index increment
      $index = $index + 1;
  
      #store current index and total number in to config file


      use IO::File;
      $filehdler = new IO::File " > config.txt";
      if (defined $filehdler) {
          print $filehdler $index ,"/n";
          print $filehdler $currentTotalNumber ,"/n";
          print $filehdler $ranks ,"/n";
          $filehdler->close;
      }
}
#end search loop  

#close database
$db->Close();


print $currentTotalNumber,"/n";

原创粉丝点击