Crawling CSDN on Windows
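The Perl script below crawls a CSDN blog on Windows: starting from the blog home page (http://blog.csdn.net/zhaoyangjian724) it collects the category links and category names, walks every result page of each category, and saves each article to F:\20161219\&lt;category&gt;\&lt;title&gt;.html. Category names and article titles are re-encoded from UTF-8 to GBK so they can be used as Windows directory and file names.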

use strict;
use warnings;
use LWP::UserAgent;
use Encode;
use HTML::TreeBuilder::XPath;

my $base_dir = "F:\\20161219";

my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->agent("Mozilla/8.0");

# Fetch the blog home page and save it locally.
my $response = $ua->get('http://blog.csdn.net/zhaoyangjian724');
open my $datafh, '>', 'csdn.html' or die "open csdn file failed: $!";
print $datafh $response->content if $response->is_success;
close $datafh;

my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse_file('csdn.html');

# Category links look like:
#   <a href="/zhaoyangjian724/article/category/1756569" ...>Oracle dump解析</a>
# Collect every <a> whose href contains "category"; @href holds all category URLs.
my @href;
foreach my $link ($tree->find_by_tag_name('a')) {
    my $href = $link->attr('href');
    next unless defined $href and $href =~ /category/;
    print "\$href is $href\n";
    push @href, $href;
}
print "\@href is @href\n";

# Category names live under <ul class="panel_body"><li><a>...</a></li></ul>.
# Re-encode from UTF-8 to GBK so they work as Windows directory names.
my @type = map { encode('gbk', decode('utf8', $_)) }
           $tree->findvalues('/html/body//ul[@class="panel_body"]/li/a');
print "\@type is @type\n";

for my $i (0 .. $#href) {
    print "$href[$i]===$type[$i]\n";
}

# Walk every category.
for my $i (0 .. $#type) {
    print "\$type[$i] is $type[$i]\n";
    mkdir "$base_dir/$type[$i]" unless -d "$base_dir/$type[$i]";
    chdir "$base_dir/$type[$i]" or die "chdir $type[$i] failed: $!";

    # Category front page, saved as fh1.html.
    my $response = $ua->get("http://blog.csdn.net$href[$i]");
    print "\$href[$i] is $href[$i]\n";
    open my $fh1, '>', 'fh1.html' or die "open fh1 failed: $!";
    print $fh1 $response->content;
    close $fh1;

    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse_file('fh1.html');

    # Page count comes from the pager text, e.g. "150条数据  共8页".
    # NB: the original author notes this extraction method is fragile.
    my @pager = $tree->findvalues('/html/body//div[@id="papelist"]/span');
    my $pages;
    if ($pager[0] and $pager[0] =~ /.*\s+.*?(\d+)/) { $pages = $1 }
    $pages = 1 unless $pages;    # no pager means a single page
    print "\$pages is $pages\n";
    sleep 1;

    # Walk every page of the category.
    for my $j (1 .. $pages) {
        my $url = "http://blog.csdn.net$href[$i]/$j";
        print "\$url is $url\n";
        my $response = $ua->get($url);
        open my $fh2, '>', 'fh2.html' or die "open fh2 failed: $!";
        print $fh2 $response->content;
        close $fh2;

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse_file('fh2.html');

        # Article titles on this page.
        my @pageTitles = map { encode('gbk', decode('utf8', $_)) }
                         $tree->findvalues('/html/body//span[@class="link_title"]');
        print "\@pageTitles is @pageTitles\n";

        # Article URLs. Besides the category's own articles the page also
        # carries "top reads" links, so de-duplicate and keep only as many
        # URLs as there are titles.
        my (@urlall, %seen);
        foreach my $link ($tree->find_by_tag_name('a')) {
            my $href = $link->attr('href');
            next unless defined $href
                and $href =~ m{zhaoyangjian724/article/details/(\d+)$};
            next if $seen{$href}++;
            push @urlall, $href;
        }
        print "\@urlall is @urlall\n";
        sleep 1;
        @urlall = @urlall[0 .. $#pageTitles] if @urlall > @pageTitles;

        # Fetch every article and save it under its sanitized title.
        for my $m (0 .. $#urlall) {
            # Strip whitespace and the characters Windows forbids in file names.
            $pageTitles[$m] =~ s/[\s*\\\/:|<>?"']+//g;
            print "$pageTitles[$m]======$urlall[$m]\n";
            my $response = $ua->get("http://blog.csdn.net$urlall[$m]");
            open my $fh3, '>', "$pageTitles[$m].html"
                or die "open $pageTitles[$m].html failed: $!";
            print $fh3 $response->content;
            close $fh3;
        }
    }    # end of page loop
    chdir $base_dir or die "chdir $base_dir failed: $!";
}    # end of category loop
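The script round-trips every response through a temp file (csdn.html, fh1.html, fh2.html) only to re-parse it. HTML::TreeBuilder::XPath inherits parse_content() from HTML::TreeBuilder, so each fetch can be parsed directly in memory instead; a minimal sketch, using the same blog URL and XPath as above:

use strict;
use warnings;
use LWP::UserAgent;
use HTML::TreeBuilder::XPath;

my $ua = LWP::UserAgent->new(timeout => 10);
my $response = $ua->get('http://blog.csdn.net/zhaoyangjian724');
die 'GET failed: ' . $response->status_line unless $response->is_success;

# parse_content() builds the tree straight from the string -- no temp file.
my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse_content($response->decoded_content);
my @titles = $tree->findvalues('//span[@class="link_title"]');
print "$_\n" for @titles;
$tree->delete;    # free the parse tree when done

decoded_content also honors the charset the server declares, which removes the manual decode('utf8', ...) step; re-encoding to GBK is still needed when the strings become Windows file or directory names.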

