基于Mojo的perl爬虫

来源:互联网 发布:递归算法反汇编 编辑:程序博客网 时间:2024/06/14 19:47
不多说 Mojo 的perl爬虫,我不懂HTML,CSS,XML,甚至是面向对象的编成也是一知半解。
所以只能照葫芦画瓢,写的不好不要喷
另外我对于 如何获取动态页面的信息十分困惑,例如如何 获取JS接口 发送的信息我就没有看懂 
所以请路过大神指点一二 
可以留言 鄙人QQ:326889964
#!/usr/bin/perluse strict;use 5.20.0;use Mojo::UserAgent;use Bloom::Filter;use Smart::Comments;use utf8;use Encode;no warnings; my $dept_level = 3;my $baseUrl = Mojo::URL->new($ARGV[0] || 'http://www.weather.com.cn/forecast/');my ($domain) = $baseUrl =~ qr#http://(?:www.)?([^/]+)#;my $filter = Bloom::Filter->new(capacity => 100000, error_rate => 0.0001);my $ua = Mojo::UserAgent->new(max_redirects => 3);my @Citys; my $callback;$callback = sub  {    my ($ua, $tx) = @_;    return if !$tx->success;    my $dept = $tx->req->headers->header('dept');    return if $dept > $dept_level; # 深度    ++$dept;    my $flag = $tx->req->headers->header('flag');    if($flag == 1){      my $satuation;      open my $fh,">",\$satuation or die "can not open $satuation:$!";      $tx->res->dom->find("div[class=crumbs fl]")->each(                                                   sub{                                                     my @tmp = split /</,$_;                                                     map{if($_ =~ />(\w*)$/ ){                                                       if($1){                                                         print $fh $1,"->" if $1;                                                       }                                                     }}@tmp;                                                   });      $tx->res->dom->find("input[id*=hidden_title]")->each(                                                   sub{                                                     $_ = "$_->{value}\n";                                                     print $fh $_;                                                   });      close($fh);      unshift @Citys,$satuation;    };    $tx->res->dom->find("a[href][target=_blank]")->each(sub{            my $attrs  = shift->attr;              my $newUrl = Mojo::URL->new($attrs->{href});             # 修复 url 的路径            if (!$newUrl->host and !$newUrl->scheme) {                $newUrl->host($tx->req->url->host);                $newUrl->scheme($tx->req->url->scheme);            }               $newUrl->fragment(undef); # 去掉 foo=bar#23 后面的 #xxx             # 域名, 协议, 后缀以下不对的都退出            next if ( $newUrl->scheme ne 'http' && $newUrl->scheme ne 'https' );            next if $newUrl->host !~ qr/$domain/;            if ( $newUrl->path =~ /(weather1d.*\/101[0-9]+\.shtml)$/i ){                if( !$filter->check($newUrl) ) {                    $filter->add($newUrl);                    $ua->get($newUrl => { dept => $dept,flag => 1} => $callback);                }                           }    });}; say "Base Url is $baseUrl";my $Done = $ua->get($baseUrl => {dept => 1,flag => 0} => $callback);Mojo::IOLoop->start;if(@Citys){  print "now",0+@Citys,"\n";  my %Hash;  for (@Citys){    my @tmp = split /->/,$_,2;    $Hash{$tmp[0]} = [] unless $Hash{$tmp[0]};    unshift @{$Hash{$tmp[0]}}, $tmp[-1];  }  my $count=0;  for (sort keys %Hash){     print $count;     print $_,"\n";     map{print "    |--- $_"}(sort@{$Hash{$_}});     $count++;  }}
0 0