# Download every article of a CSDN blog, one directory per blog category.
#
# Flow:
#   1. Fetch the blog front page (saved as csdn.html in the current directory)
#      and collect the category links and category names from it.
#   2. For each category: create "$base_dir/<category>", chdir into it, fetch
#      the category's first page (fh1.html) and read the page count from it.
#   3. For each listing page of the category (fh2.html): collect article
#      titles and article links, pair them up by position, and save each
#      article as "<title>.html".
#
# Page text is handled as UTF-8 and converted to GBK only when it is printed
# or used as a file/directory name.
# NOTE(review): GBK is presumably the code page of the Windows machine this
# was written for (the base directory lives on F:\) -- confirm before running
# elsewhere.

use strict;
use warnings;

use Encode;
use File::Path qw(make_path);
use HTML::TreeBuilder;
use HTML::TreeBuilder::XPath;
use List::Util qw(min);
use LWP::UserAgent;
use POSIX;

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
my $BLOG_HOST = 'http://blog.csdn.net';
my $BLOG_USER = 'zhaoyangjian724';
my $base_dir  = "F:\\20161219";

# Matches a link to one article of this blog; captures the article id.
my $ARTICLE_HREF_RE = qr{\Q$BLOG_USER\E/article/details/(\d+)$};

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Encode a character string as GBK bytes for printing.
# Characters GBK cannot represent become '?' (Encode's default).
sub _to_gbk {
    my ($text) = @_;
    return encode('gbk', $text);
}

# Encode a character string as GBK bytes for use in a file or directory name.
# Characters GBK cannot represent are dropped instead of becoming '?', since
# '?' is one of the characters the name clean-up removes.
sub _to_gbk_filename {
    my ($text) = @_;
    return encode('gbk', $text, sub { q{} });
}

# Turn a site-relative href into a full URL; absolute URLs pass through.
sub _absolute_url {
    my ($href) = @_;
    return $href =~ m{\Ahttps?://}i ? $href : "$BLOG_HOST$href";
}

# Make an article title usable as a file name: remove all whitespace and the
# characters * \ / : | < > ? " '
# This must run on the decoded character string, NOT on GBK bytes: the bytes
# for '\' (0x5C) and '|' (0x7C) also occur as the second byte of GBK
# double-byte characters, so stripping them after encoding corrupts names.
sub _clean_title {
    my ($title) = @_;
    $title =~ s/\s+//g;
    $title =~ tr{*\\/:|<>?"'}{}d;
    return $title;
}

# Make a category name usable as a directory name: trim surrounding
# whitespace and remove the characters \ / : * ? " < > |
# Inner whitespace is kept so ordinary category names map to the same
# directory name as before.
sub _clean_dir_name {
    my ($name) = @_;
    $name =~ s/\A\s+//;
    $name =~ s/\s+\z//;
    $name =~ tr{\\/:*?"<>|}{}d;
    return $name;
}

# GET $url and write the raw response body to $file.
# Returns 1 on success. On an HTTP error or a file error it warns and
# returns 0 without leaving a half-written error page behind as "content".
sub _save_page {
    my ($ua, $url, $file) = @_;

    my $response = $ua->get($url);
    if (!$response->is_success) {
        warn "GET $url failed: " . $response->status_line . "\n";
        return 0;
    }

    my $fh;
    if (!open $fh, '>', $file) {
        warn "open $file failed: $!\n";
        return 0;
    }
    print {$fh} $response->content;
    if (!close $fh) {
        warn "close $file failed: $!\n";
        return 0;
    }
    return 1;
}

# Parse an HTML file into an HTML::TreeBuilder::XPath tree.
# Returns the tree, or nothing (after a warning) if the file cannot be read.
# The caller is responsible for calling ->delete on the returned tree.
sub _parse_html_file {
    my ($file) = @_;

    my $tree = HTML::TreeBuilder::XPath->new;
    if (!defined $tree->parse_file($file)) {
        warn "cannot read $file for parsing: $!\n";
        $tree->delete;
        return;
    }
    return $tree;
}

# Return the href attribute of every <a> element that has one.
sub _anchor_hrefs {
    my ($tree) = @_;
    return grep { defined } map { $_->attr('href') } $tree->find_by_tag_name('a');
}

# Download one listing page of a category (saved as fh2.html) and then every
# article linked from it, into the current directory.
sub _download_listing_page {
    my ($ua, $url) = @_;

    print "\$url is $url\n";
    _save_page($ua, $url, 'fh2.html') or return;
    my $tree = _parse_html_file('fh2.html') or return;

    # Article titles shown on this page.
    my @titles = map { decode('utf8', "$_") }
        $tree->findvalues('/html/body//span[@class="link_title"]');
    my @hrefs = _anchor_hrefs($tree);
    $tree->delete;

    print "\$pageTitles[0] is ", (@titles ? _to_gbk($titles[0]) : q{}), "\n";
    print "\@pageTitles is @{[ map { _to_gbk($_) } @titles ]}\n";

    # Article links on this page, first occurrence only. Besides the
    # category's own articles this also picks up the "most read" ranking
    # links, which is why the list is cut to the number of titles below.
    my %seen;
    my @article_hrefs;
    for my $href (@hrefs) {
        next if $href !~ $ARTICLE_HREF_RE;
        next if $seen{$href}++;
        print "\$_=========$href\n";
        push @article_hrefs, $href;
    }
    print "\@urlall is @article_hrefs\n";
    sleep 2;    # be polite to the server between listing and article requests

    # Titles and links are paired by position; only use complete pairs so a
    # missing link never turns into a request for the bare host.
    my $article_count = min(scalar @titles, scalar @article_hrefs);
    for my $n (0 .. $article_count - 1) {
        print "\$urlall[$n] is $article_hrefs[$n]\n";
    }
    print "\$---urlall[0] is $article_hrefs[0]\n" if $article_count > 0;

    for my $n (0 .. $article_count - 1) {
        my $href = $article_hrefs[$n];
        my ($article_id) = $href =~ $ARTICLE_HREF_RE;

        my $name = _to_gbk_filename(_clean_title($titles[$n]));
        # A title made only of removed characters would give ".html";
        # fall back to the numeric article id.
        $name = $article_id if $name eq q{};

        print "===========================\n";
        print "$name======$href\n";
        print "===========================\n";
        print "--------------------------------\n";
        print "$href\n";

        # Failures are reported by _save_page; carry on with the next article.
        _save_page($ua, _absolute_url($href), "$name.html");
    }
    return;
}

# Download all listing pages of one category into the current directory.
# $index is only used to label the log output.
sub _download_category {
    my ($ua, $index, $category_href) = @_;

    my $category_url = _absolute_url($category_href);
    print "\$href[$index] is $category_href\n";

    # First page of the category, used to find out how many pages there are.
    _save_page($ua, $category_url, 'fh1.html') or return;
    my $tree = _parse_html_file('fh1.html') or return;
    my @pager = $tree->findvalues('/html/body//div[@id="papelist"]/span');
    $tree->delete;

    # The pager text looks like "150条数据 共8页" (150 entries, 8 pages):
    # take the first number after the whitespace. No pager (or a zero)
    # means a single page.
    my $page_count = 1;
    if (@pager && $pager[0] =~ /.*\s+.*?(\d+).*/ && $1 > 0) {
        $page_count = $1 + 0;
    }
    print "\@pageString is ", _to_gbk(decode('utf8', "@pager")), "\n";
    print "\$pageString is $page_count\n";
    sleep 1;

    for my $page (1 .. $page_count) {
        _download_listing_page($ua, "$category_url/$page");
    }
    return;
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->agent("Mozilla/8.0");

# Without the front page there is nothing to do, so this failure is fatal.
_save_page($ua, "$BLOG_HOST/$BLOG_USER", 'csdn.html')
    or die "cannot download blog front page $BLOG_HOST/$BLOG_USER\n";
my $front_page = _parse_html_file('csdn.html')
    or die "cannot parse csdn.html\n";

# Category links look like
#   <a href="/zhaoyangjian724/article/category/1756569" ...>Oracle dump解析
# so every <a> whose href contains "category" is taken as a category URL.
my @category_hrefs = grep { /category/ } _anchor_hrefs($front_page);
print "\$href is $_\n" for @category_hrefs;
print "\@href is @category_hrefs\n";

# Category names: the text of the <a> elements in the category panel list.
my @category_names = map { decode('utf8', "$_") }
    $front_page->findvalues('/html/body//ul[@class="panel_body"]/li/a');
$front_page->delete;

my @names_gbk = map { _to_gbk($_) } @category_names;
print "\@type is @names_gbk\n";

# Links and names are paired by position; only complete pairs are processed.
my $category_count = min(scalar @category_hrefs, scalar @category_names);
for my $i (0 .. $category_count - 1) {
    print "$category_hrefs[$i]===$names_gbk[$i]\n";
}

CATEGORY:
for my $i (0 .. $category_count - 1) {
    my $dir_name = _to_gbk_filename(_clean_dir_name($category_names[$i]));
    print "\$type is $dir_name\n";
    if ($dir_name eq q{}) {
        warn "skipping category $category_hrefs[$i]: no usable name\n";
        next CATEGORY;
    }

    # Create the directory under $base_dir (not under whatever the current
    # directory happens to be) and make sure we really are inside it before
    # writing any file.
    my $category_dir = "$base_dir/$dir_name";
    make_path($category_dir, { error => \my $mkdir_errors });
    if (!chdir $category_dir) {
        warn "cannot enter $category_dir: $!\n";
        next CATEGORY;
    }

    _download_category($ua, $i, $category_hrefs[$i]);

    chdir $base_dir or warn "cannot return to $base_dir: $!\n";
}

# Anything after this marker is not compiled by perl.
__END__
0 0