Perl 爬虫的两个技巧

来源:互联网 发布:2017世乒赛丁宁知乎 编辑:程序博客网 时间:2024/05/17 17:17
<pre name="code" class="cpp">jrhmpt01:/root/lwp# cat data.html
      <div class="m-page J-ajax-page">
        <a class="changePage" page="1" href="javascript:void(0);">首页</a> <a class="changePage" page="11" href="javascript:void(0);">上一页</a>  <a class="changePage" page="11" href="javascript:void(0);">11</a>  <a class="cur" href="javascript:void(0)">12</a> <span class="page_info">12/12</span>
     </div>
    <div class="m-page J-ajax-page">
        <a class="changePage" page="1" href="javascript:void(0);">首页</a> <a class="changePage" page="11" href="javascript:void(0);">上一页</a>  <a class="changePage" page="11" href="javascript:void(0);">11</a>  <a class="cur" href="javascript:void(0)">12</a> <span class="page_info">12/12</span>
     </div>

技巧一:用 XPath 取出匹配标签的文本

jrhmpt01:/root/lwp# cat c1.pl
use  LWP::UserAgent;
use DBI;
use POSIX;
use Data::Dumper;
use HTML::TreeBuilder;
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->agent("Mozilla/8.0");
use HTML::TreeBuilder::XPath;
$tree= HTML::TreeBuilder::XPath->new;
$tree->parse_file( "data.html");
my @title=  $tree->findvalues('/html/body//a[@class="changePage"]');
print "\@title is @title\n";
jrhmpt01:/root/lwp# perl c1.pl
@title is 首页 上一页 11 首页 上一页 11

说明:my @title=  $tree->findvalues('/html/body//a[@class="changePage"]'); 表示在 body 下查找所有 class 属性等于 "changePage" 的 a 标签,并以列表形式返回这些标签的文本内容(即上面输出的"首页 上一页 11")。

技巧二:按标签名遍历,读取标签的属性值

jrhmpt01:/root/lwp# cat c1.pl
use  LWP::UserAgent;
use DBI;
use POSIX;
use Data::Dumper;
use HTML::TreeBuilder;
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->agent("Mozilla/8.0");
use HTML::TreeBuilder::XPath;
$tree= HTML::TreeBuilder::XPath->new;
$tree->parse_file( "data.html");
my    @pages=$tree->find_by_tag_name('a');
#@urlall除了包含每个类别的文章,还包含阅读排行里的文章
foreach (@pages) {
    @titlepage = $_->attr('page');
    foreach (@titlepage) {
        if ($_){
            print "\$_ is $_\n";
        };
    };
};
jrhmpt01:/root/lwp# perl c1.pl
$_ is 1
$_ is 11
$_ is 11
$_ is 1
$_ is 11
$_ is 11

说明:根据 a 标签,查看 page 属性的值。先用 find_by_tag_name('a') 取得所有 a 标签,再用 attr('page') 读取每个标签的 page 属性;没有 page 属性的标签(例如 class="cur" 的当前页链接)取到的是空值,会被 if ($_) 跳过,所以每个分页块只输出 3 行。


                                             
0 0
原创粉丝点击