perl 爬取铜板街

来源：互联网 | 发布：cad迷你看图 mac | 编辑：程序博客网 | 时间：2024/05/01 14:20

use LWP::UserAgent;

use utf8;

use HTML::TreeBuilder;

use Data::Dumper;

# Tongbanjie product-list scraper.
#
# Flow:
#   1. Fetch page 1 of the product list and read the total page count from it.
#   2. For every page 1..N: fetch it, save the decoded HTML to data.html
#      (overwritten each time, so it ends up holding the last page fetched),
#      extract product titles, rates and terms via XPath, and print them to
#      STDOUT one value per line (all titles, then all rates, then all terms).
#   3. Sleep between pages to avoid being blocked by the site.
#
# Dies with the HTTP status line if the first request fails, or with an
# explanatory message if the page count cannot be determined. A failure on a
# later page is reported on STDERR and that page is skipped.

# Strictures are enabled here so that they cover every statement of the script.
use strict;
use warnings;

use HTML::TreeBuilder::XPath;

my $DATA_FILE          = 'data.html';
my $LIST_URL           = 'https://product.tongbanjie.com/list.html';
my $PAGE_DELAY_SECONDS = 10;

# The extracted values are decoded character strings; encode them on output
# instead of relying on Perl's implicit "Wide character" fallback.
binmode STDOUT, ':encoding(UTF-8)'
    or die "cannot set UTF-8 output on STDOUT: $!";

# Build the URL of one page of the product list.
sub page_url {
    my ($page_no) = @_;
    return "${LIST_URL}?pageNo=${page_no}#list_wrap";
}

# Return the decoded body of a response, or '' when it cannot be decoded
# (an empty page then simply yields no values).
sub page_html {
    my ($response) = @_;
    my $html = $response->decoded_content;
    return defined $html ? $html : '';
}

# Write the given HTML to the snapshot file, replacing its previous content.
# The handle is closed before returning so the file is complete on disk.
sub save_snapshot {
    my ($html) = @_;
    open my $fh, '>:encoding(UTF-8)', $DATA_FILE
        or die "open data file failed:$!";
    print {$fh} $html;
    close $fh
        or die "close data file failed:$!";
    return;
}

# Parse an HTML string into an XPath-capable tree.
# The caller must call ->delete on the returned tree when done with it.
sub parse_html {
    my ($html) = @_;
    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse($html);
    $tree->eof;    # signal end of input so the tree is finalized
    return $tree;
}

# Determine the number of list pages from the HTML of page 1.
# The text of the second-to-last <a> element is taken as the page count
# (presumably the pager's last numbered link -- verify against live markup).
# Dies if fewer than two anchors exist or the text is not a plain integer.
sub last_page_number {
    my ($html) = @_;
    my $tree    = parse_html($html);
    my @anchors = $tree->find_by_tag_name('a');
    my $text    = @anchors >= 2 ? $anchors[-2]->as_text : undef;
    $tree->delete;

    if ( defined $text && $text =~ /\A\s*([0-9]+)\s*\z/ ) {
        return $1;
    }
    die "cannot determine the page count from the first list page\n";
}

# Simulate a browser for the crawl; one agent is reused for every request.
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->agent("Mozilla/8.0");

my $first_response = $ua->get( page_url(1) );
if ( !$first_response->is_success ) {
    die $first_response->status_line;
}

my $first_html = page_html($first_response);
save_snapshot($first_html);
my $total_num = last_page_number($first_html);

# Walk every page and pull the historical product data out of it.
for my $num ( 1 .. $total_num ) {
    my $response = $ua->get( page_url($num) );
    if ( !$response->is_success ) {
        warn "page $num skipped: ", $response->status_line, "\n";
        next;
    }

    my $html = page_html($response);
    save_snapshot($html);

    # Parse the in-memory content rather than re-reading the snapshot file.
    my $tree = parse_html($html);

    # Product names.
    my @title = $tree->findvalues('/html/body//div[@class="title"]/a');

    # Product rates.
    my @rate = $tree->findvalues('/html/body//div[@class="income"]/p[@class="numBox"]');

    # Product terms.
    my @date = $tree->findvalues('/html/body//div[@class="day floatleft"]/span[@class="numBox"]');

    $tree->delete;

    for my $value ( @title, @rate, @date ) {
        print "$value\n";
    }

    # TODO: insert the records into the database, one row per index of
    # @title (with the matching @rate and @date entries).

    sleep($PAGE_DELAY_SECONDS);    # pause between pages to avoid being blocked
}