用webmagic抓文章列表和详细页

来源:互联网 发布:哈登总得分最新数据 编辑:程序博客网 时间:2024/06/06 01:41

WebMagic文档:http://webmagic.io


public class ForumPageprocess implements PageProcessor {
private Sitesite= Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);privatestatic String username="qq_14955245";private String domian="http://blog.csdn.net/";//网站首页private int size=0;//抓取到的文章数量

public Site getSite() {

return site;

}
public void process(Pagepage) {if(!page.getUrl().regex(""+domian+""+username+"/article/details/\\d+").match()) {page.addTargetRequests(page.getHtml().xpath("//*[@id=\"article_toplist\"]").regex(""+username+"/article/details/\\d+").replace(""+username+"",domian+username+"").all());page.addTargetRequests(page.getHtml().xpath("//*[@id=\"article_list\"]").regex(""+username+"/article/details/\\d+").replace(""+username+"",domian+username+"").all()); }

else  page.putField("title:",page.getHtml().xpath("//*[@id=\"article_details\"]/div[1]/h1/span/").replace("<[^>]*>",""));  page.putField("原创:",page.getHtml().regex("ico_type_(Original)",1).get());  page.putField("阅读次数:",page.getHtml().xpath("//*[@id=\"article_details\"]/div[2]/div[2]/span[2]").regex("(\\d+)人阅读", 1));  page.putField("评论次数:",page.getHtml().xpath("//*[@id=\"article_details\"]/div[2]/div[2]/span[3]").regex("(\\d+)", 1));  page.putField("发布时间:",page.getHtml().css("span.link_postdate","text")); page.putField("标签:",page.getHtml().xpath("//*[@id=\"article_details\"]/div[2]/div[1]/span").regex("<a[^>]*>(.*?)</a>",1).all(); page.putField("分类:",page.getHtml().xpath("//*[@id=\"article_details\"]/div[3]/div[2]/label/span/text()")); page.putField("文章内容:",page.getHtml().xpath("//*[@id=\"article_content\"]").replace("<[^>]*>","")); System.out.println("抓取第"+size+"文章");
 }
public static void main(String[] args) {long startTime,endTime;System.out.println("爬虫开始请大家耐心等待");startTime =System.currentTimeMillis();System.out.println(startTime); // 从用户博客首页开始抓,开启2个线程,启动爬虫Spider.create(new ForumPageprocess()).addPipeline(new ConsolePipeline()) .addUrl("http://blog.csdn.net/"+username+"").thread(2).run();endTime=System.currentTimeMillis();System.out.println("总共用时"+((endTime-startTime)/1000)+"秒");}
运行结果:




原创粉丝点击