webmagic首次demo

来源:互联网 发布:股权投资案例 知乎 编辑:程序博客网 时间:2024/06/15 23:19
package com.tvs.webmgic;

import java.util.regex.Pattern;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Minimal WebMagic demo crawler for the opinion section of bjnews.com.cn.
 *
 * <p>Starting from {@link #START_URL}, the processor follows pagination links and article links
 * found on list pages, and prints the author and title of every article page it visits.
 */
public class MyWebmagic implements PageProcessor {

    /** Entry point of the crawl: the first page of the opinion section. */
    private static final String START_URL = "http://www.bjnews.com.cn/opinion/";

    /**
     * Opinion list page: the section root, optionally followed by a {@code ?page=N} query.
     * Applied with {@code matches()}, so the whole URL must fit the pattern; article URLs that
     * merely share the prefix are not treated as list pages.
     */
    private static final Pattern LIST_PAGE_URL =
            Pattern.compile("http://www\\.bjnews\\.com\\.cn/opinion/(?:\\?page=\\d*)?");

    /**
     * Opinion article page: {@code /opinion/<yyyy>/<mm>/<dd>/<6-digit id>.html}. The year is
     * matched generically so the crawler is not tied to a single calendar year.
     */
    private static final Pattern ARTICLE_PAGE_URL =
            Pattern.compile("http://www\\.bjnews\\.com\\.cn/opinion/\\d{4}/\\d{2}/\\d{2}/\\d{6}\\.html");

    /** Crawl configuration: retry count 3 and a sleep time of 1000 between requests. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Returns the crawl configuration used by the spider for this processor.
     *
     * @return the shared {@link Site} settings
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Handles one downloaded page: enqueues follow-up links and, for article pages, prints the
     * author and the title to standard output.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        // Pagination links live in div#page; enqueue them wherever that element is present.
        page.addTargetRequests(page.getHtml().css("div#page").links().all());

        String url = page.getUrl().toString();

        if (isListPage(url)) {
            // Only list pages contribute article links.
            page.addTargetRequests(page.getHtml().css("div.news").links().all());
        }

        if (isArticlePage(url)) {
            String author = page.getHtml().xpath("//*[@id=\"author_baidu\"]//text()").toString();
            System.out.println(author);
            String title = page.getHtml().xpath("//*[@id=\"main\"]/div[1]/h1//text()").toString();
            System.out.println(title);
        }
    }

    /** Tells whether {@code url} is an opinion list page (section root or a paginated view). */
    private static boolean isListPage(String url) {
        return url != null && LIST_PAGE_URL.matcher(url).matches();
    }

    /** Tells whether {@code url} is a single opinion article page. */
    private static boolean isArticlePage(String url) {
        return url != null && ARTICLE_PAGE_URL.matcher(url).matches();
    }

    /**
     * Starts the crawl at {@link #START_URL} with five worker threads and blocks until it ends.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new MyWebmagic()).addUrl(START_URL).thread(5).run();
    }
}