webmagic 爬取示例,新手学习

来源:互联网 发布:个人域名申请 编辑:程序博客网 时间:2024/05/22 18:22
package com.ssii;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.utils.HttpConstant;

import java.io.UnsupportedEncodingException;

/**
 * WebMagic page processor that walks the goodjobs.cn search result (index) pages, follows the
 * job-detail links found on them, and extracts a fixed set of fields from each detail page.
 *
 * <p>Index pages are chained: processing one index page enqueues the next one, up to
 * {@link #LAST_PAGE}. Every URL that does not match {@link #URL_INDEX} is treated as a
 * job-detail page.
 *
 * @author code4crafter@gmail.com <br>
 */
public class LuPaWorldProcessor implements PageProcessor {

    /** Regex matching job-detail page URLs. */
    public static final String URL_POST = "http://ehr\\.goodjobs\\.cn/show\\.php\\?jobID=\\w+";

    /**
     * Number of the most recently enqueued index page (the seed URL in {@link #main} is page 1).
     * Only mutated through {@link #claimNextPage()}; volatile so external readers see updates.
     */
    public static volatile int pageNum = 1;

    /** Regex matching search result (index) page URLs. */
    public static final String URL_INDEX = "http://search\\.goodjobs\\.cn/index\\.php\\?boxft=b18\\&page=\\d+\\&cepage=solrpc1503993718l9Komg";

    /** Highest index page number that will ever be requested. */
    private static final int LAST_PAGE = 61;

    // NOTE(review): the domain below is "blog.sina.com.cn" although every crawled URL is on
    // goodjobs.cn; it looks like a leftover from another example. Confirm and adjust if needed.
    private final Site site = Site
            .me()
            .setDomain("blog.sina.com.cn")
            .setSleepTime(3000)
            .setUserAgent(
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

    /**
     * Dispatches a downloaded page to index handling or job-detail extraction based on its URL.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Dispatch purely on the URL. The page cap is applied only when enqueueing the next
        // index page, so the final index page still has its job links harvested instead of
        // being mistaken for a job-detail page.
        if (page.getUrl().regex(URL_INDEX).match()) {
            processIndexPage(page);
        } else {
            processJobPage(page);
        }
    }

    /** Harvests job-detail links from an index page and enqueues the next index page, if any. */
    private void processIndexPage(Page page) {
        // Collect the job-detail URLs listed inside the "jobApp" form and add them to the crawl.
        page.addTargetRequests(
                page.getHtml().xpath("//form[@id=\"jobApp\"]").links().regex(URL_POST).all());

        int nextPage = claimNextPage();
        if (nextPage < 0) {
            return; // page cap reached, stop following the index chain
        }

        // Enqueue the next index page (same URL shape as URL_INDEX), requested via POST.
        Request req = new Request();
        req.setMethod(HttpConstant.Method.POST);
        req.setUrl(indexUrl(nextPage));
        page.addTargetRequest(req);
    }

    /**
     * Extracts the job fields from a detail page
     * (http://ehr.goodjobs.cn/show.php?jobID=(w+)).
     *
     * <p>The site renders job pages in two different layouts, so most fields are extracted
     * twice, once per layout ("xxx" and "xxx2" keys).
     */
    private void processJobPage(Page page) {
        // Salary
        page.putField("pay", page.getHtml().xpath("//div[@class='fr zwx']/strong/text()"));
        page.putField("pay2", page.getHtml().xpath("//div[@class='op']/strong/text()"));
        // Company name
        page.putField("company", page.getHtml().xpath("//p[@class='cname pt10']/a/text()"));
        page.putField("company2", page.getHtml().xpath("//p[@class='cname']/a/text()"));
        // Date (NOTE(review): key is "data", presumably meant "date"; kept for compatibility)
        page.putField("data", page.getHtml().xpath("//div[@class='t1 clearfix']/allText()"));
        // Title
        page.putField("title", page.getHtml().xpath("//h1[@class='fl fz22 w500']/text()"));
        page.putField("title2", page.getHtml().xpath("//h1[@class='cn']/h1/text()"));
        // Company address
        page.putField("address", page.getHtml().xpath("//div[@class='comadress clearfix']/text()"));
        // Job requirements
        page.putField("jobRequirements", page.getHtml().xpath("//p[@class='duol mt20 lh28 pb10']/text()"));
        page.putField("jobRequirements2", page.getHtml().xpath("//p[@class='duol mt20']/text()"));
        // Benefits
        page.putField("welfare", page.getHtml().xpath("//p[@class='t2 mt10']/allText()"));
    }

    /**
     * Atomically advances {@link #pageNum} and returns the new page number, or {@code -1} when
     * {@link #LAST_PAGE} has already been reached.
     *
     * <p>Synchronized because {@link #main} runs the spider with several threads and the
     * check-then-increment must not interleave.
     */
    private static synchronized int claimNextPage() {
        if (pageNum >= LAST_PAGE) {
            return -1;
        }
        return ++pageNum;
    }

    /** Builds the search result URL for the given page number. */
    private static String indexUrl(int pageNumber) {
        return "http://search.goodjobs.cn/index.php?boxft=b18&page=" + pageNumber
                + "&cepage=solrpc1503993718l9Komg";
    }

    /**
     * Returns the crawl configuration used by this processor.
     *
     * @return the site settings
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl from index page 1, printing results to the console, using 10 threads.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new LuPaWorldProcessor())
                .addUrl(indexUrl(1))
                .addPipeline(new ConsolePipeline())
                .thread(10)
                .run();
    }
}
原创粉丝点击