WebMagic爬虫案例

来源:互联网 发布:安装win7无法连接网络 编辑:程序博客网 时间:2024/05/22 14:24

使用 Maven 引入以下两个依赖:

       <!-- WebMagic core artifact, version 0.5.2. -->
       <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.5.2</version>
        </dependency>

        <!-- WebMagic extension artifact, kept at the same version as the core.
             NOTE(review): presumably this is the module that provides the
             us.codecraft.webmagic.model.* classes (OOSpider, @ExtractBy, @TargetUrl)
             used by the examples; confirm against the WebMagic docs. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.5.2</version>
        </dependency>


这次做了两个小案例,爬取的都是小说网站。第一个是起点中文网的列表页:


用firebug我们可以看到:

此时用WebMagic注解方式即可,方便简单:

package com.zab.webmagic;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.model.ConsolePageModelPipeline;import us.codecraft.webmagic.model.OOSpider;import us.codecraft.webmagic.model.annotation.ExtractBy;import us.codecraft.webmagic.model.annotation.TargetUrl;@TargetUrl("http://a.qidian.com/")@ExtractBy(value = "//ul[@class=\"all-img-list cf\"]/li",multi = true)public class GithubRepoPageProcessor {    @ExtractBy("//div[@class=book-mid-info]/h4/a/text()")    private String title;    @ExtractBy("//div[@class=book-mid-info]/p[@class=author]/a[@class=name]/text()")    private String author;    @ExtractBy("//div[@class=book-mid-info]/p[@class=author]/a[@class=go-sub-type]/text()")    private String type;    @ExtractBy("//div[@class=book-mid-info]/p[@class=author]/span/text()")    private String status;    @ExtractBy("//div[@class=book-mid-info]/p[@class=intro]/text()")    private String intro;    @ExtractBy("//div[@class=book-mid-info]/p[@class=update]/span/text()")    private String count;    public static void main(String[] args) {//        OOSpider.create(Site.me(), new ConsolePageModelPipeline(), Qidian.class).addUrl("http://a.qidian.com/").thread(4).run();       OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(100), new ConsolePageModelPipeline(), GithubRepoPageProcessor.class);       GithubRepoPageProcessor qidian= ooSpider.get("http://a.qidian.com/");        System.out.println(qidian);    }}

此时,我们就已经得到结果了。



第二个例子是创世网的列表页:

同样的用firebug查看:

代码为:

package com.zab.webmagic;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.model.ConsolePageModelPipeline;import us.codecraft.webmagic.model.OOSpider;import us.codecraft.webmagic.model.annotation.ExtractBy;import us.codecraft.webmagic.model.annotation.TargetUrl;@TargetUrl("http://chuangshi.qq.com/bk/")@ExtractBy(value = "//div[@class='leftlist']/table/tbody/tr",multi = true)public class ChuangShi {    @ExtractBy("//a[@class=green]/text()")    private String title;    @ExtractBy("//a[@class=grey3]/text()")    private String author;    @ExtractBy("//a[@class=grey2]/text()")    private String type;        public static void main(String[] args) {//        OOSpider.create(Site.me(), new ConsolePageModelPipeline(), Qidian.class).addUrl("http://a.qidian.com/").thread(4).run();       OOSpider ooSpider = OOSpider.create(Site.me().setCharset("utf-8"), new ConsolePageModelPipeline(), ChuangShi.class);       ChuangShi qidian= ooSpider.get("http://chuangshi.qq.com/bk/");        System.out.println(qidian);    }}

结果显示:



0 0
原创粉丝点击