SPRING BOOT+WEBMAGIC

来源:互联网 发布:软件研发类期刊 编辑:程序博客网 时间:2024/06/07 05:43
    最近想自己学习一下 Hadoop,但手头缺少文本数据,所以需要先爬取一些数据。我不会写 Python,于是直接找了一个 Java 爬虫框架 WebMagic。WebMagic 的原理图如下,简单好用:

这里写图片描述

POM.xml

    <!-- mybatis start: MyBatis Spring Boot starter plus the MySQL JDBC driver -->
    <dependency>
        <groupId>org.mybatis.spring.boot</groupId>
        <artifactId>mybatis-spring-boot-starter</artifactId>
        <version>${mybatis-version}</version>
    </dependency>
    <!-- No version is declared for the driver here; presumably it is managed by a parent POM (confirm). -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
    </dependency>
    <!-- mybatis end -->
    <!-- webMagic start: both WebMagic artifacts share the ${webMagic-version} property -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>${webMagic-version}</version>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>${webMagic-version}</version>
    </dependency>
    <!-- webMagic end -->

GlobeFishWebMagicApplication.java

import org.springframework.boot.SpringApplication;import org.springframework.boot.autoconfigure.SpringBootApplication;import org.springframework.boot.builder.SpringApplicationBuilder;import org.springframework.boot.web.support.SpringBootServletInitializer;import org.springframework.scheduling.annotation.EnableScheduling;@SpringBootApplication@EnableSchedulingpublic class GlobeFishWebMagicApplication extends SpringBootServletInitializer {    @Override    protected SpringApplicationBuilder configure(SpringApplicationBuilder application) {        return application.sources(GlobeFishWebMagicApplication.class);    }    public static void main(String[] args) {        SpringApplication.run(GlobeFishWebMagicApplication.class, args);    }}

CSDNProcessor.java

import java.util.Date;import java.util.List;import java.util.concurrent.CountDownLatch;import java.util.concurrent.atomic.AtomicInteger;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import org.springframework.beans.factory.annotation.Autowired;import org.springframework.boot.autoconfigure.SpringBootApplication;import org.springframework.scheduling.annotation.Scheduled;import org.springframework.stereotype.Component;import com.panchen.globeFishWebMagic.entity.CSDNMessage;import com.panchen.globeFishWebMagic.mapper.CSDNMessageMapper;import com.panchen.globeFishWebMagic.util.SpringContextUtil;import com.panchen.globeFishWebMagic.util.UUIDUtil;import us.codecraft.webmagic.Page;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.processor.PageProcessor;/** * 爬虫 *  * @author pc * */@Component@SpringBootApplication  public class CSDNProcessor extends Thread implements PageProcessor {    private final static Logger logger = LoggerFactory.getLogger(CSDNProcessor.class);    @Autowired    private CSDNMessageMapper csdnMessageMapper;    private String originalUrl;    private String taskName;    // CountDownLatch作为计数器记录线程    private static CountDownLatch cdl=new CountDownLatch(9);    //使用原子变量    private static AtomicInteger urlCount = new AtomicInteger(0);    private static AtomicInteger pageCount = new AtomicInteger(1);    public CSDNProcessor() {    }    public CSDNProcessor(CountDownLatch cdl) {        this.cdl = cdl;    }    // 抓取配置    private Site site = Site.me().setSleepTime(1000).setRetryTimes(30).setCharset("utf-8").setTimeOut(300000)            .setUserAgent(                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");    @Override    public Site getSite() {        return site;    }    @Override    public void process(Page page) {        if (page.getUrl().regex("http://blog\\.csdn\\.net/(.*)/article/details/(.*)").match()) {         
   // get            CSDNMessage newCSDNMessage = new CSDNMessage(UUIDUtil.getUUID(), page.getUrl().get(),                    page.getHtml().xpath("//*[@id=\"blog_userface\"]/span/a/text()").get(),                    page.getHtml().xpath("//*[@class=\"article_title\"]/h1/span/text()").get(),                    page.getHtml().xpath("//*[@id=\"article_content\"]").get(),                    page.getHtml().xpath("//*[@class=\"link_postdate\"]/text()").get(), new Date(), 1, null, null,                    page.getHtml().xpath("//*[@id=\"btnDigg\"]/dd/text()").get(),                    page.getHtml().xpath("//*[@id=\"btnBury\"]/dd/text()").get(),                    page.getHtml().xpath("//*/[@class=\"link_view\"]/text()").get(),                    page.getHtml().xpath("//*[@class=\"link_comments\"]/text()").get(),                    page.getHtml().xpath("//*[@class=\"category_r\"]/label/span/text()").get());            csdnMessageMapper.addCSDNMessage(newCSDNMessage);            urlCount.getAndIncrement();        }        List<String> urls = page.getHtml()                .xpath("//*[@class=\"blog_list clearfix\"]/dd/[@class=\"tracking-ad\"]/a/@href").all();        // 跳页        if (page.getUrl().get().matches("http://blog\\.csdn\\.net/(.*)/newarticle.html(.*)")) {            pageCount.getAndIncrement();            if (page.getUrl().get().matches("http://blog\\.csdn\\.net/(.*)/newarticle.html")) {                page.addTargetRequest(page.getUrl().get() + "?&page=2");            } else {                page.addTargetRequest(                        page.getUrl().get().substring(0, page.getUrl().get().lastIndexOf('=') + 1) + pageCount);            }        }        if (null != urls && 0 < urls.size()) {            for (String url : urls) {                if (null != csdnMessageMapper.getMessageByUrl(url)) {                    csdnMessageMapper.deleteCSDNMessageByUrl(url);                }                page.addTargetRequest(url);            }        }    }    public void 
run() {        long startTime, endTime;        logger.info(taskName + "START!!!!!");        startTime = System.currentTimeMillis();        //spring对bean的管理是安全的  无法通过注入来得到bean 工具类实现ApplicationContextAware即可        Spider.create(SpringContextUtil.getBeanByClass(new CSDNProcessor().getClass())).addUrl(originalUrl).thread(1).run();        endTime = System.currentTimeMillis();        logger.info(taskName + "END!!!!!,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了" + pageCount + "页、" + urlCount                + "条记录");        cdl.countDown();    }    /**     *      * 对多模块进行爬取     */    @Scheduled(cron = "0 46 16 ? * *")    public void scheduled() {        long startTime, endTime;        logger.info("START!!!!!");        startTime = System.currentTimeMillis();        // 手機        CSDNProcessor mobile = new CSDNProcessor(cdl);        mobile.setOriginalUrl("http://blog.csdn.net/mobile/newarticle.html");        mobile.setTaskName("mobile");        // web前端        CSDNProcessor web = new CSDNProcessor(cdl);        web.setOriginalUrl("http://blog.csdn.net/web/newarticle.html");        web.setTaskName("web");        // 研發管理        CSDNProcessor software = new CSDNProcessor(cdl);        software.setOriginalUrl("http://blog.csdn.net/software/newarticle.html");        software.setTaskName("software");        // 架構設計        CSDNProcessor enterprise = new CSDNProcessor(cdl);        enterprise.setOriginalUrl("http://blog.csdn.net/enterprise/newarticle.html");        enterprise.setTaskName("enterprise");        // 程序語言        CSDNProcessor code = new CSDNProcessor(cdl);        code.setOriginalUrl("http://blog.csdn.net/code/newarticle.html");        code.setTaskName("code");        // 互聯網        CSDNProcessor www = new CSDNProcessor(cdl);        www.setOriginalUrl("http://blog.csdn.net/www/newarticle.html");        www.setTaskName("www");        // 數據庫        CSDNProcessor database = new CSDNProcessor(cdl);        database.setOriginalUrl("http://blog.csdn.net/database/newarticle.html"); 
       database.setTaskName("database");        // cloud        CSDNProcessor cloud = new CSDNProcessor(cdl);        cloud.setOriginalUrl("http://blog.csdn.net/cloud/newarticle.html");        cloud.setTaskName("cloud");        // 總和        CSDNProcessor other = new CSDNProcessor(cdl);        other.setOriginalUrl("http://blog.csdn.net/other/newarticle.html");        other.setTaskName("other");        // 子线程开始        mobile.start();        web.start();        software.start();        enterprise.start();        code.start();        www.start();        database.start();        cloud.start();        other.start();        // 主线程等待        try {            cdl.await();        } catch (InterruptedException e) {            e.printStackTrace();        }        endTime = System.currentTimeMillis();        logger.info("END!!!!!,总耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了" + pageCount + "页、" + urlCount + "条记录");    }    public String getOriginalUrl() {        return originalUrl;    }    public void setOriginalUrl(String originalUrl) {        this.originalUrl = originalUrl;    }    public String getTaskName() {        return taskName;    }    public void setTaskName(String taskName) {        this.taskName = taskName;    }}
原创粉丝点击