Springboot通过集成Webmagic实现数据抓取功能。

来源:互联网 发布:linux内网穿透 编辑:程序博客网 时间:2024/06/05 16:03

一、什么是Webmagic.
要使用Webmagic首先需要了解什么是Webmagic.
webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic主要由Downloader(下载器)、PageProcessor(解析器)、Scheduler(调度器)和Pipeline(管道)四部分组成。
webmagic采用完全模块化的设计,功能覆盖整个爬虫的生命周期(链接提取、页面下载、内容抽取、持久化),支持多线程抓取,分布式抓取,并支持自动重试、自定义UA/cookie等功能。
webmagic包含页面抽取功能,开发者可以使用css selector、xpath和正则表达式进行链接和内容的提取,支持多个选择器链式调用。
二、示例代码
(1)pom.xml 文件添加新的依赖

<!-- webmagic 爬虫框架 -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.5.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.5.3</version>
</dependency>
(2)Scheduling定时任务设定,也可称为调度器。
import javax.annotation.Resource;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import org.springframework.scheduling.annotation.EnableScheduling;import org.springframework.scheduling.annotation.Scheduled;import org.springframework.stereotype.Component;import org.springframework.transaction.annotation.Transactional;import com.zhibo.xmt.common.webmagic.xpager.popeline.XpaperZgtcbPopeline;import com.zhibo.xmt.common.webmagic.xpager.processor.XpaperZgtcbProcessor;import us.codecraft.webmagic.Spider;/** * 爬取 xpaper http://i.xpaper.net/cnsports  版面信息数据 * 每周 二 、 四、日发布新期刊 * @author Bruce * */@Component@EnableSchedulingpublic class XpaperWebmagicSchedulingConfig {    private final Logger logger = LoggerFactory.getLogger(XpaperWebmagicSchedulingConfig.class);    public static final String BASE_URL = "http://i.xpaper.net/cnsports";    @Resource    private XpaperZgtcbPopeline xpaperZgtcbPopeline;    /**     * 中国体彩报 xpaper全媒体数字报 版面内容抓取     */    /**     * "0 0/1 18 * * ?" 每天18:00到18:59  没分钟执行一次     *      * "0 10 4 ? * *" 每天上午4:10触发      */    @Transactional    @Scheduled(cron = "0 10 4 ? * *")    public void createLotteryInfo(){        System.out.println("中国体彩报 xpaper全媒体数字报 版面内容抓取");        long startTime, endTime;        System.out.println("【爬虫开始】");        startTime = System.currentTimeMillis();        logger.info("爬取地址:" + BASE_URL);        try {            Spider spider = Spider.create(new XpaperZgtcbProcessor());            spider.addUrl(BASE_URL);            spider.addPipeline(xpaperZgtcbPopeline);            spider.thread(5);            spider.setExitWhenComplete(true);            spider.start();            spider.stop();        } catch (Exception e) {            logger.error(e.getMessage(),e);        }        endTime = System.currentTimeMillis();        System.out.println("【爬虫结束】");        System.out.println("中国体彩报 xpaper全媒体数字报 版面内容抓取耗时约" + ((endTime - startTime) / 1000) + "秒,已保存到数据库.");    }}

(3)XpaperZgtcbProcessor解析器,解析要爬取的页面

import java.util.ArrayList;import java.util.Date;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.commons.lang3.StringUtils;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import org.springframework.stereotype.Component;import com.zhibo.xmt.common.enums.common.EnumCommonStatus;import com.zhibo.xmt.common.util.DateUtil;import com.zhibo.xmt.common.vo.pagesub.Journal;import com.zhibo.xmt.common.vo.pagesub.JournalPage;import us.codecraft.webmagic.Page;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.processor.PageProcessor;import us.codecraft.webmagic.selector.Selectable;/** * 中国体彩报 xpaper全媒体数字报 版面内容抓取 * http://i.xpaper.net/cnsports * @author Bruce * */@Componentpublic class XpaperZgtcbProcessor implements PageProcessor{    private static Logger logger = LoggerFactory.getLogger(XpaperZgtcbProcessor.class);    // 正则表达式\\. \\转义java中的\ \.转义正则中的.    // 主域名    public static final String BASE_URL = "http://i.xpaper.net/cnsports";    private Site site = Site.me()            .setDomain(BASE_URL)            .setSleepTime(1000)            .setRetryTimes(30)            .setCharset("utf-8")            .setTimeOut(30000)            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");    @Override    public Site getSite() {        return site;    }    @Override    public void process(Page page) {         if (page.getUrl().regex(BASE_URL).match()) {             String contentTitle = page.getHtml().xpath("//title/text()").toString();             /**              * System.out.println("issue:" + issue);                System.out.println("issueDesc:" + issueDesc);                System.out.println("contentTitle:" + contentTitle);              * contentTitle:中国体彩报 - 第1151期 - 第01版 - A1                issue: 1151                 issueDesc:中国体彩报 - 第1151期               */             String[] contentTitles = 
contentTitle.trim().split("-");             String issueStr = contentTitles[1].replaceAll("第", "").replaceAll("期", "").replaceAll(" ", "").trim().replaceAll("\\s*", "");             String issue = new String(issueStr);             //由于里面有空格,因此使用了多种方式去空格。             Pattern p = Pattern.compile("\\s*|\t|\r|\n");             Matcher m = p.matcher(issue);             issue = m.replaceAll("");             issue = issue.replaceAll("\\u00A0","");               String issueDesc = contentTitles[0] + "-" + contentTitles[1];             Journal journal = new Journal();             journal.setTitle(issueDesc);             journal.setTitleDesc(contentTitle);             journal.setIssue(issue);             journal.setDate(DateUtil.getDateByFormat(DateUtil.getDateByFormat(new Date(), "yyyy-MM-dd"), "yyyy-MM-dd"));             journal.setDateStr(DateUtil.getDateByFormat(new Date(), "yyyy-MM-dd"));             journal.setType(1);             journal.setStatus(EnumCommonStatus.NORMAL.getValue());             journal.setGrabDate(new Date());             journal.setCreatedAt(new Date());             journal.setUpdatedAt(new Date());             logger.info("期刊数据:" + journal.toString());             List<Selectable> list = page.getHtml().xpath("//div[@id='m1']/a").nodes();             if(list != null && list.size() > 0){                 List<JournalPage> journalPages = new ArrayList<JournalPage>();                 for(int i = 0; i < list.size(); i++){                     Selectable s = list.get(i);                     String link = s.links().toString();                     String titleStr = s.xpath("//b/text()").toString();                     if(StringUtils.isBlank(titleStr)){                         titleStr = s.toString().split(">")[1].replaceAll("</a", "").replaceAll(" ", "").replaceAll("&nbsp;", " ");                     }                     String title = new String(titleStr);//                     title = title.replaceAll("\\s*", "");//                  Pattern p = 
Pattern.compile("\\s*|\t|\r|\n");                    Matcher ma = p.matcher(title);                    title = ma.replaceAll("");                    title = title.replaceAll("\\u00A0","");                    title= title.replaceAll("版", "版 ");//                     System.out.println("title:" + title);                     /**                      * System.out.println("s:" + s.toString());                        System.out.println("link:" + link);                        System.out.println("title:" + title);                      * s:<a href="http://i.xpaper.net/cnsports/release/539/2040.shtml"> <b>第01版 &nbsp; A1</b> </a>                        link:http://i.xpaper.net/cnsports/release/539/2040.shtml                        title:第01版   A1                      */                     if(StringUtils.isNotBlank(title) && StringUtils.isNotBlank(link)){                         if(i == 0){                             journal.setUrl(link);                             journal.setStageDesc(title);                         }                         JournalPage  journalPage = new JournalPage();                         journalPage.setJournalId(journal.getId());                         journalPage.setPageHtmlTitle(title);                         journalPage.setPageHtmlUrl(link);                         journalPage.setStatus(EnumCommonStatus.NORMAL.getValue());                         journalPage.setGrabDate(new Date());                         journalPage.setCreatedAt(new Date());                         journalPage.setUpdatedAt(new Date());                         logger.info("版面数据:" + journalPage.toString());                         journalPages.add(journalPage);                     }                 }                 journal.setJournalPages(journalPages);                 logger.info("journal.toString():" + journal.toString());             }             page.putField("journal", journal);           }    }//  public static void main(String[] args) {//        Spider spider = 
Spider.create(new XpaperZgtcbProcessor());//        spider.addUrl(BASE_URL);//        spider.addPipeline(new XpaperZgtcbPopeline());//        spider.thread(1);//        spider.setExitWhenComplete(true);//        spider.start();////        spider.stop();//    }}

(4)XpaperZgtcbPopeline-Pipeline(管道)

import java.util.Map;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import org.springframework.beans.factory.annotation.Autowired;import org.springframework.stereotype.Service;import com.zhibo.xmt.common.mapper.pagesub.JournalMapper;import com.zhibo.xmt.common.mapper.pagesub.JournalPageMapper;import com.zhibo.xmt.common.vo.pagesub.Journal;import com.zhibo.xmt.common.vo.pagesub.JournalPage;import com.zhibo.xmt.common.webmagic.xpager.processor.XpaperZgtcbJournalPageContentProcessor;import us.codecraft.webmagic.ResultItems;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.Task;import us.codecraft.webmagic.pipeline.Pipeline;@Service("xpaperZgtcbPopeline")public class XpaperZgtcbPopeline implements Pipeline{    private static Logger logger = LoggerFactory.getLogger(XpaperZgtcbPopeline.class);    @Autowired    private JournalMapper journalMapper;    @Autowired    private JournalPageMapper journalPageMapper;    @Autowired    private XpaperZgtcbJournalPageContentPopeline xpaperZgtcbJournalPageContentPopeline;    @Override    public void process(ResultItems resultItems, Task task) {        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {            if (entry.getKey().contains("journal")) {                Journal journal = (Journal) entry.getValue();                if(journal != null){                    Journal oldJournal = journalMapper.selectByIssue(journal.getIssue());                    if(oldJournal == null){                        try {                            /**                             * //替换URL为 appurl                             * http://i.xpaper.net/cnsports/release/542/2069.shtml                             * 替换为                             * http://i.xpaper.net/cnsports/apprelease/542/2069.shtml                             */                            journal.setUrl(journal.getUrl().replaceAll("release", "apprelease"));                            journalMapper.insertSelective(journal);                      
      logger.info("journalMapper-insert:" + journal.toString());                            String oldPageHtmlUrl = null;                            for(JournalPage journalPage : journal.getJournalPages()){                                journalPage.setJournalId(journal.getId());                                /**                                 * //替换URL为 appurl                                 * http://i.xpaper.net/cnsports/release/542/2069.shtml                                 * 替换为                                 * http://i.xpaper.net/cnsports/apprelease/542/2069.shtml                                 */                                oldPageHtmlUrl = journalPage.getPageHtmlUrl();//保存原有url地址,用于数据页面内容抓取                                journalPage.setPageHtmlUrl(journalPage.getPageHtmlUrl().replaceAll("release", "apprelease"));                                journalPageMapper.insertSelective(journalPage);                                logger.info("journalPageMapper-insert:" + journalPage.toString());                                logger.info("XpaperZgtcbJournalPageContentProcessor-start");                                //这里我们对后面的页面进行了深度的抓取,这里就不再进行过//多的表述,如果需要可以联系我。                                Spider spider = Spider.create(new XpaperZgtcbJournalPageContentProcessor());                                spider.addUrl(oldPageHtmlUrl);                                spider.addPipeline(xpaperZgtcbJournalPageContentPopeline);                                spider.thread(1);                                spider.setExitWhenComplete(true);                                spider.start();                                logger.info("XpaperZgtcbJournalPageContentProcessor-end");                            }                        } catch (Exception e) {                            logger.error(e.getMessage(), e);                        }                    }else{                        logger.info("期号为" + journal.getIssue() + "的期刊已经存在!");                    }                }  
          }        }    }}

(5)数据入库
这个部分就不再过多的赘述,因为我们用的是mybatis,所以入库的部分都是使用的mapper,还有很多项目使用的是JPA,只需要在相应的mapper部分进行替换即可。
(6)数据表设计

①s_journals 期刊信息表

-- ----------------------------
-- Table structure for s_journals
-- ----------------------------
DROP TABLE IF EXISTS `s_journals`;
CREATE TABLE `s_journals` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT,
  `title` varchar(100) DEFAULT NULL COMMENT '标题',
  `title_desc` varchar(100) NOT NULL COMMENT '标题全本',
  `issue` varchar(50) NOT NULL COMMENT '期号 唯一',
  `date` datetime NOT NULL COMMENT '期刊日期 yyyy-MM-dd',
  `date_str` varchar(50) NOT NULL COMMENT '期刊日期字符串格式 yyyy-MM-dd',
  `url` varchar(255) NOT NULL COMMENT '版面地址',
  `stage_desc` varchar(255) DEFAULT NULL COMMENT '版面备注',
  `type` int(11) NOT NULL COMMENT '期刊类型\r\n1.中国体彩报\r\n2.其他',
  `status` int(11) NOT NULL COMMENT '状态\r\n1.正常\r\n2.删除\r\n',
  `grab_date` datetime NOT NULL COMMENT '抓取时间 yyyy-MM-dd HH:mm:ss',
  `created_at` datetime NOT NULL COMMENT '创建时间 yyyy-MM-dd HH:mm:ss',
  `updated_at` datetime NOT NULL COMMENT '更新时间 yyyy-MM-dd HH:mm:ss',
  PRIMARY KEY (`id`),
  -- issue is the natural key the crawler's duplicate check relies on
  UNIQUE KEY `UNIQUE_ISSUE` (`issue`) USING BTREE,
  -- index name typo fixed: was NORNAM_DATE_STR
  KEY `NORMAL_DATE_STR` (`date_str`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=27 DEFAULT CHARSET=utf8 COMMENT='期刊信息表\r\n';

②s_journal_pages 期刊版面表

-- ----------------------------
-- Table structure for s_journal_pages
-- ----------------------------
DROP TABLE IF EXISTS `s_journal_pages`;
CREATE TABLE `s_journal_pages` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT,
  `journal_id` bigint(20) NOT NULL COMMENT '期刊信息表ID  journals表id',
  `page_html_title` varchar(100) NOT NULL COMMENT '版面HTML格式标题 与journals_id字段联合唯一索引',
  `page_html_url` varchar(500) NOT NULL COMMENT '版面HTML格式地址',
  `page_html_desc` varchar(100) DEFAULT NULL COMMENT '版面html格式备注信息',
  `page_pdf_title` varchar(100) DEFAULT NULL COMMENT '版面PDF格式标题',
  `page_pdf_url` varchar(500) DEFAULT NULL COMMENT '版面PDF格式地址',
  `page_pdf_desc` varchar(100) DEFAULT NULL COMMENT '版面PDF格式备注信息',
  `page_desc` varchar(500) DEFAULT NULL COMMENT '版面备注信息',
  `status` int(11) NOT NULL COMMENT '状态\r\n1.正常\r\n2.删除\r\n',
  `grab_date` datetime NOT NULL COMMENT '抓取时间yyyy-MM-dd HH:mm:ss',
  `created_at` datetime NOT NULL COMMENT '创建时间yyyy-MM-dd HH:mm:ss',
  `updated_at` datetime NOT NULL COMMENT '更新时间yyyy-MM-dd HH:mm:ss',
  PRIMARY KEY (`id`),
  -- one journal has many pages; (journal_id, page_html_title) must be unique
  UNIQUE KEY `UNIQUE_JOURNALID_PAGEHTMLTITLE` (`journal_id`,`page_html_title`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=197 DEFAULT CHARSET=utf8 COMMENT='期刊版面表(s_journal_pages)\r\n备注: 由于一个期刊拥有多个版面,因此期刊信息表与期刊版面表为一对多关系。\r\n因此要建立journal_id 与page_html_title两者建立联合唯一的索引。\r\n';

③s_journal_page_contents 期刊版面内容表

-- ----------------------------
-- Table structure for s_journal_page_contents
-- ----------------------------
DROP TABLE IF EXISTS `s_journal_page_contents`;
CREATE TABLE `s_journal_page_contents` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT,
  `journal_id` bigint(20) NOT NULL COMMENT '期刊信息表ID  Journals表ID',
  `journal_page_id` bigint(20) NOT NULL COMMENT '期刊版面表ID  Journal_pages表ID',
  `content_title` varchar(200) NOT NULL COMMENT '内容标题',
  `content_url` varchar(500) NOT NULL COMMENT '内容url',
  `content_id` varchar(50) DEFAULT NULL COMMENT '内容ID',
  `content_id_desc` varchar(50) DEFAULT NULL COMMENT '内容ID原文',
  `content_desc` varchar(50) DEFAULT NULL COMMENT '内容信息备注',
  `status` int(11) NOT NULL COMMENT '状态\r\n1.正常\r\n2.删除',
  `grab_date` datetime NOT NULL COMMENT '抓取时间yyyy-MM-dd HH:mm:ss',
  `created_at` datetime NOT NULL COMMENT '创建时间yyyy-MM-dd HH:mm:ss',
  `updated_at` datetime NOT NULL COMMENT '更新时间yyyy-MM-dd HH:mm:ss',
  PRIMARY KEY (`id`),
  -- one page has many content entries; the triple below must be unique
  UNIQUE KEY `UNIQUE_JOURNALID_JOURNALPAGEID_CONTENTTITLE` (`journal_id`,`journal_page_id`,`content_title`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=178 DEFAULT CHARSET=utf8 COMMENT='期刊版面内容表\r\n备注: 由于一个期刊拥有多个版面,因此期刊信息表与期刊版面表为一对多关系;由于一个版面含有多个内容和标题,因此期刊版面表与期刊版面内容表为一对多关系。\r\n因此要建立journal_id 、journal_page_id 、content_title三者建立联合唯一的索引。\r\n';