SPRING BOOT+WEBMAGIC
来源:互联网 发布:软件研发类期刊 编辑:程序博客网 时间:2024/06/07 05:43
最近想自己学习一下 Hadoop,但手头缺少文本数据,所以需要先爬取一些数据。因为不会写 Python,就直接找了一个爬虫框架 WebMagic。WebMagic 的原理图如下,简单好用:
POM.xml
<!-- MyBatis + MySQL driver: begin -->
<dependency>
    <groupId>org.mybatis.spring.boot</groupId>
    <artifactId>mybatis-spring-boot-starter</artifactId>
    <version>${mybatis-version}</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
</dependency>
<!-- MyBatis + MySQL driver: end -->

<!-- WebMagic crawler framework: begin -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>${webMagic-version}</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>${webMagic-version}</version>
</dependency>
<!-- WebMagic crawler framework: end -->
Application.java
import org.springframework.boot.SpringApplication;import org.springframework.boot.autoconfigure.SpringBootApplication;import org.springframework.boot.builder.SpringApplicationBuilder;import org.springframework.boot.web.support.SpringBootServletInitializer;import org.springframework.scheduling.annotation.EnableScheduling;@SpringBootApplication@EnableSchedulingpublic class GlobeFishWebMagicApplication extends SpringBootServletInitializer { @Override protected SpringApplicationBuilder configure(SpringApplicationBuilder application) { return application.sources(GlobeFishWebMagicApplication.class); } public static void main(String[] args) { SpringApplication.run(GlobeFishWebMagicApplication.class, args); }}
processor.java
import java.util.Date;import java.util.List;import java.util.concurrent.CountDownLatch;import java.util.concurrent.atomic.AtomicInteger;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import org.springframework.beans.factory.annotation.Autowired;import org.springframework.boot.autoconfigure.SpringBootApplication;import org.springframework.scheduling.annotation.Scheduled;import org.springframework.stereotype.Component;import com.panchen.globeFishWebMagic.entity.CSDNMessage;import com.panchen.globeFishWebMagic.mapper.CSDNMessageMapper;import com.panchen.globeFishWebMagic.util.SpringContextUtil;import com.panchen.globeFishWebMagic.util.UUIDUtil;import us.codecraft.webmagic.Page;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.processor.PageProcessor;/** * 爬虫 * * @author pc * */@Component@SpringBootApplication public class CSDNProcessor extends Thread implements PageProcessor { private final static Logger logger = LoggerFactory.getLogger(CSDNProcessor.class); @Autowired private CSDNMessageMapper csdnMessageMapper; private String originalUrl; private String taskName; // CountDownLatch作为计数器记录线程 private static CountDownLatch cdl=new CountDownLatch(9); //使用原子变量 private static AtomicInteger urlCount = new AtomicInteger(0); private static AtomicInteger pageCount = new AtomicInteger(1); public CSDNProcessor() { } public CSDNProcessor(CountDownLatch cdl) { this.cdl = cdl; } // 抓取配置 private Site site = Site.me().setSleepTime(1000).setRetryTimes(30).setCharset("utf-8").setTimeOut(300000) .setUserAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); @Override public Site getSite() { return site; } @Override public void process(Page page) { if (page.getUrl().regex("http://blog\\.csdn\\.net/(.*)/article/details/(.*)").match()) { // get CSDNMessage newCSDNMessage = new CSDNMessage(UUIDUtil.getUUID(), page.getUrl().get(), 
page.getHtml().xpath("//*[@id=\"blog_userface\"]/span/a/text()").get(), page.getHtml().xpath("//*[@class=\"article_title\"]/h1/span/text()").get(), page.getHtml().xpath("//*[@id=\"article_content\"]").get(), page.getHtml().xpath("//*[@class=\"link_postdate\"]/text()").get(), new Date(), 1, null, null, page.getHtml().xpath("//*[@id=\"btnDigg\"]/dd/text()").get(), page.getHtml().xpath("//*[@id=\"btnBury\"]/dd/text()").get(), page.getHtml().xpath("//*/[@class=\"link_view\"]/text()").get(), page.getHtml().xpath("//*[@class=\"link_comments\"]/text()").get(), page.getHtml().xpath("//*[@class=\"category_r\"]/label/span/text()").get()); csdnMessageMapper.addCSDNMessage(newCSDNMessage); urlCount.getAndIncrement(); } List<String> urls = page.getHtml() .xpath("//*[@class=\"blog_list clearfix\"]/dd/[@class=\"tracking-ad\"]/a/@href").all(); // 跳页 if (page.getUrl().get().matches("http://blog\\.csdn\\.net/(.*)/newarticle.html(.*)")) { pageCount.getAndIncrement(); if (page.getUrl().get().matches("http://blog\\.csdn\\.net/(.*)/newarticle.html")) { page.addTargetRequest(page.getUrl().get() + "?&page=2"); } else { page.addTargetRequest( page.getUrl().get().substring(0, page.getUrl().get().lastIndexOf('=') + 1) + pageCount); } } if (null != urls && 0 < urls.size()) { for (String url : urls) { if (null != csdnMessageMapper.getMessageByUrl(url)) { csdnMessageMapper.deleteCSDNMessageByUrl(url); } page.addTargetRequest(url); } } } public void run() { long startTime, endTime; logger.info(taskName + "START!!!!!"); startTime = System.currentTimeMillis(); //spring对bean的管理是安全的 无法通过注入来得到bean 工具类实现ApplicationContextAware即可 Spider.create(SpringContextUtil.getBeanByClass(new CSDNProcessor().getClass())).addUrl(originalUrl).thread(1).run(); endTime = System.currentTimeMillis(); logger.info(taskName + "END!!!!!,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了" + pageCount + "页、" + urlCount + "条记录"); cdl.countDown(); } /** * * 对多模块进行爬取 */ @Scheduled(cron = "0 46 16 ? 
* *") public void scheduled() { long startTime, endTime; logger.info("START!!!!!"); startTime = System.currentTimeMillis(); // 手機 CSDNProcessor mobile = new CSDNProcessor(cdl); mobile.setOriginalUrl("http://blog.csdn.net/mobile/newarticle.html"); mobile.setTaskName("mobile"); // web前端 CSDNProcessor web = new CSDNProcessor(cdl); web.setOriginalUrl("http://blog.csdn.net/web/newarticle.html"); web.setTaskName("web"); // 研發管理 CSDNProcessor software = new CSDNProcessor(cdl); software.setOriginalUrl("http://blog.csdn.net/software/newarticle.html"); software.setTaskName("software"); // 架構設計 CSDNProcessor enterprise = new CSDNProcessor(cdl); enterprise.setOriginalUrl("http://blog.csdn.net/enterprise/newarticle.html"); enterprise.setTaskName("enterprise"); // 程序語言 CSDNProcessor code = new CSDNProcessor(cdl); code.setOriginalUrl("http://blog.csdn.net/code/newarticle.html"); code.setTaskName("code"); // 互聯網 CSDNProcessor www = new CSDNProcessor(cdl); www.setOriginalUrl("http://blog.csdn.net/www/newarticle.html"); www.setTaskName("www"); // 數據庫 CSDNProcessor database = new CSDNProcessor(cdl); database.setOriginalUrl("http://blog.csdn.net/database/newarticle.html"); database.setTaskName("database"); // cloud CSDNProcessor cloud = new CSDNProcessor(cdl); cloud.setOriginalUrl("http://blog.csdn.net/cloud/newarticle.html"); cloud.setTaskName("cloud"); // 總和 CSDNProcessor other = new CSDNProcessor(cdl); other.setOriginalUrl("http://blog.csdn.net/other/newarticle.html"); other.setTaskName("other"); // 子线程开始 mobile.start(); web.start(); software.start(); enterprise.start(); code.start(); www.start(); database.start(); cloud.start(); other.start(); // 主线程等待 try { cdl.await(); } catch (InterruptedException e) { e.printStackTrace(); } endTime = System.currentTimeMillis(); logger.info("END!!!!!,总耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了" + pageCount + "页、" + urlCount + "条记录"); } public String getOriginalUrl() { return originalUrl; } public void setOriginalUrl(String originalUrl) { 
this.originalUrl = originalUrl; } public String getTaskName() { return taskName; } public void setTaskName(String taskName) { this.taskName = taskName; }}
阅读全文
2 0
- SPRING BOOT+WEBMAGIC
- 爬虫框架webmagic与spring boot的结合使用
- webmagic
- WebMagic
- webmagic
- 【Spring Boot】Spring Boot
- spring boot
- spring boot
- spring-boot
- spring boot
- Spring Boot
- spring boot
- spring boot
- Spring Boot
- spring boot
- Spring Boot
- spring boot
- spring boot
- 二叉搜索树转化为排序双向链表。可以使用中序线索化的方法去进行,在这里需要注意的是我们需要一个记录前一个访问节点的结点。 二叉搜索树转换前: 转换后: 思路:如果根节点的左子树存在,则一直去访
- KRPano
- LeetCode——122. Best Time to Buy and Sell Stock II
- 2n皇后问题
- win7环境下查询端口号占用及杀进程
- SPRING BOOT+WEBMAGIC
- Lua中的元表与元方法学习总结
- Harvesting the Low-hanging Fruits: Defending Against Automated Large-Scale Cyber-Intrusions by Focus
- 百度地图 key 错误,230
- Masonry自动计算cell行高:HYBMasonryAutoCellHeight
- [绍棠_Swift] Swift3.0笔记整理
- 使用vs2015编译lua语言
- Java IO流学习总结
- CF427D(Div2) Palindromic characteristics