用 WebCollector 写一个 163 新闻网站的爬虫，并定时启动

来源：互联网 | 发布：app 暂无数据图片素材 | 编辑：程序博客网 | 时间：2024/06/05 19:33

package com.yd.ibuznet.modules.crawl;

import java.util.Timer;
import java.util.TimerTask;

import org.jsoup.nodes.Document;

import com.yd.ibuznet.core.util.DateUtil;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;

/**
 * Crawler for the 163 news site that prints the title, source and body of
 * every article page whose URL matches today's date-based pattern.
 *
 * <p>{@link #main(String[])} re-runs the crawl on a {@link Timer}.
 */
public class NewsCrawler extends BreadthCrawler {

    /**
     * Regex for the article URLs of the day this crawler was created.
     * Computed once so that the crawl filter added in {@code main} and the
     * check in {@link #visit(Page, CrawlDatums)} always agree, even when a
     * crawl runs across midnight.
     */
    private final String newsUrlRegex;

    /**
     * Creates the crawler and registers the exclusion rules.
     *
     * @param crawlPath directory handed to {@link BreadthCrawler} for its crawl state
     * @param autoParse whether the framework should extract follow-up links itself
     */
    public NewsCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        this.newsUrlRegex = buildNewsUrlRegex();
        /* Do not fetch images. */
        this.addRegex("-.*\\.(jpg|png|gif).*");
        /* Do not crawl URLs that contain '#'. */
        this.addRegex("-.*#.*");
    }

    /**
     * Builds the regex for today's article URLs
     * ({@code http://news.163.com/<yy>/<MMdd>/...html}).
     */
    private static String buildNewsUrlRegex() {
        // NOTE(review): assumes DateUtil returns zero-padded month/day strings - confirm.
        int year = Integer.valueOf(DateUtil.getYear()) - 2000;
        String day = DateUtil.getMonth() + DateUtil.getDay();
        return "http://news.163.com/" + year + "/" + day + "/" + ".*html";
    }

    /**
     * Returns the text of the first element matching {@code cssSelector},
     * or an empty string when nothing matches.
     */
    private static String firstText(Page page, String cssSelector) {
        // The match list is checked first because first() yields null on an empty match.
        if (page.select(cssSelector).isEmpty()) {
            return "";
        }
        return page.select(cssSelector).first().text();
    }

    /**
     * Prints URL, title, body and source of a page if it is one of today's
     * article pages; every other page is ignored.
     *
     * @param page the fetched page
     * @param arg1 follow-up tasks; unused because no manual tasks are added here
     */
    @Override
    public void visit(Page page, CrawlDatums arg1) {
        /* Only article pages are of interest. */
        if (!page.matchUrl(newsUrlRegex)) {
            return;
        }

        /* Extract title, source and body with CSS selectors. */
        String title = firstText(page, "div[id=epContentLeft]>h1");
        if (title.isEmpty()) {
            // URL looks like an article but the page does not use the article
            // layout; skip it instead of failing in the worker thread.
            return;
        }
        String source = firstText(page, "div[class=ep-source cDGray]>span");
        String content = firstText(page, "div#endText");

        @SuppressWarnings("deprecation")
        String url = page.getUrl();

        System.out.println("地址:" + url);
        System.out.println("标题:" + title);
        System.out.println("内容:" + content);
        System.out.println("目标来源:" + source);
        /*
         * Additional crawl tasks could be added manually through the
         * CrawlDatums argument. With autoParse enabled (second constructor
         * argument) the crawler itself extracts links that match the
         * registered regex rules, so nothing is added here.
         */
    }

    /**
     * Starts a timer that runs a fresh crawl immediately and then again
     * 20000 ms after each run finishes being scheduled (fixed-delay).
     *
     * @param args unused
     * @throws Exception kept for signature compatibility
     */
    public static void main(String[] args) throws Exception {
        Timer timer = new Timer();
        timer.schedule(new TimerTask() {
            @Override
            public void run() {
                // The whole run is guarded: an exception escaping run() would
                // terminate the Timer thread and silently stop all later crawls.
                try {
                    NewsCrawler crawler = new NewsCrawler("crawl", true);
                    crawler.addSeed("http://news.163.com/");
                    /* Follow only today's article URLs. */
                    crawler.addRegex(crawler.newsUrlRegex);
                    crawler.setResumable(false);
                    /* Number of threads. */
                    crawler.setThreads(50);
                    /* Upper bound of pages fetched per iteration. */
                    crawler.setTopN(1000);
                    /* Start crawl with depth of 4. */
                    crawler.start(4);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }, 0, 20000); // first run immediately, then every 20000 ms
    }

}

下图为爬完后打印出来的数据

阅读全文

0 0