
A First Look at the Lightweight Crawler Crawler4j

Crawler4j is a lightweight, multithreaded crawler.
GitHub: https://github.com/yasserg/crawler4j

This post is a set of study notes on the example, shipped with the Crawler4j source, that shows how to call the library.
I have kept the original comments from the source code to guard against misreading it.

```java
package code;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class crawler4FstTry {

    private static void myCrawlerTry() throws Exception {
        // Temporary folder where intermediate crawl data is stored
        String crawlStorageFolder = "./tmp";
        // Number of crawler threads
        int numberOfCrawlers = 5;

        // Initialize the crawl configuration
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);

        /*
         * Be polite: Make sure that we don't send more than 1 request per
         * second (1000 milliseconds between requests).
         */
        config.setPolitenessDelay(1000);

        /*
         * You can set the maximum crawl depth here. The default value is -1 for
         * unlimited depth.
         *
         * For example: if we crawl seed page A, A links to B, and B links to C,
         * then A is at depth 0, B at depth 1, and C at depth 2.
         */
        config.setMaxDepthOfCrawling(2);

        /*
         * You can set the maximum number of pages to crawl. The default value
         * is -1 for unlimited number of pages.
         */
        config.setMaxPagesToFetch(1000);

        /*
         * If you need to crawl Google or other sites that require a proxy,
         * you can configure one here:
         */
//      config.setProxyHost("");
//      config.setProxyPort(0);

        /*
         * This parameter controls whether the crawl is resumable.
         * For example, when it is set to true and the crawler is interrupted
         * or crashes, restarting it will resume the crawl from where it left off.
         */
        config.setResumableCrawling(false);

        /*
         * Instantiate the crawl controller.
         */
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        /*
         * We need at least one seed URL. The crawler starts from this URL,
         * extracts the other URLs found on the page, and keeps following them.
         */
        controller.addSeed("http://www.ics.uci.edu/");
//      controller.addSeed("http://www.ics.uci.edu/~lopes/");
//      controller.addSeed("http://www.ics.uci.edu/~welling/");

        /*
         * Start the crawl. This is a blocking operation, meaning that your code
         * will reach the line after this only when crawling is finished.
         */
        controller.start(BasicCrawler.class, numberOfCrawlers);
    }

    public static void main(String[] args) throws Exception {
        myCrawlerTry();
    }
}
```
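The `controller.start(...)` call above is blocking, so the method only returns once the whole crawl has finished. If you want the calling thread to stay free, crawler4j also offers a non-blocking start. The sketch below is my own addition, not part of the original example, and it assumes the `startNonBlocking`, `shutdown`, and `waitUntilFinish` methods on `CrawlController` in the crawler4j version used here:

```java
    /*
     * Sketch only (assumed API): same setup as myCrawlerTry(), but the crawl
     * is started without blocking the calling thread and stopped after a
     * fixed time limit.
     */
    private static void myNonBlockingCrawlerTry() throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("./tmp");
        config.setPolitenessDelay(1000);
        config.setMaxDepthOfCrawling(2);
        config.setMaxPagesToFetch(1000);

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed("http://www.ics.uci.edu/");

        // Returns immediately instead of blocking like controller.start(...)
        controller.startNonBlocking(BasicCrawler.class, 5);

        // The calling thread is free here; for example, enforce a time limit.
        Thread.sleep(30000);
        controller.shutdown();          // ask the crawler threads to stop
        controller.waitUntilFinish();   // wait for them to actually finish
    }
```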

The source code of BasicCrawler:

```java
package code;

import java.util.List;
import java.util.regex.Pattern;

import org.apache.http.Header;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class BasicCrawler extends WebCrawler {

    // Regular expression used to filter out URLs that point to binary/static files
    private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g"
            + "|png|tiff?|mid|mp2|mp3|mp4"
            + "|wav|avi|mov|mpeg|ram|m4v|pdf"
            + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    /**
     * You should implement this function to specify whether the given url
     * should be crawled or not (based on your crawling logic).
     */
    @Override
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
    }

    /**
     * This function is called when a page is fetched and ready to be processed
     * by your program. Here it just prints some information about the page.
     */
    @Override
    public void visit(Page page) {
        // Taking https://www.baidu.com/ as an example for the fields below:
        // internal document id assigned by the crawler
        int docid = page.getWebURL().getDocid();
        // full URL, e.g. https://www.baidu.com/
        String url = page.getWebURL().getURL();
        // domain, e.g. baidu.com
        String domain = page.getWebURL().getDomain();
        // path without query parameters, e.g. "/"
        String path = page.getWebURL().getPath();
        // sub-domain
        String subDomain = page.getWebURL().getSubDomain();
        // parent page, i.e. the page on which this URL was found
        String parentUrl = page.getWebURL().getParentUrl();
        // anchor text, i.e. the visible text of the link: <a href="...">anchor</a>
        String anchor = page.getWebURL().getAnchor();

        System.out.println("Docid: " + docid);
        System.out.println("URL: " + url);
        System.out.println("Domain: '" + domain + "'");
        System.out.println("Sub-domain: '" + subDomain + "'");
        System.out.println("Path: '" + path + "'");
        System.out.println("Parent page: " + parentUrl);
        System.out.println("Anchor text: " + anchor);

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            // visible text of the page
            String text = htmlParseData.getText();
            // full HTML source of the page
            String html = htmlParseData.getHtml();
            // all URLs found on this page
            List<WebURL> links = htmlParseData.getOutgoingUrls();

            System.out.println("Text length: " + text.length());
            System.out.println("Html length: " + html.length());
            System.out.println("Number of outgoing links: " + links.size());
        }

        // HTTP response headers returned by the server
        Header[] responseHeaders = page.getFetchResponseHeaders();
        if (responseHeaders != null) {
            System.out.println("Response headers:");
            for (Header header : responseHeaders) {
                System.out.println("\t" + header.getName() + ": " + header.getValue());
            }
        }

        System.out.println("=============");
    }
}
```
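BasicCrawler only prints each page it visits. If you want to aggregate results across all crawler threads instead, each `WebCrawler` can expose its per-thread results by overriding `getMyLocalData()`, and the controller can collect them after the crawl. The following variant is my own sketch, not part of the original example; it assumes `getMyLocalData()` on `WebCrawler` and `getCrawlersLocalData()` on `CrawlController` behave as in the crawler4j examples:

```java
package code;

import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * Sketch only (assumed API): counts pages and total visible-text length
 * instead of printing every page.
 */
public class StatsCrawler extends WebCrawler {

    private static final Pattern FILTERS = Pattern.compile(
            ".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|pdf|zip|rar|gz))$");

    private int visitedPages = 0;
    private long totalTextLength = 0;

    @Override
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
    }

    @Override
    public void visit(Page page) {
        visitedPages++;
        if (page.getParseData() instanceof HtmlParseData) {
            totalTextLength += ((HtmlParseData) page.getParseData()).getText().length();
        }
    }

    /**
     * Each crawler thread gets its own StatsCrawler instance; whatever is
     * returned here can be gathered by the controller after the crawl
     * (assumed: controller.getCrawlersLocalData()).
     */
    @Override
    public Object getMyLocalData() {
        return new long[] { visitedPages, totalTextLength };
    }
}
```

After `controller.start(StatsCrawler.class, numberOfCrawlers)` returns, iterating over `controller.getCrawlersLocalData()` and summing the per-thread arrays would give the totals for the whole crawl.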