webcollector爬虫demo

来源:互联网 发布:如何用java编写程序 编辑:程序博客网 时间:2024/06/05 18:31

由于我们公司第二季度亏了7-8亿,所以项目组没有多余的资金让我们去正规渠道买数据。然后我就走向了一条爬虫的不归路。

其实Java爬虫有很多开源的框架,这边我选择的是webcollector这个中小型的框架(官网:https://github.com/CrawlScript/WebCollector,教程文档:http://datahref.com/archives/category/webcollector%E6%95%99%E7%A8%8B)


爬虫新手一只,现在我把代码贴出来,我们共同学习:



import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;import cn.edu.hfut.dmic.webcollector.model.Page;import cn.edu.hfut.dmic.webcollector.net.HttpRequest;import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;import cn.edu.hfut.dmic.webcollector.util.CharsetDetector;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.util.EntityUtils;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;/** * 这段代码是获取 * http://www.variflight.com/sitemap.html?AE71649A58c77= * 页面的所有的航班号信息的一小段爬虫代码 */public class DemoPostCrawlerTest extends BreadthCrawler {    public DemoPostCrawlerTest(String crawlPath, boolean autoParse) {        super(crawlPath, autoParse);    }    @Override    public void visit(Page page, CrawlDatums next) {        CloseableHttpClient client = HttpClients.createDefault();        String url = page.getUrl();        try {            HttpGet get = new HttpGet(url);            HttpResponse response = client.execute(get);            HttpEntity entity = response.getEntity();            byte[] content = EntityUtils.toByteArray(entity);            String charset = CharsetDetector.guessEncoding(content);            String html = new String(content, charset);            Document doc = Jsoup.parse(html, url);            Elements links = doc.select("a[href]");            for (int i = 0; i < links.size(); i++) {                Element link = links.get(i);                String href = link.attr("abs:href");                if (href.startsWith("http://www.variflight.com/flight/fnum/")) {                    String flightNo = href.replace("http://www.variflight.com/flight/fnum/", "").split("\\.")[0];                    System.out.println(flightNo);                }    
        }        } catch (Exception e) {        }    }    public static void main(String[] args) throws Exception {        DemoPostCrawlerTest crawler = new DemoPostCrawlerTest("crawl", true);        crawler.setThreads(20);        crawler.addSeed("http://www.variflight.com/sitemap.html?AE71649A58c77=");        crawler.start(3);    }}


pom.xml

<!-- Maven dependencies for the WebCollector crawler demo. -->
<dependencies>
    <!-- WebCollector: the crawler framework providing BreadthCrawler, Page, CrawlDatums. -->
    <dependency>
        <groupId>cn.edu.hfut.dmic.webcollector</groupId>
        <artifactId>WebCollector</artifactId>
        <version>2.31</version>
    </dependency>
    <!-- Apache HttpClient: used in visit() to fetch page bytes manually. -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5</version>
    </dependency>
    <!-- jsoup: HTML parsing and link extraction (Jsoup.parse, doc.select). -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.8.3</version>
    </dependency>
</dependencies>

0 0
原创粉丝点击