webcollector 初探(二)

来源:互联网 发布:apache tomcat 配置 编辑:程序博客网 时间:2024/06/14 12:43

将webcollector初探(一)中的空架子加以填充,稍加修改,

就可以得到类似scrapy爬取规则的java多线程爬虫,不知道速度上与scrapy的高下,

这是两门语言的比较,也是多线程与Twisted的基于事件的异步模型的比较,在以后会

探究二者速度上的差异,先给出一个爬取斗鱼视频的简单例子。

从分类页面到具体标签,再到第一个视频页面的内容抽取。

这里与scrapy的一个不同在于,原创的webcollector框架只执行一个callback函数(用于页面处理)，

相反scrapy可以实现多个parse指定。

故使用meta作为区分机制,进行不同的页面处理逻辑。

下面是简单的示例代码:


package DouYuSingle;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Element;
import org.jsoup.helper.StringUtil;
import org.jsoup.select.Elements;
import java.util.ArrayList;

/**
 * Value holder for one scraped Douyu live-room entry.
 * NOTE(review): kept as a public-field bag with the original lowercase name
 * to preserve the external interface; a record/POJO rename would break callers.
 */
class item {
    public String video_name;   // room title (h3.ellipsis)
    public String origin_class; // unused in this spider — kept for interface compatibility
    public String source;       // always "斗鱼" here
    public String url;          // absolute room URL
    public String anchor;       // streamer name
    public String avatar;       // unused in this spider — kept for interface compatibility
    public int popularity;      // viewer count, expanded from the "万" suffix
    public String img;          // room thumbnail (lazy-load data-original attr)
    public int fun_num;         // unused in this spider — kept for interface compatibility
    public ArrayList<String> labels = new ArrayList<>();
}

/**
 * Single-callback Douyu crawler: directory page -> per-game tag pages -> room
 * listings. WebCollector has only one visit() callback (unlike scrapy's
 * multiple parse methods), so the "category_depth" meta key routes each page
 * to the right parsing branch.
 */
public class SpiderDouYu_Single extends BreadthCrawler {

    /** Accumulated scrape results (filled by the "video" branch of visit()). */
    public ArrayList<item> item_list = new ArrayList<>();

    /**
     * @param crawlPath directory for WebCollector's Berkeley-DB crawl state
     * @param autoParse whether WebCollector auto-extracts links (seeds below
     *                  are added explicitly, so links are driven by visit())
     */
    public SpiderDouYu_Single(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        // Seed: the game directory; tagged so visit() knows how to parse it.
        this.addSeed(new CrawlDatum("https://www.douyu.com/directory")
                .meta("category_depth", "game_name"));
    }

    /**
     * Parse a Douyu viewer-count string such as "12.5万" into an int.
     * Counts without the "万" suffix yield 0 (matches the original logic).
     */
    private static int parsePopularity(String popularity_text) {
        if (popularity_text == null || !popularity_text.contains("万")) {
            return 0;
        }
        // BUGFIX: String.replace returns a NEW string; the original discarded
        // the result, so Double.parseDouble("12.5万") always threw.
        String numeric = popularity_text.replace("万", "").trim();
        try {
            // BUGFIX: cast the PRODUCT, not the parsed value. The original
            // "(int) Double.parseDouble(s) * 10000" truncated 12.5 -> 12
            // before multiplying (120000 instead of 125000).
            return (int) (Double.parseDouble(numeric) * 10000);
        } catch (NumberFormatException ignored) {
            return 0; // malformed count — treat as unknown rather than kill the worker thread
        }
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        // BUGFIX: constant-first equals() — page.meta(...) may be null for
        // datums that carry no "category_depth", which previously NPE'd.
        String depth = page.meta("category_depth");

        if ("game_name".equals(depth)) {
            // Depth 1: the directory page — one <li> per game category.
            System.out.println("call if game_name");
            for (Element li_Element : page.select("ul[id=live-list-contentbox]>li")) {
                // Resolve relative hrefs against the page URL.
                String url = StringUtil.resolve(page.getUrl(), li_Element.select("a").attr("href"));
                next.add(new CrawlDatum(url).meta("category_depth", "category"));
                System.out.println("url : " + url);
            }
        } else if ("category".equals(depth)) {
            // Depth 2: a game page — follow its tag links, carrying the tag
            // text along as the "category" meta for the room items.
            System.out.println("call if category");
            if (!page.select("div[class=nonconText]").isEmpty()) {
                return; // "no content" placeholder page — nothing to crawl
            }
            Elements tag_list = page.select("div[class=tag_list] > ul").select("a");
            if (!tag_list.isEmpty()) {
                for (Element tag_Element : tag_list) {
                    String url = StringUtil.resolve(page.getUrl(), tag_Element.attr("data-href"));
                    String label = tag_Element.text();
                    CrawlDatum crawl_datum_ext = new CrawlDatum(url);
                    crawl_datum_ext.meta("category", label);
                    crawl_datum_ext.meta("category_depth", "video");
                    next.add(crawl_datum_ext);
                }
            }
        } else if ("video".equals(depth)) {
            // Depth 3: a room listing — extract one item per room card.
            System.out.println("call if video");
            String page_url = page.getUrl();
            for (Element video_Element : page.select("div[id=live-list-content]>ul").select("a")) {
                item Item = new item();
                Item.source = "斗鱼";
                Item.url = StringUtil.resolve(page_url, video_Element.attr("href"));
                Item.img = video_Element.select("span>img").attr("data-original");
                Item.video_name = video_Element.select("h3[class='ellipsis']").text();
                Item.anchor = video_Element.select("span[class=dy-name ellipsis fl]").text();
                Item.labels.add(page.meta("category"));
                Item.popularity = parsePopularity(video_Element.select("span[class=dy-num fr]").text());
                System.out.println(Item.anchor + " :" + Item.popularity);
                item_list.add(Item);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        SpiderDouYu_Single crawler = new SpiderDouYu_Single("SpiderDouYu_Single", true);
        crawler.setThreads(5);
        crawler.setTopN(100);
        /* start crawl with depth of 4 */
        crawler.start(4);
    }
}



0 0
原创粉丝点击