WebCollector: A First Look (Part 3)
This installment disables WebCollector's duplicate-URL detection by assigning each CrawlDatum a unique key. Writing this in Java is considerably more verbose than in Python, but as the thread count grows the crawl speed becomes comparable to Scrapy, at the cost of some extra memory. Since the crawler is multi-threaded, all Redis writes go through a Jedis connection pool; the pool helper code is from: http://blog.csdn.net/songylwq/article/details/26008327
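WebCollector decides whether a CrawlDatum has been seen before by its key, which defaults to the URL, so re-submitting the pagination URL of an Ajax listing would normally be dropped by the deduplicator. Prefixing a timestamp makes every key unique and effectively switches dedup off for that datum. A minimal sketch of the trick (the URL here is just a placeholder):

```java
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;

// Default key == URL, so a URL already in the crawl DB is skipped.
// A timestamp prefix makes the key unique on every submission, letting the
// same URL be re-fetched (needed for polling the paged Ajax listings below).
String url = "https://www.douyu.com/directory/game/LOL?page=1&isAjax=1";
CrawlDatum datum = new CrawlDatum(url).meta("category_depth", "video");
datum.setKey(System.currentTimeMillis() + "++" + url);
```

The full crawler follows. It walks four levels: the game directory, each game's category tags, the paged room listings, and finally each room page (for the anchor's avatar), pushing finished items into a Redis list.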
```java
package DouYuSingle;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import com.ljq.utils.RedisAPI;

import java.util.HashMap;
import java.util.HashSet;

public class SpiderDouYu_Single extends BreadthCrawler {

    // redis connection pool, shared by all crawler threads
    JedisPool pool = RedisAPI.getPool();

    // URL templates for the "all rooms of a game" and "sub-category" Ajax listings
    public String main_url_format = "https://www.douyu.com/directory/game/%s?page=%d&isAjax=1";
    public String sub_url_format = "https://www.douyu.com/directory/subCate/%s/%s?page=%d&isAjax=1";

    // Containers (shared across crawler threads):
    // room url -> item fields; game -> category -> {url, status}; seen room urls
    public HashMap<String, HashMap<String, String>> item_dict = new HashMap<>();
    public HashMap<String, HashMap<String, HashMap<String, String>>> target_status_dict = new HashMap<>();
    public HashSet<String> urlsSet = new HashSet<>();

    public SpiderDouYu_Single(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        this.addSeed(new CrawlDatum("https://www.douyu.com/directory")
                .meta("category_depth", "game_name"));
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        if (page.meta("category_depth").equals("game_name")) {
            // Level 1: the directory page, one <li> per game
            for (Element li_Element : page.select("ul[id=live-list-contentbox]>li[class=unit ]")) {
                String url = StringUtil.resolve(page.getUrl(), li_Element.select("a").attr("href"));
                String game_name = li_Element.select("a").select("p").text();
                CrawlDatum crawl_datum_ext = new CrawlDatum(url);
                crawl_datum_ext.meta("game_name", game_name);
                crawl_datum_ext.meta("category_depth", "category");
                // timestamp prefix -> unique key -> dedup disabled for this datum
                crawl_datum_ext.setKey(System.currentTimeMillis() + "++" + url);
                next.add(crawl_datum_ext);
            }
        } else if (page.meta("category_depth").equals("category")) {
            // Level 2: one game's page; it may or may not expose sub-category tags
            if (!page.select("div[class=nonconText]").isEmpty())
                return;
            Elements tag_list = page.select("div[class=tag_list] > ul").select("a");
            if (!tag_list.isEmpty()) {
                target_status_dict.put(page.meta("game_name"),
                        new HashMap<String, HashMap<String, String>>());
                for (Element tag_Element : tag_list) {
                    String[] url_split_array = tag_Element.attr("data-href").split("/");
                    String label = tag_Element.text();
                    String url;
                    if (!"全部".equals(label))
                        url = String.format(sub_url_format,
                                url_split_array[url_split_array.length - 2],
                                url_split_array[url_split_array.length - 1], 1);
                    else
                        url = String.format(main_url_format,
                                url_split_array[url_split_array.length - 1], 1);
                    HashMap<String, String> map = new HashMap<>();
                    map.put("url", url);
                    map.put("status", "false");
                    target_status_dict.get(page.meta("game_name")).put(label, map);
                    if (!"全部".equals(label)) {
                        CrawlDatum crawl_datum_ext = new CrawlDatum(url);
                        crawl_datum_ext.meta("format_args",
                                url_split_array[url_split_array.length - 2] + "/"
                                        + url_split_array[url_split_array.length - 1]);
                        // propagate game_name: level 3 looks it up in target_status_dict
                        crawl_datum_ext.meta("game_name", page.meta("game_name"));
                        crawl_datum_ext.meta("category", label);
                        crawl_datum_ext.meta("category_depth", "video");
                        crawl_datum_ext.meta("page", "2");
                        crawl_datum_ext.meta("haveCategorys", "true");
                        crawl_datum_ext.setKey(System.currentTimeMillis() + "++" + url);
                        next.add(crawl_datum_ext);
                    }
                }
            } else {
                // no sub-categories: crawl the game's "all rooms" listing directly
                String[] url_split_array = page.getUrl().split("/");
                String url = String.format(main_url_format,
                        url_split_array[url_split_array.length - 1], 1);
                target_status_dict.put(page.meta("game_name"),
                        new HashMap<String, HashMap<String, String>>());
                HashMap<String, String> map = new HashMap<>();
                map.put("url", url);
                map.put("status", "false");
                target_status_dict.get(page.meta("game_name")).put("", map);
                CrawlDatum crawl_datum_ext = new CrawlDatum(url);
                crawl_datum_ext.meta("format_args", url_split_array[url_split_array.length - 1]);
                crawl_datum_ext.meta("game_name", page.meta("game_name"));
                crawl_datum_ext.meta("category", "");
                crawl_datum_ext.meta("category_depth", "video");
                crawl_datum_ext.meta("page", "2");
                crawl_datum_ext.setKey(System.currentTimeMillis() + "++" + page.getUrl());
                next.add(crawl_datum_ext);
            }
        } else if (page.meta("category_depth").equals("video")) {
            // Level 3: one page of a room listing; stop paging when it adds no new room
            boolean will_return = false;
            HashSet<String> temp_urlsSet = (HashSet<String>) urlsSet.clone();
            for (Element a : page.select("li > a"))
                temp_urlsSet.add(StringUtil.resolve(page.getUrl(), a.attr("href")));
            if (temp_urlsSet.size() == urlsSet.size())
                will_return = true;
            if (!will_return) {
                String page_url = page.getUrl();
                for (Element video_Element : page.select("li").select("a")) {
                    String VideoUrl = StringUtil.resolve(page_url, video_Element.attr("href"));
                    if (!urlsSet.contains(VideoUrl))
                        urlsSet.add(VideoUrl);
                    else
                        continue;
                    HashMap<String, String> item_hash = new HashMap<>();
                    item_hash.put("url", VideoUrl);
                    item_hash.put("img", video_Element.select("span>img").attr("data-original"));
                    item_hash.put("video_name", video_Element.select("h3[class='ellipsis']").text());
                    item_hash.put("anchor", video_Element.select("span[class=dy-name ellipsis fl]").text());
                    item_hash.put("origin_class", page.meta("category"));
                    item_hash.put("room_id", video_Element.attr("data-rid"));
                    // popularity: "1.5万" -> 15000; plain numbers parsed directly
                    String popularity_text = video_Element.select("span[class=dy-num fr]").text();
                    int popularity;
                    if (popularity_text.contains("万")) {
                        popularity_text = popularity_text.replace("万", "");
                        popularity = (int) (Double.parseDouble(popularity_text) * 10000);
                    } else {
                        try {
                            popularity = Integer.parseInt(popularity_text);
                        } catch (Exception e) {
                            popularity = 0;
                        }
                    }
                    item_hash.put("popularity", String.valueOf(popularity));
                    item_dict.put(item_hash.get("url"), item_hash);
                    System.out.println("the size of item_url : " + item_dict.keySet().size());
                    CrawlDatum crawl_datum_ext = new CrawlDatum(item_hash.get("url"));
                    crawl_datum_ext.meta("category_depth", "individual");
                    crawl_datum_ext.meta("ori_url", item_hash.get("url"));
                    crawl_datum_ext.setKey(System.currentTimeMillis() + "++" + item_hash.get("url"));
                    next.add(crawl_datum_ext);
                }
                // queue the next page of this listing
                String[] format_args = page.meta("format_args").split("/");
                int page_num = Integer.parseInt(page.meta("page")) + 1;
                String url;
                if (page.getUrl().contains("subCate"))
                    url = String.format(sub_url_format, format_args[0], format_args[1], page_num);
                else
                    url = String.format(main_url_format, format_args[0], page_num);
                CrawlDatum crawl_datum_ext = new CrawlDatum(url);
                crawl_datum_ext.meta("format_args", page.meta("format_args"));
                crawl_datum_ext.meta("game_name", page.meta("game_name"));
                crawl_datum_ext.meta("category", page.meta("category"));
                crawl_datum_ext.meta("category_depth", "video");
                crawl_datum_ext.meta("page", String.valueOf(page_num));
                if (page.meta("haveCategorys") != null)
                    crawl_datum_ext.meta("haveCategorys", "true");
                crawl_datum_ext.setKey(System.currentTimeMillis() + "++" + url);
                next.add(crawl_datum_ext);
            } else {
                // this category is exhausted; once every category of the game is
                // done, reset their status and re-crawl the "全部" listing
                target_status_dict.get(page.meta("game_name"))
                        .get(page.meta("category")).put("status", "true");
                boolean all_end = true;
                HashMap<String, HashMap<String, String>> GameNameDict =
                        target_status_dict.get(page.meta("game_name"));
                for (HashMap<String, String> h : GameNameDict.values()) {
                    if (h.get("status").equals("false")) {
                        all_end = false;
                        break;
                    }
                }
                if (all_end) {
                    for (String key : GameNameDict.keySet())
                        GameNameDict.get(key).put("status", "false");
                    if (GameNameDict.containsKey("全部")) {
                        String MainUrl = GameNameDict.get("全部").get("url");
                        String[] url_split_array = MainUrl.split("/");
                        CrawlDatum crawl_datum_ext = new CrawlDatum(MainUrl);
                        crawl_datum_ext.meta("format_args", url_split_array[url_split_array.length - 1]);
                        crawl_datum_ext.meta("game_name", page.meta("game_name"));
                        crawl_datum_ext.meta("category", "");
                        crawl_datum_ext.meta("category_depth", "video");
                        crawl_datum_ext.meta("page", "2");
                        crawl_datum_ext.setKey(System.currentTimeMillis() + "++" + page.getUrl());
                        next.add(crawl_datum_ext);
                    }
                }
            }
        } else if (page.meta("category_depth").equals("individual")) {
            // Level 4: the room page itself, fetched only for the anchor's avatar
            HashMap<String, String> item_hash = item_dict.get(page.meta("ori_url"));
            String avatar = page.select("div[class=anchor-pic fl]>img").attr("src");
            if (avatar.length() == 0)
                avatar = page.select("div[class=h_tx fl]>img").attr("src");
            item_hash.put("avatar", avatar);
            // push the finished item into Redis through a pooled connection
            Jedis jedis = pool.getResource();
            jedis.lpush("DOUYU_ITEMS", item_hash.toString());
            RedisAPI.returnResource(pool, jedis);
        }
    }

    public static void main(String[] args) throws Exception {
        SpiderDouYu_Single crawler = new SpiderDouYu_Single("SpiderDouYu_Single", true);
        crawler.setThreads(50);
        // crawler.setTopN(100);
        /* start crawl with depth of 4 */
        crawler.start(4);
    }
}
```
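The `RedisAPI` helper is not reproduced here; the real code lives at the linked CSDN article. A minimal sketch of what this crawler needs it to provide, assuming standard Jedis pool usage (class name, methods, host, port, and pool sizes below all follow the calls in the code above or are placeholder assumptions):

```java
package com.ljq.utils;

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;

// Hypothetical reconstruction of the pool helper referenced above.
public class RedisAPI {

    private static JedisPool pool;

    // Lazily build one shared pool; Jedis instances themselves are not
    // thread-safe, so each crawler thread borrows its own connection.
    public static synchronized JedisPool getPool() {
        if (pool == null) {
            JedisPoolConfig config = new JedisPoolConfig();
            config.setMaxTotal(100); // at least as many as crawler threads
            config.setMaxIdle(20);
            pool = new JedisPool(config, "127.0.0.1", 6379);
        }
        return pool;
    }

    // Hand the connection back to the pool after use.
    public static void returnResource(JedisPool pool, Jedis jedis) {
        if (jedis != null) {
            jedis.close();
        }
    }
}
```

With `setThreads(50)`, the pool's maximum size should be at least 50, otherwise threads will block in the `individual` branch waiting for a free connection.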