scrala 初探(一)
来源:互联网 发布:备孕前准备 知乎 编辑:程序博客网 时间:2024/06/07 02:54
import com.gaocegege.scrala.core.spider.impl.DefaultSpider
import com.gaocegege.scrala.core.common.response.impl.HttpResponse
import org.jsoup.helper.StringUtil
import scala.collection.mutable.Set
import scala.collection.mutable.Map
import scala.util.Try

/**
 * Spider that walks the Douyu directory (`https://www.douyu.com/directory`).
 *
 * Flow, as implemented below:
 *  1. [[parse]] reads the directory page and issues one request per game, handled by [[category]].
 *  2. [[category]] reads a game's tag list and starts a paginated listing crawl ([[video]]) for
 *     every tag except "全部"; games without a tag list get a single listing crawl.
 *  3. [[video]] records every room link it has not seen before, requests the room page
 *     ([[individual]]) and then requests the next listing page.
 *  4. [[individual]] adds the avatar URL to the stored room record and prints it.
 *
 * NOTE(review): `Main` sets `workerCount = 10`, so callbacks presumably run on several workers,
 * while the collections below are plain unsynchronized mutable collections — confirm scrala's
 * threading model before relying on them under load.
 */
class SpiderDouYu extends DefaultSpider {

  /** Listing URL template for a whole game: (game slug, page number). */
  var main_url_format = "https://www.douyu.com/directory/game/%s?page=%d&isAjax=1"

  /** Listing URL template for a sub-category: (parent segment, child segment, page number). */
  var sub_url_format = "https://www.douyu.com/directory/subCate/%s/%s?page=%d&isAjax=1"

  /** Room records keyed by the resolved room URL. */
  var item_dict = Map[String, Map[String, String]]()

  /** game name -> tag label -> Map("url" -> first listing URL, "status" -> "true" | "false"). */
  var target_status_dict = Map[String, Map[String, Map[String, String]]]()

  /** Raw `href` values of every room link already recorded. */
  var urlsSet = Set[String]()

  // Not referenced anywhere in this file; kept so the class's members stay unchanged.
  abstract class MetaCallback

  /** Tag label of the "all rooms of this game" listing. */
  private val AllLabel = "全部"

  /**
   * Converts the popularity text shown on a room card into a number.
   *
   * Text containing "万" is read as a decimal count of ten-thousands; anything else is read as a
   * plain integer. Unparseable text yields 0 in both cases (the original only guarded the
   * plain-integer case, so a malformed "万" value threw `NumberFormatException`).
   */
  private def parsePopularity(text: String): Int = {
    val cleaned = text.trim
    val parsed =
      if (cleaned.contains("万")) {
        // Round instead of truncating so floating-point error cannot drop the result by one.
        Try(math.round(cleaned.replace("万", "").trim.toDouble * 10000).toInt)
      } else {
        Try(cleaned.toInt)
      }
    parsed.getOrElse(0)
  }

  /**
   * Callback for a game's landing page.
   *
   * @param url       URL of the game page this callback was requested with
   * @param game_name display name of the game, used as the key in `target_status_dict`
   */
  case class category(url: String, game_name: String) extends Function1[HttpResponse, Unit] {
    override def apply(response: HttpResponse): Unit = {
      val doc = response.getContentParser()
      // Pages that contain a "nonText" block are ignored entirely.
      if (doc.select("div[class=nonText]").isEmpty) {
        val statuses = Map[String, Map[String, String]]()
        target_status_dict(game_name) = statuses

        val tags = doc.select("div[class=tag_list] > ul").select("a")
        if (!tags.isEmpty) {
          for (i <- 0 until tags.size) {
            val tag = tags.get(i)
            val label = tag.text()
            val segments = tag.attr("data-href").split("/")
            if (label == AllLabel) {
              // Registered but not requested here: `video` crawls it once every other tag is done.
              if (segments.nonEmpty) {
                val allUrl = main_url_format.format(segments.last, 1)
                statuses(label) = Map[String, String]("url" -> allUrl, "status" -> "false")
              }
            } else if (segments.length >= 2) {
              val parent = segments(segments.length - 2)
              val child = segments.last
              val firstPage = sub_url_format.format(parent, child, 1)
              statuses(label) = Map[String, String]("url" -> firstPage, "status" -> "false")
              request(firstPage, video(firstPage, game_name, label, "2", "true", parent + "/" + child))
            }
            // Tags whose data-href lacks the two path segments the sub-category URL needs are
            // skipped; indexing them used to throw ArrayIndexOutOfBoundsException.
          }
        } else {
          // No tag list: crawl the game's own listing under the empty label.
          val slug = url.split("/").last
          val firstPage = main_url_format.format(slug, 1)
          statuses("") = Map[String, String]("url" -> firstPage, "status" -> "false")
          request(firstPage, video(firstPage, game_name, "", "2", "false", slug))
        }
      }
    }
  }

  /**
   * Callback for one page of a room listing.
   *
   * @param url          URL of the listing page this callback was requested with
   * @param game_name    key of the game in `target_status_dict`
   * @param category     tag label this listing belongs to ("" when the game has no tags)
   * @param page         number of the NEXT listing page to request, as a decimal string
   *                     (callers pass "2" together with the page-1 URL)
   * @param haveCategory passed through unchanged to follow-up callbacks; not read here
   * @param format_args  "/"-separated arguments for the URL template: "parent/child" for
   *                     sub-category listings, the game slug otherwise
   */
  case class video(url: String, game_name: String, category: String, page: String,
                   haveCategory: String, format_args: String) extends Function1[HttpResponse, Unit] {
    override def apply(response: HttpResponse): Unit = {
      val doc = response.getContentParser()
      val firstHref = doc.select("li > a").attr("href")
      val anchors = doc.select("li").select("a")

      // A listing is exhausted when the page's first link was already recorded, or when the page
      // has no links at all. The second condition is new: previously an empty page produced the
      // href "", which is never recorded, so the next page was requested forever.
      val exhausted = anchors.isEmpty || urlsSet.contains(firstHref)

      if (!exhausted) {
        for (i <- 0 until anchors.size) {
          val anchor = anchors.get(i)
          val href = anchor.attr("href")
          // `add` returns true only for hrefs not seen before.
          if (urlsSet.add(href)) {
            val roomUrl = StringUtil.resolve(url, href)
            val popularity = parsePopularity(anchor.select("span[class=dy-num fr]").text())
            val item = Map[String, String](
              "url" -> roomUrl,
              "img" -> anchor.select("span>img").attr("data-original"),
              "video_name" -> anchor.select("h3[class='ellipsis']").text(),
              "anchor" -> anchor.select("span[class=dy-name ellipsis fl]").text(),
              "origin_class" -> category,
              "room_id" -> anchor.attr("data-rid"),
              "popularity" -> popularity.toString
            )
            item_dict(roomUrl) = item
            request(roomUrl, individual(roomUrl))
          }
        }

        // `page` already names the next page. The original incremented it before use, so the
        // crawl went 1 -> 3 -> 4 ... and never fetched page 2.
        val args = format_args.split("/")
        val nextPage = page.toInt
        val nextUrl =
          if (url.contains("subCate")) sub_url_format.format(args(0), args(1), nextPage)
          else main_url_format.format(args(0), nextPage)
        request(nextUrl, video(nextUrl, game_name, category, (nextPage + 1).toString, haveCategory, format_args))
      } else {
        val statuses = target_status_dict(game_name)
        statuses(category)("status") = "true"

        // "全部" is left out of the completion check: it is only crawled from here, so requiring
        // its own status to be "true" first made the follow-up crawl unreachable.
        val othersDone = statuses.forall { case (label, state) =>
          label == AllLabel || state("status") != "false"
        }

        // The `category != AllLabel` guard keeps a finished "全部" crawl from re-triggering itself.
        if (category != AllLabel && othersDone) {
          // Reset every status. The original wrote `GameNameDict(key) -> (...)`, which only built
          // and discarded a tuple.
          statuses.values.foreach { state => state("status") = "false" }

          statuses.get(AllLabel).foreach { all =>
            val mainUrl = all("url")
            // The stored URL ends in "<slug>?page=1&isAjax=1"; only the slug is a template argument.
            val slug = mainUrl.split("/").last.takeWhile(_ != '?')
            request(mainUrl, video(mainUrl, game_name, AllLabel, "2", "false", slug))
          }
        }
      }
    }
  }

  /**
   * Callback for a single room page: stores the avatar URL on the room record and prints it.
   *
   * @param url resolved room URL; must already be a key of `item_dict`
   */
  case class individual(url: String) extends Function1[HttpResponse, Unit] {
    override def apply(response: HttpResponse): Unit = {
      val item = item_dict(url)
      val doc = response.getContentParser()
      val primary = doc.select("div[class=anchor-pic fl]>img").attr("src")
      // Fall back to the second selector when the first one yields nothing.
      val avatar =
        if (primary.nonEmpty) primary
        else doc.select("div[class=h_tx fl]>img").attr("src")
      item("avatar") = avatar
      println("item_hash : " + item)
    }
  }

  /** Entry URL of the crawl. */
  def startUrl: List[String] = List[String]("https://www.douyu.com/directory")

  /** Handles the directory page: issues one [[category]] request per listed game. */
  def parse(response: HttpResponse): Unit = {
    val games = response.getContentParser().select("ul[id=live-list-contentbox]>li[class=unit ]")
    for (i <- 0 until games.size) {
      val link = games.get(i).select("a")
      val gameUrl = StringUtil.resolve(startUrl.head, link.attr("href"))
      val gameName = link.select("p").text()
      request(gameUrl, category(gameUrl, gameName))
    }
  }
}

/** Program entry point: runs [[SpiderDouYu]] with ten workers. */
object Main {
  def main(args: Array[String]): Unit = {
    val spider = new SpiderDouYu()
    spider.workerCount = 10
    spider.begin
  }
}
0 0
- scrala 初探(一)
- Hibernate初探(一)
- COM初探(一)
- (一)python初探
- tolua++初探(一)
- GDB初探(一)
- Android初探(一)
- quartz初探(一)
- Dalvik 初探 (一)
- Hadoop初探(一)
- Mahout初探(一)
- AngularJS初探(一)
- scala初探(一)
- trustZone初探(一)
- ElasticSearch初探(一)
- Oracle初探(一)
- TrustZone初探(一)
- Latex初探(一)
- solr5.5.0 中文分词<三>
- HTML5 FileReader 读取txt文件
- ASP.NET中引用JS不能调用JQuery问题 解决
- 数据库____操作表中数据
- Spring配置项<context:annotation-config/>解释说明
- scrala 初探(一)
- c/c++中字符输入问题
- LINUX系统启动流程
- java集合类详解
- bzoj3295【CDQ分治】
- Ural1540 Battle for the Ring
- 通过键盘输入一串小写字母(a~z)组成的字符串。请编写一个字符串过滤程序,若字符串中出现多个相同的字符,将非首次出现的字符过滤掉。
- HttpURLConnection用法详解
- 分糖果问题--蓝桥杯