WebMagic 爬取腾讯 NBA 数据

来源:互联网 发布:海康录像机设备域名 编辑:程序博客网 时间:2024/05/29 05:53
package cn.taneroom.webmagic.demo.processor;

import java.util.List;

import org.apache.commons.collections.CollectionUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.webmagic.utils.HttpConstant;

/**
 * Crawls Tencent NBA articles.
 *
 * <p>The crawl starts from a JSONP "tag articles" list endpoint. Every article URL found in the
 * list response is queued as a follow-up request, and each matching detail page has its title
 * and (truncated) article markup printed to standard output.
 *
 * @author TANZHEN553
 */
public class TencentNbaPageProcessor implements PageProcessor {

    /** Regex matching the JSONP list endpoint that returns the NBA-tagged articles. */
    private static final String LIST_URL = "http://tags\\.open\\.qq\\.com/interface/tag/articles\\.php\\?callback=tagListCb&p=\\d+&l=\\d+&tag=NBA&oe=gbk&ie=utf-8&source=web&site=sports&_=\\d+";

    /** Regex matching an article detail page. */
    private static final String DETAIL_URL = "http://sports\\.qq\\.com/a/\\d+/\\d+\\.htm";

    /** JsonPath expression selecting every article URL from the list payload. */
    private static final String ARTICLE_URL_JSON_PATH = "$.data.articles[*].url";

    /** Maximum number of characters of article markup that is printed. */
    private static final int MAX_CONTENT_LENGTH = 1000;

    /** Crawl configuration for the target site: retry count and sleep time between requests. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Dispatches a downloaded page to the list or detail handler based on its URL.
     * Pages whose URL matches neither pattern are ignored.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().regex(LIST_URL).match()) {
            processListPage(page);
        } else if (page.getUrl().regex(DETAIL_URL).match()) {
            processDetailPage(page);
        }
    }

    /** Extracts the article URLs from a JSONP list response and queues each one for crawling. */
    private void processListPage(Page page) {
        String json = stripJsonpWrapper(page.getRawText());
        if (json == null) {
            // Not a JSONP body (e.g. an error page): report it instead of failing inside substring().
            System.err.println("Unexpected list response, skipped: " + page.getUrl());
            return;
        }
        List<String> detailUrls = new JsonPathSelector(ARTICLE_URL_JSON_PATH).selectList(json);
        if (CollectionUtils.isNotEmpty(detailUrls)) {
            for (String detailUrl : detailUrls) {
                page.addTargetRequest(createGetRequest(detailUrl));
            }
        }
    }

    /** Prints the title and the (possibly truncated) article markup of a detail page. */
    private void processDetailPage(Page page) {
        String title = page.getHtml().xpath("//div[@class='qq_article']/div[@class='hd']/h1/text()").get();
        String content = page.getHtml().xpath("//div[@id='Cnt-Main-Article-QQ']").get();
        // get() yields null when the XPath matches nothing, so guard before measuring the length.
        if (content != null && content.length() > MAX_CONTENT_LENGTH) {
            content = content.substring(0, MAX_CONTENT_LENGTH) + "......";
        }
        System.out.println("标题:\n" + title);
        System.out.println("内容:\n" + content);
    }

    /**
     * Returns the payload between the first {@code '('} and the last {@code ')'} of a JSONP
     * response, so trailing characters such as {@code ';'} or a newline do not corrupt the JSON.
     *
     * @param rawText the raw response body, may be {@code null}
     * @return the wrapped payload, or {@code null} if {@code rawText} is {@code null} or does not
     *         contain a parenthesised payload
     */
    private static String stripJsonpWrapper(String rawText) {
        if (rawText == null) {
            return null;
        }
        int open = rawText.indexOf('(');
        int close = rawText.lastIndexOf(')');
        if (open < 0 || close <= open) {
            return null;
        }
        return rawText.substring(open + 1, close);
    }

    /**
     * Creates a GET request for the given URL.
     *
     * @param url the URL to request
     * @return a request for {@code url} with its method set to GET
     */
    private Request createGetRequest(String url) {
        Request request = new Request(url);
        request.setMethod(HttpConstant.Method.GET);
        return request;
    }

    /**
     * Starts the crawl from the list endpoint using three threads.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // NOTE(review): "p" and "l" look like page index and page size rather than a
        // start/end range - confirm against the endpoint before relying on the names.
        int start = 1;
        int end = 20;
        String seedUrl = "http://tags.open.qq.com/interface/tag/articles.php?callback=tagListCb&p=" + start
                + "&l=" + end
                + "&tag=NBA&oe=gbk&ie=utf-8&source=web&site=sports&_=" + System.currentTimeMillis();
        Spider.create(new TencentNbaPageProcessor()).addUrl(seedUrl).thread(3).run();
    }
}