抓取36氪北极社区内容

来源:互联网 发布:相机软件 编辑:程序博客网 时间:2024/04/27 16:44
private static void crawl36() {try {String urls = "http://www.36kr.com/topics/recent?page=1";String site = "http://www.36kr.com";HttpURLConnection con = getHttpURLConnection(urls);con.setRequestProperty("User-Agent","Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25");con.setRequestMethod("GET");String contentType = con.getContentType();if (contentType.indexOf("charset") == -1) {contentType = "GBK";} else {contentType = contentType.substring(contentType.indexOf("=") + 1, contentType.length());}Document document = Jsoup.parse(IOUtils.toString(con.getInputStream(), contentType));Elements elements = document.select(".infos");List<News> listNews = new ArrayList<News>();News news = null;for (Element element : elements) {Element elementUrl = element.select(".title").get(0);String url = elementUrl.getElementsByTag("a").attr("href");String title = elementUrl.getElementsByTag("a").text();String tag = element.select(".node").text();news = new News();news.setUrl(site + url);news.setTitle(title);news.setTag(tag);listNews.add(news);}System.out.println("抓取" + listNews.size() + "条");for (int i = 0; i < listNews.size(); i++) {System.out.print(listNews.get(i));}} catch (Exception e) {}}待续


                                             
0 0