【网络爬虫】使用HttpClient4.3.5抓取数据
来源:互联网 发布:吃奶酪 知乎 编辑:程序博客网 时间:2024/06/05 17:21
使用的jar包——Apache HttpClient 4.3.5
下载地址:
http://hc.apache.org/downloads.cgi
结构:
代码结构:
具体代码:
抓取结果封装
/**
 * Value holder describing the outcome of one crawl attempt.
 *
 * @author tsj-pc
 */
public class CrawlResultPojo {

    /** Whether the crawl was reported as successful. */
    private boolean isSuccess;

    /** Text content of the fetched page; {@code null} until set. */
    private String pageContent;

    /** Status code of the crawl (the misspelled name is part of the public accessors). */
    private int httpStatuCode;

    /** @return the success flag */
    public boolean isSuccess() {
        return this.isSuccess;
    }

    /** @param isSuccess the new success flag */
    public void setSuccess(boolean isSuccess) {
        this.isSuccess = isSuccess;
    }

    /** @return the stored page content, possibly {@code null} */
    public String getPageContent() {
        return this.pageContent;
    }

    /** @param pageContent the page content to store */
    public void setPageContent(String pageContent) {
        this.pageContent = pageContent;
    }

    /** @return the stored status code */
    public int getHttpStatuCode() {
        return this.httpStatuCode;
    }

    /** @param httpStatuCode the status code to store */
    public void setHttpStatuCode(int httpStatuCode) {
        this.httpStatuCode = httpStatuCode;
    }

    /**
     * Renders all three fields in the order status code, success flag, page content.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("CrawlResultPojo [httpStatuCode=");
        text.append(this.httpStatuCode);
        text.append(", isSuccess=").append(this.isSuccess);
        text.append(", pageContent=").append(this.pageContent);
        text.append("]");
        return text.toString();
    }
}
URL任务的POJO类
/**
 * Describes one URL task: the target URL, its task level and optional request parameters.
 *
 * @author tsj-pc
 */
public class UrlPojo {

    /** Target URL as text. */
    private String url;

    /** Task level; {@code TaskLevel.MIDDLE} unless a constructor or setter overrides it. */
    private TaskLevel taskLevel = TaskLevel.MIDDLE;

    /** Optional request parameters; {@code null} when none were supplied. */
    private Map<String, Object> parasMap;

    /**
     * Creates a task for the given URL with the default task level.
     *
     * @param url target URL
     */
    public UrlPojo(String url) {
        this.url = url;
    }

    /**
     * Creates a task for the given URL carrying request parameters.
     *
     * @param url      target URL
     * @param parasMap request parameters
     */
    public UrlPojo(String url, Map<String, Object> parasMap) {
        this(url);
        this.parasMap = parasMap;
    }

    /**
     * Creates a task for the given URL with an explicit task level.
     *
     * @param url       target URL
     * @param taskLevel task level to use instead of the default
     */
    public UrlPojo(String url, TaskLevel taskLevel) {
        this(url);
        this.taskLevel = taskLevel;
    }

    /** @return the request parameters, possibly {@code null} */
    public Map<String, Object> getParasMap() {
        return this.parasMap;
    }

    /** @param parasMap the request parameters to store */
    public void setParasMap(Map<String, Object> parasMap) {
        this.parasMap = parasMap;
    }

    /** @return the target URL */
    public String getUrl() {
        return this.url;
    }

    /** @param url the target URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the task level */
    public TaskLevel getTaskLevel() {
        return this.taskLevel;
    }

    /** @param taskLevel the task level to store */
    public void setTaskLevel(TaskLevel taskLevel) {
        this.taskLevel = taskLevel;
    }

    /**
     * Opens a connection object for the stored URL.
     *
     * @return the connection when it is an {@link HttpURLConnection}; {@code null} when the
     *         URL cannot be parsed or opened, or when the connection is of another kind
     *         (the failure's stack trace is printed in those cases)
     */
    public HttpURLConnection getConnection() {
        try {
            URLConnection opened = new URL(this.url).openConnection();
            if (!(opened instanceof HttpURLConnection)) {
                // Raised only to be reported by the catch below, as in the other failure paths.
                throw new Exception("connection is errr!");
            }
            return (HttpURLConnection) opened;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Extracts the host part of the stored URL.
     *
     * @return the host, or {@code null} when the URL cannot be parsed (the stack trace is printed)
     */
    public String getHost() {
        try {
            URL parsed = new URL(this.url);
            return parsed.getHost();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Renders the task level and URL; request parameters are not included. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("UrlPojo [taskLevel=");
        text.append(this.taskLevel);
        text.append(", url=").append(this.url);
        text.append("]");
        return text.toString();
    }
}
使用HttpClient4.3.5抓取数据
public class HttpClientCrawlerImpl implements ICrawler { //当CloseableHttpClient不再需要,并且不再连接管理的范围,需要调用CloseableHttpClient.close()方法将其关闭 public CloseableHttpClient httpclient = HttpClients.custom().build(); @Override public CrawlResultPojo crawl(UrlPojo urlPojo) { if (urlPojo == null) { return null; } CrawlResultPojo crawlResultPojo = new CrawlResultPojo(); CloseableHttpResponse response1 = null; BufferedReader br = null; try { HttpGet httpget = new HttpGet(urlPojo.getUrl()); response1 = httpclient.execute(httpget); HttpEntity entity = response1.getEntity(); InputStreamReader isr = new InputStreamReader(entity.getContent(), "utf-8"); br = new BufferedReader(isr); String line = null; StringBuilder stringBuilder = new StringBuilder(); while ((line = br.readLine()) != null) { stringBuilder.append(line + "\n"); } crawlResultPojo.setSuccess(true); crawlResultPojo.setPageContent(stringBuilder.toString()); return crawlResultPojo; } catch (Exception e) { e.printStackTrace(); crawlResultPojo.setSuccess(false); } finally { if (response1 != null) { try { response1.close(); } catch (IOException e1) { e1.printStackTrace(); } } if (br != null) { try { br.close(); } catch (IOException e1) { e1.printStackTrace(); } } } return crawlResultPojo; } /** * 传入加入参数post参数的url pojo */ public CrawlResultPojo crawl4Post(UrlPojo urlPojo) { if (urlPojo == null) { return null; } CrawlResultPojo crawlResultPojo = new CrawlResultPojo(); CloseableHttpResponse response1 = null; BufferedReader br = null; try { RequestBuilder rb = RequestBuilder.post().setUri( new URI(urlPojo.getUrl())); ; // .addParameter("IDToken1", // "username").addParameter("IDToken2", "password").build(); Map<String, Object> parasMap = urlPojo.getParasMap(); if (parasMap != null) { for (Entry<String, Object> entry : parasMap.entrySet()) { rb .addParameter(entry.getKey(), entry.getValue() .toString()); } } HttpUriRequest httpRequest = rb.build(); response1 = httpclient.execute(httpRequest); HttpEntity entity = response1.getEntity(); 
InputStreamReader isr = new InputStreamReader(entity.getContent(), "utf-8"); br = new BufferedReader(isr); String line = null; StringBuilder stringBuilder = new StringBuilder(); while ((line = br.readLine()) != null) { stringBuilder.append(line + "\n"); } crawlResultPojo.setSuccess(true); crawlResultPojo.setPageContent(stringBuilder.toString()); return crawlResultPojo; } catch (Exception e) { e.printStackTrace(); crawlResultPojo.setSuccess(false); } finally { if (response1 != null) { try { response1.close(); } catch (IOException e1) { e1.printStackTrace(); } } if (br != null) { try { br.close(); } catch (IOException e1) { e1.printStackTrace(); } } } return crawlResultPojo; } public static void main(String[] args) throws Exception { HttpClientCrawlerImpl httpClientCrawlerImpl = new HttpClientCrawlerImpl(); String url = "http://www.wangdaizhijia.com/front_select-plat"; UrlPojo urlPojo = new UrlPojo(url); Map<String, Object> parasMap = new HashMap<String, Object>(); int max_page_number = 1000; parasMap.put("currPage", 30); parasMap.put("params", ""); parasMap.put("sort", 0); urlPojo.setParasMap(parasMap); CrawlResultPojo resultPojo = httpClientCrawlerImpl.crawl4Post(urlPojo); if (resultPojo != null) { System.out.println(resultPojo); } }}
结果:
0 0
- 【网络爬虫】使用HttpClient4.3.5抓取数据
- 【 网络爬虫】java 使用Socket, HttpUrlConnection方式抓取数据
- java实现网络爬虫--抓取网站数据
- Jsoup实现网络爬虫抓取数据
- 【网络爬虫】HttpClient抓取+解析+存储数据
- 网络爬虫,使用NodeJs抓取RSS新闻
- 网络爬虫/数据抓取,反爬虫(更新版)
- 分布式爬虫:使用Scrapy抓取数据
- 使用爬虫抓取网站异步加载数据
- 分布式爬虫:使用Scrapy抓取数据
- Node.js学习之网络爬虫(使用cheerio抓取网页数据)
- 网络爬虫内容抓取
- 网络爬虫-视频抓取
- 基于HttpClient4.5实现网络爬虫
- 网页抓取/数据抽取/网络爬虫技术资料汇总
- 读书笔记--用Python写网络爬虫02--数据抓取
- java 开发用到网络爬虫,抓取某某网站数据经历
- 用python写网络爬虫读书笔记 第二章数据抓取
- CSS选择器
- ClassView标签页里不显示类信息或者显示不全的解决方案
- argc argv
- HDOJ 2001 计算两点间的距离
- 常见HTTP状态码
- 【网络爬虫】使用HttpClient4.3.5抓取数据
- OpenCV安装教程及错误:Unsupported gpu architecture 'compute_11' 解决方法
- SPOJ 962 IM
- reactnative Dimensions进行获取屏幕宽度和高度信息
- 【JAVA-基础】—类成员单例(Singleton)类
- 学习TP框架(二)
- SSH学习笔记
- HZAU校赛补题PA(动态规划)
- Java异常处理、异常处理使用的一些注意点(例如,基类和子类捕获的顺序问题)