Httpclient写爬虫
来源:互联网 发布:手机淘宝怎样创桌面 编辑:程序博客网 时间:2024/06/06 06:59
第1部分 了解爬虫
1.1 什么是爬虫
简单通俗的理解,就是通过Http请求模拟用户在浏览器操作行为的代码。
1.2 爬虫能做什么
常用于抓数据,通过一系列的http请求,将别人网站的内容抓到自己的数据库中。
1.3 爬虫的应用场景
我也是刚开始使用,具体的应用场景大家可以去别处找一找。
第2部分 基础知识准备
第2部分
2.1 什么是http
对于这个问题,大家觉得可能很搞笑,这么简单的问题谁不知道。然而,我更相信绝大多数人都是一知半解。
记得第一次找工作时就有考官问,http与https的区别,还有如何跟踪一个会话。当场就懵掉了,时至今日我也不能全部说得明白。
跟着前辈们学学吧:http://www.cnblogs.com/rayray/p/3729533.html。
2.2 Session与cookie
上班坐地铁需要一个多小时的时间,闲来无事看看这篇博客: http://blog.csdn.net/fangaoxin/article/details/6952954,感觉头脑清晰了不少,感谢这些爱写博客又这么严谨的前辈们,我们都应当向他们看齐。
第3部分 环境准备
第3部分
3.1 抓包工具fiddler4
网址:https://www.telerik.com/download/fiddler/fiddler4
安装:一路下一步即可,最好放在英文路径下。
3.2 使用fiddler4
1. 我们使用fiddler进行抓包,打开chrome浏览器的隐身模式输入电信官网。可以发现fiddler已经捕获到了我们的操作行为,而我们的浏览器行为就是一个个的http请求。
2. Ok,我们打开一个请求,查看其详细内容,发现每一点都记录得非常清晰。而想写爬虫程序,这些东西都要搞明白。
例如:Headers(错误大量在这里)
Cookie(如果涉及到权限部分,这里必须注意)
http响应码,也是至关重要的,它决定了一个请求的访问链路,如遇到302必须自己编写重定向过程。
3. 我们再看一个登陆的过程,我们可以查看登陆时需要提交的表单内容
4. 那么接下来爬虫的流程便清晰了,首先找到url,之后报文头,最后整理表单数据即可。
第4部分 Demo
Httpclient工具类:
import com.hbc.api.exception.SimpleException;import org.apache.http.Header;import org.apache.http.HttpResponse;import org.apache.http.HttpStatus;import org.apache.http.NameValuePair;import org.apache.http.client.HttpClient;import org.apache.http.client.entity.UrlEncodedFormEntity;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.methods.HttpPost;import org.apache.http.client.protocol.HttpClientContext;import org.apache.http.entity.StringEntity;import org.apache.http.util.EntityUtils;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import java.util.List;/** * 简单的eHttpClient工具类 * * Created by zc on 2017/6/27. */public class SimpleHttpClientUtil { private Logger logger = LoggerFactory.getLogger(getClass()) ; private HttpClient httpClient ; private HttpClientContext httpClientContext ; /** * 无参空构造 */ public SimpleHttpClientUtil(){} /** * 含参构造 * @param client httpClient * @param context HttpClientContext */ public SimpleHttpClientUtil(HttpClient client, HttpClientContext context){ this.httpClient = httpClient ; this.httpClientContext = httpClientContext ; } /** * 重定向 * @param response 重定向上游HttpResponse * @param client HttpClient * @param context HttpClientContext * @return HttpResponse */ public HttpResponse redirectResp(HttpResponse response, HttpClient client, HttpClientContext context){ Header header=response.getFirstHeader("Location") ; return httpGetResp(client,context,header.getValue()) ; } /** * 重定向 * @param response 重定向上游HttpResponse * @param client HttpClient * @param context HttpClientContext * @return 响应码 */ public int redirect(HttpResponse response, HttpClient client, HttpClientContext context){ Header header=response.getFirstHeader("Location") ; return httpGet(client,context,header.getValue()) ; } public int redirect(HttpResponse response){ return redirect(response,this.httpClient,this.httpClientContext) ; } /** * GET 方式访问 * * @param client HttpClient * @param context HttpClientContext * @param url url * @return 响应文本内容 */ 
public String httpGetRespTxt(HttpClient client, HttpClientContext context,String url){ String webTxt ; HttpGet httpGet = new HttpGet(url) ; HttpResponse response ; httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240") ;//逃避反爬虫 try{ response = client.execute(httpGet,context) ; webTxt = EntityUtils.toString(response.getEntity()) ; }catch (Exception e){ logger.error("GET 方式访问异常:"+url,e) ; throw new SimpleException(2,"Http响应:"+e.getMessage()) ; }finally { httpGet.abort() ; } return webTxt ; } /** * GET 方式访问 * * @param client HttpClient * @param context HttpClientContext * @param url url * @return HttpResponse */ public HttpResponse httpGetResp(HttpClient client, HttpClientContext context,String url){ HttpGet httpGet = new HttpGet(url) ; HttpResponse response ; httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240") ;//逃避反爬虫 try{ response = client.execute(httpGet,context) ; }catch (Exception e){ logger.error("GET 方式访问异常:"+url,e) ; throw new SimpleException(2,"Http响应:"+e.getMessage()) ; }finally { httpGet.abort() ; } return response ; } /** * GET 方式访问 * * @param client HttpClient * @param context HttpClientContext * @param url url * @return 响应码 */ public int httpGet(HttpClient client, HttpClientContext context,String url){ return httpGetResp(client,context,url).getStatusLine().getStatusCode() ; } /** * GET 方式访问 * @param url url * @return 响应码 */ public int httpGet(String url) { return httpGet(this.httpClient,this.httpClientContext,url) ; } /** * post json请求 * @param client HttpClient * @param context HttpClientContext * @param url url * @param json json * @return json */ public String httpPostJson(HttpClient client, HttpClientContext context,String url,String json){ HttpPost httpPost = new HttpPost(url); httpPost.addHeader("User-Agent", "Mozilla/5.0 
(Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36") ;//逃避反爬虫 String resultJson="" ; HttpResponse httpResponse = null ; try { StringEntity se = new StringEntity(json); se.setContentEncoding("UTF-8"); se.setContentType("application/json");//发送json数据需要设置contentType httpPost.setEntity(se); httpResponse = client.execute(httpPost); if(httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK){ resultJson= EntityUtils.toString(httpResponse.getEntity());// 返回json格式: } } catch (Exception e) { System.out.println("httpPostJson:"+httpResponse.getStatusLine()); logger.error("POST 方式访问异常:"+url+" httpPostJson:"+httpResponse.getStatusLine(),e) ; e.printStackTrace(); throw new RuntimeException(e); } return resultJson ; } /** * POST 方式访问 * @param client HttpClient * @param context HttpClientContext * @param url url * @param nvp List<NameValuePair> * @return 响应文本内容 */ public String httpPostRespTxt(HttpClient client, HttpClientContext context,String url,List<NameValuePair> nvp){ String webTxt; HttpPost httpPost = new HttpPost(url) ; httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36") ;//逃避反爬虫 HttpResponse response ; try{ httpPost.setEntity(new UrlEncodedFormEntity(nvp,"UTF-8")) ; response = client.execute(httpPost,context) ; webTxt = EntityUtils.toString(response.getEntity()) ; }catch (Exception e){ logger.error("POST 方式访问异常:"+url,e) ; throw new SimpleException(2,"Http响应:"+e.getMessage()) ; }finally { httpPost.abort() ; } return webTxt ; } /** * POST 方式访问 * @param client HttpClient * @param context HttpClientContext * @param url url * @param nvp List<NameValuePair> * @return HttpResponse */ public HttpResponse httpPostResp(HttpClient client, HttpClientContext context,String url,List<NameValuePair> nvp){ HttpPost httpPost = new HttpPost(url) ; httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36") ;//逃避反爬虫 HttpResponse response ; try{ httpPost.setEntity(new UrlEncodedFormEntity(nvp,"UTF-8")) ; response = client.execute(httpPost,context) ; }catch (Exception e){ logger.error("POST 方式访问异常:"+url,e) ; throw new SimpleException(2,"Http响应:"+e.getMessage()) ; }finally { httpPost.abort() ; } return response ; } /** * POST 方式访问 * @param client HttpClient * @param context HttpClientContext * @param url url * @param nvp List<NameValuePair> * @return 响应码 */ public int httpPost(HttpClient client, HttpClientContext context,String url,List<NameValuePair> nvp){ return httpPostResp(client,context,url,nvp).getStatusLine().getStatusCode() ; } /** * POST 方式访问 * @param url url * @param nvp List<NameValuePair> * @return 响应码 */ public int httpPost(String url,List<NameValuePair> nvp){ return httpPost(this.httpClient,this.httpClientContext,url,nvp) ; }}
登陆:
private final SimpleHttpClientUtil simpleHttpClientUtil = new SimpleHttpClientUtil() ; /** * 登陆电信:<br/> * 电信登陆页面是统一的<a href="http://login.189.cn/login"> http://login.189.cn/login</a>, * 登陆完成后重定向到各省二级页面http://www.189.cn/省名简写/<br/> * @param httpClient httpClient * @param httpClientContext httpClientContext * @param provinceID 省、自治区、直辖市编号 * @param cityCode 省、自治区、直辖市英文简写 * @param mobile 手机号码 * @param pwd 服务密码 * @return HttpResponse */ final boolean loginDx(HttpClient httpClient,HttpClientContext httpClientContext,String provinceID,String cityCode,String mobile,String pwd){ //提交表单 HttpResponse response = submitLoginForm(httpClient,httpClientContext,provinceID,mobile,pwd) ; //登陆重定向 boolean redirectLoginFlag = simpleRedirect(response,httpClient,httpClientContext) ; //重定向 到该省二级域名 boolean redirectToProvinceFlag = redirectToProvince(httpClient,httpClientContext,cityCode) ; return redirectLoginFlag && redirectToProvinceFlag ; } /** * 提交登陆表单<br/> * 登陆分三步:<br/> * 1.提交登陆表单<br/> * 2.单点登录重定向<br/> * 3.重定向到省份二级域名<br/><hr/> * @param httpClient httpClient * @param httpClientContext httpClientContext * @param provinceID 省、自治区、直辖市编号 * @param mobile 手机号码 * @param pwd 服务密码 * @return HttpResponse */ private HttpResponse submitLoginForm(HttpClient httpClient,HttpClientContext httpClientContext,String provinceID,String mobile,String pwd){ String url = "http://login.189.cn/login" ; List<NameValuePair> nvp = new ArrayList<>() ; nvp.add(new BasicNameValuePair("AreaCode", "")) ; nvp.add(new BasicNameValuePair("CityNo", "")) ; nvp.add(new BasicNameValuePair("Captcha", "")) ; nvp.add(new BasicNameValuePair("Account", mobile)) ; nvp.add(new BasicNameValuePair("UType", "201")) ; nvp.add(new BasicNameValuePair("ProvinceID", provinceID)) ; nvp.add(new BasicNameValuePair("RandomFlag", "0")) ; nvp.add(new BasicNameValuePair("Password", CryptoJsUtil.getInstance().encryptByAes(pwd))) ; return this.simpleHttpClientUtil.httpPostResp(httpClient,httpClientContext,url,nvp) ; } /** * 简单重定向 * @param response 
重定向上游HttpResponse * @param httpClient HttpClient * @param httpClientContext HttpClientContext * @return 重定向是否成功 */ private boolean simpleRedirect(HttpResponse response,HttpClient httpClient,HttpClientContext httpClientContext){ return this.simpleHttpClientUtil.redirect(response,httpClient,httpClientContext)==200 ; } /** * 重定向 到 http://www.189.cn/省份 * @param httpClient httpClient * @param httpClientContext httpClientContext * @param cityCode 省、自治区、直辖市英文简写 * @return 重定向结果 */ private boolean redirectToProvince(HttpClient httpClient,HttpClientContext httpClientContext,String cityCode){ String url = "http://www.189.cn/"+cityCode ; return this.simpleHttpClientUtil.httpGet(httpClient,httpClientContext,url)==200 ; }
其余的就好弄了,需要注意的是,如果操作分两次请求,如在获取详单时发送短信验证码和校验短信验证码时,最简单的解决办法就是将CookieStore进行缓存。
前一步操作时进行缓存:
redisUtil.set("cookie_"+mobile,httpClientContext.getCookieStore(),60*2L) ;
下一步提取:
HttpClientBuilder builder = HttpClients.custom() ; HttpClient httpClient = builder.build() ;//创建client实例 HttpClientContext httpClientContext = HttpClientContext.create() ;//创建一个上下文,cookie会自动跟踪 CookieStore cookieStore = (CookieStore)redisUtil.get("cookie_"+mobile) ; httpClientContext.setCookieStore(cookieStore) ;
这样我们就可以轻松搞定cookie问题。
- Httpclient写爬虫
- HttpClient爬虫
- httpClient download file(爬虫)
- HttpClient简单爬虫
- Httpclient-4.3.6 爬虫
- java HttpClient 爬虫
- HttpClient +JSOUP 代理 爬虫
- 基于HttpClient 多线程爬虫实践
- HttpClient使用详解 网络爬虫
- httpclient使用详解(爬虫)
- 用HttpClient实现网络爬虫
- httpclient使用详解(爬虫)
- httpclient使用详解(爬虫)
- Ending、网络爬虫-HttpClient系列
- HttpClient基础知识(java爬虫03)
- HttpClient,HttpParser实现简易爬虫
- httpclient写的话
- 写网络爬虫初探
- redis 报错 Redis protected-mode 配置文件没有真正启动
- 【重要】XSLT学习(九)通过JavaScript转化xml
- Python Word Count
- 【JZOJ 1319】邮递员(欧拉回路)
- 机器学习笔记一:线性回归
- Httpclient写爬虫
- XSLT学习(十)服务器上跨浏览器解决方案(解决上一章第九章的遗留问题)
- 【bzoj2506】calc
- 查看和调试Qt源码
- 机构自由度的计算例子
- 次短路 poj3255 Roadblocks
- 有关于环回接口
- MySQL大表优化方案
- 【LeetCode解题】4#Median of Two Sorted Arrays