Httpclient写爬虫

来源:互联网 发布:手机淘宝怎样创桌面 编辑:程序博客网 时间:2024/06/06 06:59

第1部分           了解爬虫

1.1     什么是爬虫

简单通俗的理解,就是通过Http请求模拟用户在浏览器操作行为的代码。

1.2     爬虫能做什么

常用于抓数据,通过一系列的http请求,将别人网站的内容抓到自己的数据库中。

1.3     爬虫的应用场景

我也是刚开始使用,具体的应用场景大家可以去别处找一找。

第2部分           基础知识准备

第2部分      

2.1     什么是http

对于这个问题,大家觉得可能很搞笑,这么简单的问题谁不知道。然而,我更相信绝大多数人都是一知半解。

记得第一次找工作时就有考官问,http与https的区别,还有如何跟踪一个会话。当场就懵掉了,时至今日我也不能全部说得明白。

跟着前辈们学学吧:http://www.cnblogs.com/rayray/p/3729533.html。

2.2     Session与cookie

上班坐地铁需要一个多小时的时间,闲来无事看看这篇博客: http://blog.csdn.net/fangaoxin/article/details/6952954,感觉头脑清晰了不少,感谢这些爱写博客又这么严谨的前辈们,我们都应当向他们看齐。

第3部分           环境准备

第3部分      

3.1     抓包工具fiddler4

网址:https://www.telerik.com/download/fiddler/fiddler4

安装:一路下一步即可,最好放在英文路径下。

3.2     使用fiddler4

1.      我们使用fiddler进行抓包,打开chrome浏览器的隐身模式输入电信官网。可以发现fiddler已经捕获到了我们的操作行为,而我们的浏览器行为就是一个个的http请求。

2.      Ok,我们打开一个请求,查看其详细内容,发现每一点都记录得非常清晰。而想写爬虫程序,这些东西都要搞明白。

例如:Headers(错误大量在这里)

      Cookie(如果涉及到权限部分,这里必须注意)

             http响应码,也是至关重要的,它决定了一个请求的访问链路,如302必须自己写重定向过程。

3.      我们再看一个登陆的过程,我们可以查看登陆时需要提交的表单内容

4.      那么接下来爬虫的流程便清晰了,首先找到url,之后报文头,最后整理表单数据即可。

第4部分           Demo

Httpclient工具类:

import com.hbc.api.exception.SimpleException;import org.apache.http.Header;import org.apache.http.HttpResponse;import org.apache.http.HttpStatus;import org.apache.http.NameValuePair;import org.apache.http.client.HttpClient;import org.apache.http.client.entity.UrlEncodedFormEntity;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.methods.HttpPost;import org.apache.http.client.protocol.HttpClientContext;import org.apache.http.entity.StringEntity;import org.apache.http.util.EntityUtils;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import java.util.List;/** * 简单的eHttpClient工具类 * * Created by zc on 2017/6/27. */public class SimpleHttpClientUtil {    private Logger logger = LoggerFactory.getLogger(getClass()) ;    private HttpClient httpClient ;    private HttpClientContext httpClientContext ;    /**     * 无参空构造     */    public SimpleHttpClientUtil(){}    /**     * 含参构造     * @param client httpClient     * @param context HttpClientContext     */    public SimpleHttpClientUtil(HttpClient client, HttpClientContext context){        this.httpClient = httpClient ;        this.httpClientContext = httpClientContext ;    }    /**     * 重定向     * @param response 重定向上游HttpResponse     * @param client HttpClient     * @param context HttpClientContext     * @return HttpResponse     */    public HttpResponse redirectResp(HttpResponse response, HttpClient client, HttpClientContext context){        Header header=response.getFirstHeader("Location") ;        return httpGetResp(client,context,header.getValue()) ;    }    /**     * 重定向     * @param response 重定向上游HttpResponse     * @param client HttpClient     * @param context HttpClientContext     * @return 响应码     */    public int redirect(HttpResponse response, HttpClient client, HttpClientContext context){        Header header=response.getFirstHeader("Location") ;        return httpGet(client,context,header.getValue()) ;    }    public int redirect(HttpResponse response){        return 
redirect(response,this.httpClient,this.httpClientContext) ;    }    /**     * GET 方式访问     *     * @param client HttpClient     * @param context HttpClientContext     * @param url url     * @return 响应文本内容     */    public String httpGetRespTxt(HttpClient client, HttpClientContext context,String url){        String webTxt ;        HttpGet httpGet = new HttpGet(url) ;        HttpResponse response ;        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240") ;//逃避反爬虫        try{            response = client.execute(httpGet,context) ;            webTxt = EntityUtils.toString(response.getEntity()) ;        }catch (Exception e){            logger.error("GET 方式访问异常:"+url,e) ;            throw new SimpleException(2,"Http响应:"+e.getMessage()) ;        }finally {            httpGet.abort() ;        }        return webTxt ;    }    /**     * GET 方式访问     *     * @param client HttpClient     * @param context HttpClientContext     * @param url url     * @return HttpResponse     */    public HttpResponse httpGetResp(HttpClient client, HttpClientContext context,String url){        HttpGet httpGet = new HttpGet(url) ;        HttpResponse response ;        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240") ;//逃避反爬虫        try{            response = client.execute(httpGet,context) ;        }catch (Exception e){            logger.error("GET 方式访问异常:"+url,e) ;            throw new SimpleException(2,"Http响应:"+e.getMessage()) ;        }finally {            httpGet.abort() ;        }        return response ;    }    /**     * GET 方式访问     *     * @param client HttpClient     * @param context HttpClientContext     * @param url url     * @return 响应码     */    public int httpGet(HttpClient client, HttpClientContext context,String url){        return 
httpGetResp(client,context,url).getStatusLine().getStatusCode() ;    }    /**     * GET 方式访问     * @param url url     * @return 响应码     */    public int httpGet(String url) {        return httpGet(this.httpClient,this.httpClientContext,url) ;    }    /**     * post json请求     * @param client HttpClient     * @param context HttpClientContext     * @param url url     * @param json json     * @return json     */    public String httpPostJson(HttpClient client, HttpClientContext context,String url,String json){        HttpPost httpPost = new HttpPost(url);        httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36") ;//逃避反爬虫        String resultJson="" ;        HttpResponse httpResponse = null ;        try {            StringEntity se = new StringEntity(json);            se.setContentEncoding("UTF-8");            se.setContentType("application/json");//发送json数据需要设置contentType            httpPost.setEntity(se);            httpResponse = client.execute(httpPost);            if(httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK){                resultJson= EntityUtils.toString(httpResponse.getEntity());// 返回json格式:            }        } catch (Exception e) {            System.out.println("httpPostJson:"+httpResponse.getStatusLine());            logger.error("POST 方式访问异常:"+url+" httpPostJson:"+httpResponse.getStatusLine(),e) ;            e.printStackTrace();            throw new RuntimeException(e);        }        return resultJson ;    }    /**     * POST 方式访问     * @param client HttpClient     * @param context HttpClientContext     * @param url url     * @param nvp List<NameValuePair>     * @return 响应文本内容     */    public String httpPostRespTxt(HttpClient client, HttpClientContext context,String url,List<NameValuePair> nvp){        String webTxt;        HttpPost httpPost = new HttpPost(url) ;        httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; 
WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36") ;//逃避反爬虫        HttpResponse response ;        try{            httpPost.setEntity(new UrlEncodedFormEntity(nvp,"UTF-8")) ;            response = client.execute(httpPost,context) ;            webTxt = EntityUtils.toString(response.getEntity()) ;        }catch (Exception e){            logger.error("POST 方式访问异常:"+url,e) ;            throw new SimpleException(2,"Http响应:"+e.getMessage()) ;        }finally {            httpPost.abort() ;        }        return webTxt ;    }    /**     * POST 方式访问     * @param client HttpClient     * @param context HttpClientContext     * @param url url     * @param nvp List<NameValuePair>     * @return HttpResponse     */    public HttpResponse httpPostResp(HttpClient client, HttpClientContext context,String url,List<NameValuePair> nvp){        HttpPost httpPost = new HttpPost(url) ;        httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36") ;//逃避反爬虫        HttpResponse response ;        try{            httpPost.setEntity(new UrlEncodedFormEntity(nvp,"UTF-8")) ;            response = client.execute(httpPost,context) ;        }catch (Exception e){            logger.error("POST 方式访问异常:"+url,e) ;            throw new SimpleException(2,"Http响应:"+e.getMessage()) ;        }finally {            httpPost.abort() ;        }        return response ;    }    /**     * POST 方式访问     * @param client HttpClient     * @param context HttpClientContext     * @param url url     * @param nvp List<NameValuePair>     * @return 响应码     */    public int httpPost(HttpClient client, HttpClientContext context,String url,List<NameValuePair> nvp){        return httpPostResp(client,context,url,nvp).getStatusLine().getStatusCode() ;    }    /**     * POST 方式访问     * @param url url     * @param nvp List<NameValuePair>     * @return 响应码     */    public int httpPost(String 
url,List<NameValuePair> nvp){        return httpPost(this.httpClient,this.httpClientContext,url,nvp) ;    }}

登陆:

private final SimpleHttpClientUtil simpleHttpClientUtil = new SimpleHttpClientUtil() ;    /**     * 登陆电信:<br/>     *  电信登陆页面是统一的<a href="http://login.189.cn/login"> http://login.189.cn/login</a>,     * 登陆完成后重定向到各省二级页面http://www.189.cn/省名简写/<br/>     * @param httpClient httpClient     * @param httpClientContext httpClientContext     * @param provinceID 省、自治区、直辖市编号     * @param cityCode 省、自治区、直辖市英文简写     * @param mobile 手机号码     * @param pwd 服务密码     * @return HttpResponse     */    final boolean loginDx(HttpClient httpClient,HttpClientContext httpClientContext,String provinceID,String cityCode,String mobile,String pwd){        //提交表单        HttpResponse response = submitLoginForm(httpClient,httpClientContext,provinceID,mobile,pwd) ;        //登陆重定向        boolean redirectLoginFlag = simpleRedirect(response,httpClient,httpClientContext) ;        //重定向 到该省二级域名        boolean redirectToProvinceFlag = redirectToProvince(httpClient,httpClientContext,cityCode) ;        return redirectLoginFlag && redirectToProvinceFlag ;    }    /**     * 提交登陆表单<br/>     * 登陆分三步:<br/>     *  1.提交登陆表单<br/>     *  2.单点登录重定向<br/>     *  3.重定向到省份二级域名<br/><hr/>     * @param httpClient httpClient     * @param httpClientContext httpClientContext     * @param provinceID 省、自治区、直辖市编号     * @param mobile 手机号码     * @param pwd 服务密码     * @return HttpResponse     */    private HttpResponse submitLoginForm(HttpClient httpClient,HttpClientContext httpClientContext,String provinceID,String mobile,String pwd){        String url = "http://login.189.cn/login" ;        List<NameValuePair> nvp = new ArrayList<>() ;        nvp.add(new BasicNameValuePair("AreaCode", "")) ;        nvp.add(new BasicNameValuePair("CityNo", "")) ;        nvp.add(new BasicNameValuePair("Captcha", "")) ;        nvp.add(new BasicNameValuePair("Account", mobile)) ;        nvp.add(new BasicNameValuePair("UType", "201")) ;        nvp.add(new BasicNameValuePair("ProvinceID", provinceID)) ;        nvp.add(new 
BasicNameValuePair("RandomFlag", "0")) ;        nvp.add(new BasicNameValuePair("Password", CryptoJsUtil.getInstance().encryptByAes(pwd))) ;        return this.simpleHttpClientUtil.httpPostResp(httpClient,httpClientContext,url,nvp) ;    }    /**     * 简单重定向     * @param response 重定向上游HttpResponse     * @param httpClient HttpClient     * @param httpClientContext HttpClientContext     * @return 重定向是否成功     */    private boolean simpleRedirect(HttpResponse response,HttpClient httpClient,HttpClientContext httpClientContext){        return this.simpleHttpClientUtil.redirect(response,httpClient,httpClientContext)==200 ;    }    /**     * 重定向 到 http://www.189.cn/省份     * @param httpClient httpClient     * @param httpClientContext httpClientContext     * @param cityCode 省、自治区、直辖市英文简写     * @return 重定向结果     */    private boolean redirectToProvince(HttpClient httpClient,HttpClientContext httpClientContext,String cityCode){        String url = "http://www.189.cn/"+cityCode ;        return this.simpleHttpClientUtil.httpGet(httpClient,httpClientContext,url)==200 ;    }


其余的就好弄了,需要注意的是,如果操作分两次请求,如在获取详单时发送短信验证码和校验短信验证码时,最简单的解决办法就是将CookieStore进行缓存。

前一步操作时进行缓存:

// Cache the cookie store of the current context under "cookie_" + mobile so a later
// request can restore the same session. NOTE(review): the third argument (60*2L) is
// presumably an expiry in seconds — confirm against redisUtil.set.
redisUtil.set("cookie_"+mobile,httpClientContext.getCookieStore(),60*2L) ;

下一步提取:

HttpClientBuilder builder = HttpClients.custom() ;        HttpClient httpClient = builder.build() ;//创建client实例        HttpClientContext httpClientContext = HttpClientContext.create() ;//创建一个上下文,cookie会自动跟踪        CookieStore cookieStore = (CookieStore)redisUtil.get("cookie_"+mobile) ;        httpClientContext.setCookieStore(cookieStore) ;

这样我们就可以轻松搞定cookie问题。


未完待续。。。。



原创粉丝点击