[Crawler] Simulating Requests
Requirement
We want to crawl data from certain news pages on People's Daily Online (people.com.cn). How do we go about it?
The first step is to take a URL, simulate an HTTP request against it, and fetch the page content it returns.
Simulating the request

The utility class below wraps Apache HttpClient 4.x and exposes a post() and a get() method. Both bypass HTTPS certificate verification, so pages served over https with self-signed or otherwise unverifiable certificates can still be fetched.
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

public class HttpClientUtil {

    // Connect/socket timeout in milliseconds (30 * 10000 = 300,000 ms, i.e. 5 minutes).
    static final int timeOut = 30 * 10000;

    /**
     * Simulates a POST request; https is supported.
     *
     * @param url      resource address
     * @param map      form parameters
     * @param encoding page encoding
     * @return the response body as a String
     */
    public static String post(String url, Map<String, String> map, String encoding)
            throws KeyManagementException, NoSuchAlgorithmException, ClientProtocolException, IOException {
        String results = "";
        // Handle https by skipping certificate verification (helper class, see the sketch below).
        SSLContext sslcontext = SSLClient.createIgnoreVerifySSL();
        // Hostname verifier that accepts every host.
        HostnameVerifier hv = new HostnameVerifier() {
            public boolean verify(String urlHostName, SSLSession session) {
                System.out.println("Warning: URL Host: " + urlHostName + " vs. " + session.getPeerHost());
                return true;
            }
        };
        // Register socket factories for the http and https schemes.
        LayeredConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslcontext, hv);
        Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.INSTANCE)
                .register("https", sslsf)
                .build();
        PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
        // Global cookie policy: ignore cookies.
        RequestConfig config = RequestConfig.custom().setCookieSpec(CookieSpecs.IGNORE_COOKIES).build();
        // Build the customised httpclient.
        CloseableHttpClient client = HttpClients.custom()
                .setConnectionManager(connManager)
                .setDefaultRequestConfig(config)
                .build();
        // Create the POST request and set the connect/socket timeouts (milliseconds).
        HttpPost httpPost = new HttpPost(url);
        RequestConfig requestConfig = RequestConfig.custom()
                .setSocketTimeout(timeOut)
                .setConnectTimeout(timeOut)
                .build();
        httpPost.setConfig(requestConfig);
        // Fill in the form parameters.
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        if (map != null) {
            for (Entry<String, String> entry : map.entrySet()) {
                nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        httpPost.setEntity(new UrlEncodedFormEntity(nvps, encoding));
        System.out.println("Request URL: " + url);
        System.out.println("Request parameters: " + nvps.toString());
        // Set the Content-type and User-Agent headers.
        httpPost.setHeader("Content-type", "application/x-www-form-urlencoded");
        httpPost.setHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // Execute the request (synchronous, blocking).
        CloseableHttpResponse response = client.execute(httpPost);
        if (response.getStatusLine().getStatusCode() != 200) {
            // Non-200 response: wait three seconds and retry once.
            try {
                Thread.sleep(3000);
                response = client.execute(httpPost);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        // Read the response entity as a String in the given encoding.
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            results = EntityUtils.toString(entity, encoding);
        }
        EntityUtils.consume(entity);
        // Release the connection.
        response.close();
        return results;
    }

    /**
     * Simulates a GET request; https is supported.
     *
     * @param url    resource address
     * @param params optional varargs; the first element, if present, is the page encoding
     * @return the response body as a String
     */
    public static String get(String url, String... params) {
        String results = "";
        try {
            String encoding = CommonVariable.encoding_default;
            if (params.length > 0) {
                encoding = params[0];
            }
            // Handle https by skipping certificate verification (helper class, see the sketch below).
            SSLContext sslcontext = SSLClient.createIgnoreVerifySSL();
            // Hostname verifier that accepts every host.
            HostnameVerifier hv = new HostnameVerifier() {
                public boolean verify(String urlHostName, SSLSession session) {
                    return true;
                }
            };
            // Register socket factories for the http and https schemes.
            LayeredConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslcontext, hv);
            Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
                    .register("http", PlainConnectionSocketFactory.INSTANCE)
                    .register("https", sslsf)
                    .build();
            PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
            // Global cookie policy: ignore cookies.
            RequestConfig config = RequestConfig.custom().setCookieSpec(CookieSpecs.IGNORE_COOKIES).build();
            CloseableHttpClient client = HttpClients.custom()
                    .setConnectionManager(connManager)
                    .setDefaultRequestConfig(config)
                    .build();
            // Create the GET request and set the connect/socket timeouts (milliseconds).
            HttpGet httpGet = new HttpGet(url);
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(timeOut)
                    .setConnectTimeout(timeOut)
                    .build();
            httpGet.setConfig(requestConfig);
            // Set the Content-type and User-Agent headers.
            httpGet.setHeader("Content-type", "application/x-www-form-urlencoded");
            httpGet.setHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            // Execute the request (synchronous, blocking).
            CloseableHttpResponse response = client.execute(httpGet);
            // Read the response entity as a String in the given encoding.
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                results = EntityUtils.toString(entity, encoding);
            }
            EntityUtils.consume(entity);
            // Release the connection.
            response.close();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return results;
    }
}
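The utility above references two helper classes that the post does not show: SSLClient, whose createIgnoreVerifySSL() is expected to return an SSLContext that skips certificate checks, and CommonVariable, which supplies a default page encoding. The sketch below is one plausible implementation inferred from how they are used above, not the author's original code.

import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;

import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

// Sketch of the SSLClient helper referenced by HttpClientUtil: builds an
// SSLContext whose trust manager accepts every certificate, so https sites
// with self-signed certificates can still be crawled.
public class SSLClient {

    public static SSLContext createIgnoreVerifySSL()
            throws NoSuchAlgorithmException, KeyManagementException {
        SSLContext context = SSLContext.getInstance("TLS");
        // Trust manager that performs no certificate checks at all.
        TrustManager trustAll = new X509TrustManager() {
            public void checkClientTrusted(X509Certificate[] chain, String authType) { }
            public void checkServerTrusted(X509Certificate[] chain, String authType) { }
            public X509Certificate[] getAcceptedIssuers() { return null; }
        };
        context.init(null, new TrustManager[] { trustAll }, null);
        return context;
    }
}

// Sketch of the CommonVariable helper: only the default encoding is needed here.
class CommonVariable {
    public static final String encoding_default = "UTF-8";
}

With those in place, a minimal call looks like this (the URL is only a placeholder for whichever people.com.cn news page you actually want to crawl):

public class CrawlerDemo {
    public static void main(String[] args) {
        // Placeholder URL; pass the page's charset as a second argument
        // (e.g. "GBK") if it differs from the default.
        String html = HttpClientUtil.get("http://www.people.com.cn/");
        // Print the first 200 characters as a quick sanity check.
        System.out.println(html.substring(0, Math.min(200, html.length())));
    }
}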
Of course, this step can also be handled by any of several mature libraries, so you do not have to write the utility class yourself; see the sketch below.
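For example, with Jsoup (the original post does not name a particular library, so this is just one possible choice) the same fetch collapses to a few lines:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupFetchDemo {
    public static void main(String[] args) throws Exception {
        // Placeholder URL; Jsoup handles the connection, charset detection and HTML parsing.
        Document doc = Jsoup.connect("http://www.people.com.cn/")
                .userAgent("Mozilla/5.0")
                .timeout(30_000)   // milliseconds
                .get();
        System.out.println(doc.title());
    }
}

A nice side effect is that Jsoup already parses the page into a DOM, which is convenient for the content-extraction step discussed next.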
The next section covers how to extract the specific content we want from the fetched page.