HttpClient在java中的使用

来源：互联网发布：网络爱国事例编辑：程序博客网时间：2024/06/05 00:09

注：对下述文档的代码部分做了更新2016.03.03

项目中一直在使用HttpClient，版本是3.6的，负责维护的同事离职后就没有再更新过。在这次项目改版中，决定把这一块升级到4.3版本，查阅了一些资料后写了一部分出来，还不是很完善，后期有时间再更新。

这里对httpclient如何获取原页面编码进行了一些处理，感觉还不是很完善，后期想到了再处理吧。

具体的理论东西就不说了，代码中注释都有一些说明。

直接上代码

[java] view plain copy
/** 
  * 项目名：NewsCrawlerV3.1 
  * 文件名：InfoLoad.java 
  * 作者：zhouyh 
  * 时间：2015-10-14 上午08:46:47 
  * 描述：TODO(用一句话描述该文件做什么)  
  */  
package com.boryou.module.load;  
  
import java.io.IOException;  
import java.io.InputStream;  
import java.io.InterruptedIOException;  
import java.net.MalformedURLException;  
import java.net.URI;  
import java.net.URISyntaxException;  
import java.net.URL;  
import java.net.UnknownHostException;  
import java.nio.charset.Charset;  
import java.security.KeyManagementException;  
import java.security.KeyStoreException;  
import java.security.NoSuchAlgorithmException;  
import java.security.cert.CertificateException;  
import java.security.cert.X509Certificate;  
import java.util.Random;  
import java.util.regex.Matcher;  
import java.util.regex.Pattern;  
import java.util.zip.GZIPInputStream;  
  
import javax.net.ssl.SSLContext;  
import javax.net.ssl.SSLException;  
  
import org.apache.http.Header;  
import org.apache.http.HeaderElement;  
import org.apache.http.HttpEntity;  
import org.apache.http.HttpEntityEnclosingRequest;  
import org.apache.http.HttpHost;  
import org.apache.http.HttpRequest;  
import org.apache.http.client.ClientProtocolException;  
import org.apache.http.client.HttpRequestRetryHandler;  
import org.apache.http.client.config.CookieSpecs;  
import org.apache.http.client.config.RequestConfig;  
import org.apache.http.client.methods.CloseableHttpResponse;  
import org.apache.http.client.methods.HttpGet;  
import org.apache.http.client.protocol.HttpClientContext;  
import org.apache.http.conn.ConnectTimeoutException;  
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;  
import org.apache.http.entity.ContentType;  
import org.apache.http.impl.client.CloseableHttpClient;  
import org.apache.http.impl.client.HttpClients;  
import org.apache.http.impl.client.LaxRedirectStrategy;  
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;  
import org.apache.http.protocol.HttpContext;  
import org.apache.http.ssl.SSLContextBuilder;  
import org.apache.http.ssl.TrustStrategy;  
import org.apache.http.util.ByteArrayBuffer;  
  
import com.boryou.constant.Constant;  
import com.boryou.util.Common;  
  
  
/** 
 * 类名： InfoLoad 
 * 包名： com.boryou.module.load 
 * 作者： zhouyh 
 * 时间： 2015-10-14 上午08:46:47 
 * 描述： 下载基础类，共监控、下载等模块使用  
 */  
public class InfoLoad {  
    // 创建httpclient连接池  
    private PoolingHttpClientConnectionManager httpClientConnectionManager = null;  
      
    /******单例模式声明开始******/  
    //类初始化时，自动实例化，饿汉单例模式  
    private static final InfoLoad infoLoad = new InfoLoad();  
    /** 
     *  
     * 方法名：getInfoLoadInstance 
     * 作者：zhouyh 
     * 创建时间：2015-10-14 上午08:59:54 
     * 描述：单例的静态方法，返回InfoLoad的实例 
     * @return 
     */  
    public static InfoLoad getInfoLoadInstance(){  
        return infoLoad;  
    }  
    /******单例模式声明结束******/  
    /** 
     * 私有的构造函数 
     */  
    private InfoLoad(){  
        //初始化httpClient  
        initHttpClient();  
    }  
    /** 
     *  
     * 方法名：initHttpClient 
     * 作者：zhouyh 
     * 创建时间：2015-10-14 上午11:00:30 
     * 描述：创建httpclient连接池，并初始化httpclient 
     */  
    public void initHttpClient(){  
        //创建httpclient连接池  
        httpClientConnectionManager = new PoolingHttpClientConnectionManager();  
        //设置连接池最大数量  
        httpClientConnectionManager.setMaxTotal(Constant.HTTPCLIENT_CONNECTION_COUNT);  
        //设置单个路由最大连接数量  
        httpClientConnectionManager.setDefaultMaxPerRoute(Constant.HTTPCLIENT_MAXPERROUTE_COUNT);  
    }  
    //请求重试机制  
    HttpRequestRetryHandler myRetryHandler = new HttpRequestRetryHandler() {  
        public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {  
            if (executionCount >= 3) {  
                // 超过三次则不再重试请求  
                return false;  
            }  
            if (exception instanceof InterruptedIOException) {  
                // Timeout  
                return false;  
            }  
            if (exception instanceof UnknownHostException) {  
                // Unknown host  
                return false;  
            }  
            if (exception instanceof ConnectTimeoutException) {  
                // Connection refused  
                return false;             
            }  
            if (exception instanceof SSLException) {  
                // SSL handshake exception  
                return false;  
            }  
            HttpClientContext clientContext = HttpClientContext.adapt(context);  
            HttpRequest request = clientContext.getRequest();  
            boolean idempotent = !(request instanceof HttpEntityEnclosingRequest);  
            if (idempotent) {  
                // Retry if the request is considered idempotent  
                return true;  
            }  
            return false;  
        }  
    };  
    /** 
     *  
     * 方法名：getHttpClient 
     * 作者：zhouyh 
     * 创建时间：2016-2-18 下午01:23:32 
     * 描述：多线程调用时，需要创建自己的httpclient 
     * @return 
     */  
    public CloseableHttpClient getHttpClient(){       
        // 创建全局的requestConfig  
        RequestConfig requestConfig = RequestConfig.custom()  
                .setConnectTimeout(Constant.HTTPCLIENT_CONNECT_TIMEOUT)  
                .setSocketTimeout(Constant.HTTPCLIENT_SOCKET_TIMEOUT)  
                .setCookieSpec(CookieSpecs.BEST_MATCH).build();  
        // 声明重定向策略对象  
        LaxRedirectStrategy redirectStrategy = new LaxRedirectStrategy();  
          
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(httpClientConnectionManager)  
                                                    .setDefaultRequestConfig(requestConfig)  
                                                    .setRedirectStrategy(redirectStrategy)  
                                                    .setRetryHandler(myRetryHandler)  
                                                    .build();     
        return httpClient;   
    }  
      
    /** 
     *  
     * 方法名：loadForString 
     * 作者：zhouyh 
     * 创建时间：2015-10-14 下午02:22:19 
     * 描述：根据传入的url获取下载信息 
     * @param url 
     * @param type 
     * @return 
     */  
    public static String loadForString(String urlString, int type){  
        String src = "";  
        if(null==urlString || urlString.isEmpty() || !urlString.startsWith("http")){//如果urlString为null或者urlString为空，或urlString非http开头，返回src空值  
            return src;  
        }  
        //创建response  
        CloseableHttpResponse response = null;  
        HttpGet httpGet = null;  
        urlString = urlString.trim();//防止传入的urlString首尾有空格  
        //转化String url为URI,解决url中包含特殊字符的情况  
        try {  
            URL url = new URL(urlString);  
            URI uri = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), null);  
            httpGet = new HttpGet(uri);  
            //针对https采用SSL的方式创建httpclient  
//          if(urlString.startsWith("https")){  
//              httpClient = createSSLClientDefault();  
//              System.setProperty ("jsse.enableSNIExtension", "false");  
//          }  
            //设置请求头  
            httpGet.addHeader("Accept","*/*");  
//          httpGet.addHeader("Content-Type","application/x-www-form-urlencoded; charset=UTF-8");  
            httpGet.addHeader("Connection","keep-alive");  
            httpGet.addHeader("Accept-Encoding", "gzip, deflate");  
              
            //设置USER_AGENT  
            Random random = new Random();  
            int randomInt = random.nextInt(4);  
            System.err.println(randomInt);  
              
            httpGet.addHeader("User-Agent", Constant.USER_AGENT[randomInt]);  
            //此处的代理暂时注释  
//          String[] proxys = Constant.HTTPCLIENT_PROXY[randomInt].split("\\s+");  
//          //添加代理  
//          HttpHost proxy = new HttpHost(proxys[0].trim(), Integer.parseInt(proxys[1].trim()), "http");  
//          RequestConfig config = RequestConfig.custom().setProxy(proxy).build();  
//          httpGet.setConfig(config);    
            //执行请求        
            try {  
                if(urlString.startsWith("https")){  
                    System.setProperty ("jsse.enableSNIExtension", "false");  
                    response = createSSLClientDefault().execute(httpGet);  
                }else{  
                    response = infoLoad.getHttpClient().execute(httpGet);  
                }  
            } catch (Exception e) {  
                e.printStackTrace();  
            }  
              
            //得到响应状态码  
            int statuCode = response.getStatusLine().getStatusCode();  
            //根据状态码进行逻辑处理  
            switch (statuCode){  
            case 200:  
                //获得响应实体  
                HttpEntity entity = response.getEntity();  
                /**  
                 * 仿浏览器获取网页编码  
                 * 浏览器是先从content-type的charset（响应头信息）中获取编码，  
                 * 如果获取不了，则会从meta（HTML里的代码）中获取charset的编码值  
                 */  
                //第一步-->处理网页字符编码  
                String charset = null;  
                ContentType contentType = null;  
                contentType = ContentType.getOrDefault(entity);  
                Charset charsets = contentType.getCharset();  
                if(null != charsets){  
                    charset = charsets.toString();  
                }  
                //判断返回的数据流是否采用了gzip压缩  
                Header header = entity.getContentEncoding();  
                boolean isGzip = false;  
                if(null != header){  
                    for(HeaderElement headerElement : header.getElements()){  
                        if(headerElement.getName().equalsIgnoreCase("gzip")){  
                            isGzip = true;  
                        }  
                    }  
                }  
                //获得响应流  
                InputStream inputStream = entity.getContent();  
                ByteArrayBuffer buffer = new ByteArrayBuffer(4096);  
                byte[] tmp = new byte[4096];  
                int count;  
                if(isGzip){//如果采用了Gzip压缩，则进行gizp压缩处理  
                    GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream);  
                    while((count=gzipInputStream.read(tmp)) != -1){  
                        buffer.append(tmp, 0, count);  
                    }  
                }else{//处理非gzip格式的数据  
                    while((count=inputStream.read(tmp)) != -1){  
                        buffer.append(tmp, 0, count);  
                    }  
                }  
                //第二步--->如果第一步contenttyp未获取到编码，这里从meta标签中获取  
                if(null==charset || "".equals(charset) || "null".equals(charset)   
                        || "zh-cn".equalsIgnoreCase(charset)){  
                    charset = getCharsetFromMetaTag(buffer, urlString);  
                }  
                //根据获取的字符编码转为string类型  
                src = new String(buffer.toByteArray(), charset);  
                //替换特殊编码  
                src = replaceStr(src);  
                //转化Unicode编码格式]  
                src = Common.decodeUnicode(src);  
//              System.out.println(src);  
                break;  
            case 400:  
                System.out.println("下载400错误代码，请求出现语法错误" + urlString);  
                //TODO 要进行判断是列表页还是正文页下载，再去修改数据库，下同  
                //TODO 此处添加对mongodb数据库的操作，将该url的isStart改为0，暂时不在进行监控，后续根据模板状态为0的进行修改  
                break;  
            case 403:  
                System.out.println("下载403错误代码，资源不可用" + urlString);                
                //TODO 此处添加对mongodb数据库的操作，将该url的isStart改为0，暂时不在进行监控，后续根据模板状态为0的进行修改  
                break;  
            case 404:  
                System.out.println("下载404错误代码，无法找到指定资源地址" + urlString);  
                //TODO 此处添加对mongodb数据库的操作，将该url的isStart改为0，暂时不在进行监控，后续根据模板状态为0的进行修改  
                break;  
            case 503:  
                System.out.println("下载503错误代码，服务不可用" + urlString);  
                //TODO 此处添加对mongodb数据库的操作，将该url的isStart改为0，暂时不在进行监控，后续根据模板状态为0的进行修改  
                break;  
            case 504:  
                System.out.println("下载504错误代码，网关超时" + urlString);  
                //TODO 此处添加对mongodb数据库的操作，将该url的isStart改为0，暂时不在进行监控，后续根据模板状态为0的进行修改  
                break;  
            }  
                  
        } catch (MalformedURLException e) {  
            //执行URL url = new URL()的异常  
            e.printStackTrace();  
        } catch (URISyntaxException e) {  
            //执行URI uri = new URI()的异常  
            e.printStackTrace();  
        } catch (ClientProtocolException e) {  
            // 执行httpClient.execute(httpGet)的异常  
            e.printStackTrace();  
        } catch (IOException e) {  
            // 执行httpClient.execute(httpGet)的异常  
            e.printStackTrace();  
        } finally{  
            if(response != null){  
                try {  
                    response.close();  
                } catch (IOException e) {  
                    e.printStackTrace();  
                }  
            }  
            httpGet.abort();    //结束后关闭httpGet请求  
            /** 
             * httpclient的链接有线程池管理，这里不用直接关闭 
             */  
//          try {//关闭连接  
//              httpClient.close();  
//          } catch (IOException e) {  
//              e.printStackTrace();  
//          }     
        }  
          
        return src;  
    }  
    /** 
     *  
     * 方法名：createSSLClientDefault 
     * 作者：zhouyh 
     * 创建时间：2015-10-14 下午03:03:30 
     * 描述：针对https采用SSL的方式创建httpclient 
     * @return 
     */  
    public static CloseableHttpClient createSSLClientDefault(){       
        try {             
            SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, new TrustStrategy(){  
            //信任所有  
            public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {  
                return true;  
            }}).build();  
  
            SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext);  
  
            return HttpClients.custom().setSSLSocketFactory(sslsf).build();  
  
        } catch (KeyManagementException e) {  
            e.printStackTrace();  
        } catch (NoSuchAlgorithmException e) {  
            e.printStackTrace();  
        } catch (KeyStoreException e) {  
            e.printStackTrace();  
        }  
              
        return  HttpClients.createDefault();  
    }  
    /** 
     *  
     * 方法名：getCharsetFromMetaTag 
     * 作者：zhouyh 
     * 创建时间：2015-10-14 下午05:23:08 
     * 描述：从meta标签中获取编码格式 
     * @param buffer 
     * @param url 
     * @return 
     */  
    public static String getCharsetFromMetaTag(ByteArrayBuffer buffer,String url){  
        String charset = null;  
        String regEx = Constant.CHARSET_REGEX;  
        Pattern p = Pattern.compile(regEx,  
                Pattern.CASE_INSENSITIVE);  
        Matcher m = p.matcher(new String(buffer.toByteArray()));  
        boolean result = m.find();  
        if (result) {  
            if (m.groupCount() == 1) {  
                charset = m.group(1);  
            }   
            System.err.println("网页 中的编码:" + charset + "\t url:" + url);  
        } else {  
            //出现未匹配的编码，先赋值为gbk  
            charset = "gbk";  
            System.out.println("字符编码未匹配到 : " + url);  
        }  
        return charset;  
    }  
    /** 
     *  
     * 方法名：replaceStr 
     * 作者：zhouyh 
     * 创建时间：2015-10-14 下午05:33:01 
     * 描述：替换原网页中的特殊字符 
     * @param src 
     * @return 
     */  
    public static String replaceStr(String src){  
        if (src == null || "".equals(src)) return null;  
        src = src.replaceAll("<!--", "");  
        src = src.replaceAll("-->", "");  
        src = src.replaceAll("<", "<");  
        src = src.replaceAll(">", ">");  
        src = src.replaceAll(""", "\"");  
        src = src.replaceAll(" ", " ");  
        src = src.replaceAll("&", "&");  
        return src;  
    }  
      
    /** 
     * 方法名：main 
     * 作者：zhouyh 
     * 创建时间：2015-10-14 上午08:46:47 
     * 描述：main方法 
     * @param args 
     */  
    public static void main(String[] args) {  
        // TODO Auto-generated method stub  
        Random random = new Random();  
        int randomInt = random.nextInt(4);  
        System.out.println(randomInt);  
//      InfoLoad.getInfoLoadInstance().loadForString("http://weixin.sogou.com/remind/doc_list.php?callback=jQuery111006747886650961997_1446517725478&from=web&uid=B31F8214DA30BE47F750B8BE2BF0E4AA%40qq.sohu.com&start=0&num=20&wordid=237&clear=1&_=1446517725480", 0);  
        InfoLoad.getInfoLoadInstance().loadForString("http://www.xinli001.com/info", 0);  
    }  
  
}  

[java] view plain copy
  

这里用到了IO常用的工具类IOUtils，详细了解可以参考这篇文章NO.63 [file]IO常用工具类IOUtils（Java读文件、写文件、打Zip包）

http://blog.csdn.net/hwwzyh/article/details/39184677

package main.utils;

import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLException;

import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.LaxRedirectStrategy;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.HttpContext;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.ssl.TrustStrategy;
import org.apache.http.util.ByteArrayBuffer;


/**
* 类名： InfoLoad
* 包名： com.boryou.module.load
* 作者： zhouyh
* 时间： 2015-10-14 上午08:46:47
* 描述：下载基础类，共监控、下载等模块使用
*/
public class InfoLoad {
// 创建httpclient连接池
private PoolingHttpClientConnectionManager httpClientConnectionManager = null;

/******单例模式声明开始******/
//类初始化时，自动实例化，饿汉单例模式
private static final InfoLoad infoLoad = new InfoLoad();
/**
*
* 方法名：getInfoLoadInstance
* 作者：zhouyh
* 创建时间：2015-10-14 上午08:59:54
* 描述：单例的静态方法，返回InfoLoad的实例
* @return
*/
public static InfoLoad getInfoLoadInstance(){
return infoLoad;
}
/******单例模式声明结束******/
/**
* 私有的构造函数
*/
private InfoLoad(){
//初始化httpClient
initHttpClient();
}
/**
*
* 方法名：initHttpClient
* 作者：zhouyh
* 创建时间：2015-10-14 上午11:00:30
* 描述：创建httpclient连接池，并初始化httpclient
*/
public void initHttpClient(){
//创建httpclient连接池
httpClientConnectionManager = new PoolingHttpClientConnectionManager();
//设置连接池最大数量
httpClientConnectionManager.setMaxTotal(5000);
//设置单个路由最大连接数量
httpClientConnectionManager.setDefaultMaxPerRoute(5000);
}
//请求重试机制
HttpRequestRetryHandler myRetryHandler = new HttpRequestRetryHandler() {
public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
if (executionCount >= 3) {
// 超过三次则不再重试请求
return false;
}
if (exception instanceof InterruptedIOException) {
// Timeout
return false;
}
if (exception instanceof UnknownHostException) {
// Unknown host
return false;
}
if (exception instanceof ConnectTimeoutException) {
// Connection refused
return false;
}
if (exception instanceof SSLException) {
// SSL handshake exception
return false;
}
HttpClientContext clientContext = HttpClientContext.adapt(context);
HttpRequest request = clientContext.getRequest();
boolean idempotent = !(request instanceof HttpEntityEnclosingRequest);
if (idempotent) {
// Retry if the request is considered idempotent
return true;
}
return false;
}
};
/**
*
* 方法名：getHttpClient
* 作者：zhouyh
* 创建时间：2016-2-18 下午01:23:32
* 描述：多线程调用时，需要创建自己的httpclient
* @return
*/
public CloseableHttpClient getHttpClient(){
// 创建全局的requestConfig
RequestConfig requestConfig = RequestConfig.custom()
.setConnectTimeout(3000)
.setSocketTimeout(2000)
.setCookieSpec(CookieSpecs.BEST_MATCH).build();
// 声明重定向策略对象
LaxRedirectStrategy redirectStrategy = new LaxRedirectStrategy();

CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(httpClientConnectionManager)
.setDefaultRequestConfig(requestConfig)
.setRedirectStrategy(redirectStrategy)
.setRetryHandler(myRetryHandler)
.build();
return httpClient;
}

/**
*
* 方法名：loadForString
* 作者：zhouyh
* 创建时间：2015-10-14 下午02:22:19
* 描述：根据传入的url获取下载信息
* @param url
* @param type
* @return
*/
public static String loadForString(String urlString, int type){
String src = "";
if(null==urlString || urlString.isEmpty() || !urlString.startsWith("http")){//如果urlString为null或者urlString为空，或urlString非http开头，返回src空值
return src;
}
//创建response
CloseableHttpResponse response = null;
HttpGet httpGet = null;
urlString = urlString.trim();//防止传入的urlString首尾有空格
//转化String url为URI,解决url中包含特殊字符的情况
try {
URL url = new URL(urlString);
URI uri = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), null);
httpGet = new HttpGet(uri);
//针对https采用SSL的方式创建httpclient
// if(urlString.startsWith("https")){
// httpClient = createSSLClientDefault();
// System.setProperty ("jsse.enableSNIExtension", "false");
// }
//设置请求头
httpGet.addHeader("Accept","*/*");
// httpGet.addHeader("Content-Type","application/x-www-form-urlencoded; charset=UTF-8");
httpGet.addHeader("Connection","keep-alive");
httpGet.addHeader("Accept-Encoding", "gzip, deflate");

//设置USER_AGENT
Random random = new Random();
int randomInt = random.nextInt(4);
System.err.println(randomInt);

httpGet.addHeader("User-Agent", "Moliza");
//此处的代理暂时注释
// String[] proxys = Constant.HTTPCLIENT_PROXY[randomInt].split("\\s+");
// //添加代理
// HttpHost proxy = new HttpHost(proxys[0].trim(), Integer.parseInt(proxys[1].trim()), "http");
// RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
// httpGet.setConfig(config);
//执行请求
try {
if(urlString.startsWith("https")){
System.setProperty ("jsse.enableSNIExtension", "false");
response = createSSLClientDefault().execute(httpGet);
}else{
response = infoLoad.getHttpClient().execute(httpGet);
}
} catch (Exception e) {
e.printStackTrace();
}

//得到响应状态码
int statuCode = response.getStatusLine().getStatusCode();
//根据状态码进行逻辑处理
switch (statuCode){
case 200:
//获得响应实体
HttpEntity entity = response.getEntity();
/**
* 仿浏览器获取网页编码
* 浏览器是先从content-type的charset（响应头信息）中获取编码，
* 如果获取不了，则会从meta（HTML里的代码）中获取charset的编码值
*/
//第一步-->处理网页字符编码
String charset = null;
ContentType contentType = null;
contentType = ContentType.getOrDefault(entity);
Charset charsets = contentType.getCharset();
if(null != charsets){
charset = charsets.toString();
}
//判断返回的数据流是否采用了gzip压缩
Header header = entity.getContentEncoding();
boolean isGzip = false;
if(null != header){
for(HeaderElement headerElement : header.getElements()){
if(headerElement.getName().equalsIgnoreCase("gzip")){
isGzip = true;
}
}
}
//获得响应流
InputStream inputStream = entity.getContent();
ByteArrayBuffer buffer = new ByteArrayBuffer(4096);
byte[] tmp = new byte[4096];
int count;
if(isGzip){//如果采用了Gzip压缩，则进行gizp压缩处理
GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream);
while((count=gzipInputStream.read(tmp)) != -1){
buffer.append(tmp, 0, count);
}
}else{//处理非gzip格式的数据
while((count=inputStream.read(tmp)) != -1){
buffer.append(tmp, 0, count);
}
}
//第二步--->如果第一步contenttyp未获取到编码，这里从meta标签中获取
if(null==charset || "".equals(charset) || "null".equals(charset)
|| "zh-cn".equalsIgnoreCase(charset)){
charset = getCharsetFromMetaTag(buffer, urlString);
}
//根据获取的字符编码转为string类型
src = new String(buffer.toByteArray(), charset);
//替换特殊编码
src = replaceStr(src);
//转化Unicode编码格式]
//src = Common.decodeUnicode(src);
// System.out.println(src);
break;
case 400:
System.out.println("下载400错误代码，请求出现语法错误" + urlString);
//TODO 要进行判断是列表页还是正文页下载，再去修改数据库，下同
//TODO 此处添加对mongodb数据库的操作，将该url的isStart改为0，暂时不在进行监控，后续根据模板状态为0的进行修改
break;
case 403:
System.out.println("下载403错误代码，资源不可用" + urlString);
//TODO 此处添加对mongodb数据库的操作，将该url的isStart改为0，暂时不在进行监控，后续根据模板状态为0的进行修改
break;
case 404:
System.out.println("下载404错误代码，无法找到指定资源地址" + urlString);
//TODO 此处添加对mongodb数据库的操作，将该url的isStart改为0，暂时不在进行监控，后续根据模板状态为0的进行修改
break;
case 503:
System.out.println("下载503错误代码，服务不可用" + urlString);
//TODO 此处添加对mongodb数据库的操作，将该url的isStart改为0，暂时不在进行监控，后续根据模板状态为0的进行修改
break;
case 504:
System.out.println("下载504错误代码，网关超时" + urlString);
//TODO 此处添加对mongodb数据库的操作，将该url的isStart改为0，暂时不在进行监控，后续根据模板状态为0的进行修改
break;
}

} catch (MalformedURLException e) {
//执行URL url = new URL()的异常
e.printStackTrace();
} catch (URISyntaxException e) {
//执行URI uri = new URI()的异常
e.printStackTrace();
} catch (ClientProtocolException e) {
// 执行httpClient.execute(httpGet)的异常
e.printStackTrace();
} catch (IOException e) {
// 执行httpClient.execute(httpGet)的异常
e.printStackTrace();
} finally{
if(response != null){
try {
response.close();
} catch (IOException e) {
e.printStackTrace();
}
}
httpGet.abort(); //结束后关闭httpGet请求
/**
* httpclient的链接有线程池管理，这里不用直接关闭
*/
// try {//关闭连接
// httpClient.close();
// } catch (IOException e) {
// e.printStackTrace();
// }
}

return src;
}
/**
*
* 方法名：createSSLClientDefault
* 作者：zhouyh
* 创建时间：2015-10-14 下午03:03:30
* 描述：针对https采用SSL的方式创建httpclient
* @return
*/
public static CloseableHttpClient createSSLClientDefault(){
try {
SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, new TrustStrategy(){
//信任所有
public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
return true;
}}).build();

SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext);

return HttpClients.custom().setSSLSocketFactory(sslsf).build();

} catch (KeyManagementException e) {
e.printStackTrace();
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
} catch (KeyStoreException e) {
e.printStackTrace();
}

return HttpClients.createDefault();
}
/**
*
* 方法名：getCharsetFromMetaTag
* 作者：zhouyh
* 创建时间：2015-10-14 下午05:23:08
* 描述：从meta标签中获取编码格式
* @param buffer
* @param url
* @return
*/
public static String getCharsetFromMetaTag(ByteArrayBuffer buffer,String url){
String charset = null;
String regEx = "";//Constant.CHARSET_REGEX;
Pattern p = Pattern.compile(regEx,
Pattern.CASE_INSENSITIVE);
Matcher m = p.matcher(new String(buffer.toByteArray()));
boolean result = m.find();
if (result) {
if (m.groupCount() == 1) {
charset = m.group(1);
}
System.err.println("网页中的编码:" + charset + "\t url:" + url);
} else {
//出现未匹配的编码，先赋值为gbk
charset = "gbk";
System.out.println("字符编码未匹配到 : " + url);
}
return charset;
}
/**
*
* 方法名：replaceStr
* 作者：zhouyh
* 创建时间：2015-10-14 下午05:33:01
* 描述：替换原网页中的特殊字符
* @param src
* @return
*/
public static String replaceStr(String src){
if (src == null || "".equals(src)) return null;
src = src.replaceAll("", "");
src = src.replaceAll("<", "<");
src = src.replaceAll(">", ">");
src = src.replaceAll("\"", "\\\"");
src = src.replaceAll(" ", " ");
src = src.replaceAll("&", "&");
return src;
}

/**
* 方法名：main
* 作者：zhouyh
* 创建时间：2015-10-14 上午08:46:47
* 描述：main方法
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
Random random = new Random();
int randomInt = random.nextInt(4);
System.out.println(randomInt);
// InfoLoad.getInfoLoadInstance().loadForString("http://weixin.sogou.com/remind/doc_list.php?callback=jQuery111006747886650961997_1446517725478&from=web&uid=B31F8214DA30BE47F750B8BE2BF0E4AA%40qq.sohu.com&start=0&num=20&wordid=237&clear=1&_=1446517725480", 0);
InfoLoad.getInfoLoadInstance().loadForString("http://www.xinli001.com/info", 0);
}

}

阅读全文

0 0