利用cpdetector获取文件编码格式,同时得到网页内容。增加http/https通用方式

来源:互联网 发布:java需要英语基础吗 编辑:程序博客网 时间:2024/06/04 08:27
获取网页编码格式,同时得到网页内容。
import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;

/**
 * Downloads a web page as text, using cpdetector to guess the page's
 * character encoding before the response is decoded.
 */
public class HtmlContentUtil {

    /** Encoding used when cpdetector yields no result at all. */
    private static final String DEFAULT_ENCODING = "utf-8";

    /** Name of the charset cpdetector hands back when it cannot identify the encoding. */
    private static final String UNKNOWN_CHARSET_NAME = "void";

    /** Encoding reported to callers when cpdetector answers with the unknown charset. */
    private static final String UNKNOWN_ENCODING_FALLBACK = "GBK";

    /** Shared encoding detector, configured once in the static initializer below. */
    private static final CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();

    static {
        // These detectors need antlr.jar and chardet.jar on the classpath.
        detector.add(JChardetFacade.getInstance());
        detector.add(new ParsingDetector(false));
        detector.add(ASCIIDetector.getInstance());
        detector.add(UnicodeDetector.getInstance());
    }

    /**
     * Fetches the content of a web page.
     *
     * <p>The page's lines are concatenated without their line terminators,
     * so the result is a single line of text.
     *
     * @param url page address; {@code http://} is prepended when the address
     *            does not already start with {@code http://} or {@code https://}
     * @return the decoded page content
     * @throws IllegalArgumentException if {@code url} is null or blank
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public static String getContent(String url) throws Exception {
        URL indexUrl = new URL(withDefaultScheme(url));

        String fileEncode = getFileEncode(indexUrl);
        if (fileEncode == null) {
            fileEncode = DEFAULT_ENCODING;
        }

        HttpURLConnection httpConn = (HttpURLConnection) indexUrl.openConnection();
        BufferedReader bufReader = null;
        try {
            bufReader = new BufferedReader(
                    new InputStreamReader(httpConn.getInputStream(), fileEncode));
            StringBuilder contentBuf = new StringBuilder();
            String line;
            while ((line = bufReader.readLine()) != null) {
                contentBuf.append(line);
            }
            return contentBuf.toString();
        } finally {
            if (bufReader != null) {
                try {
                    bufReader.close();
                } catch (IOException ignored) {
                    // A failure while closing must not mask the result or the original error.
                }
            }
            // Also releases the stream if the reader could not be constructed.
            httpConn.disconnect();
        }
    }

    /**
     * Detects the encoding of the resource behind a URL with the third-party
     * cpdetector library.
     *
     * @param indexUrl resource to inspect
     * @return the detected charset name, {@code "GBK"} when cpdetector reports
     *         an unknown encoding, or {@code null} when detection fails or
     *         produces no result
     */
    public static String getFileEncode(URL indexUrl) {
        Charset charset = null;
        try {
            charset = detector.detectCodepage(indexUrl);
        } catch (Exception ignored) {
            // Detection is best effort: on failure the caller receives null
            // and picks its own default encoding.
        }
        if (charset == null) {
            return null;
        }
        if (UNKNOWN_CHARSET_NAME.equals(charset.name())) {
            // Unknown encodings default to GBK.
            return UNKNOWN_ENCODING_FALLBACK;
        }
        return charset.name();
    }

    /**
     * Returns the trimmed address, prefixed with {@code http://} unless it
     * already starts with an http or https scheme (compared case-insensitively).
     */
    private static String withDefaultScheme(String url) {
        if (url == null || url.trim().isEmpty()) {
            throw new IllegalArgumentException("url must not be null or empty");
        }
        String trimmed = url.trim();
        // Test the prefix rather than contains("http"): a host such as
        // "www.httpbin.org" contains "http" but still has no scheme.
        boolean hasHttpScheme = trimmed.regionMatches(true, 0, "http://", 0, 7)
                || trimmed.regionMatches(true, 0, "https://", 0, 8);
        return hasHttpScheme ? trimmed : "http://" + trimmed;
    }

    /** Demo entry point: prints the content of a sample site. */
    public static void main(String[] args) {
        try {
            System.out.println(getContent("www.xjjz.gov.cn"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

以上方式只能获取 http 协议网站的内容,下面增加一种 http、https 都适用的获取方式。

创建 HttpClient,并为 http、https 分别注册连接工厂

/**
 * Creates an HttpClient that can reach both http and https endpoints.
 *
 * <p>SECURITY NOTE(review): the https socket factory accepts every
 * certificate chain ({@code isTrusted} always returns true) and is built with
 * {@code ALLOW_ALL_HOSTNAME_VERIFIER}, so TLS connections made through this
 * client are not authenticated. Confirm this is acceptable before using it
 * for anything beyond fetching public pages.
 *
 * @return a client backed by a pooling connection manager; never {@code null}
 * @throws IllegalStateException if the SSL context cannot be initialised
 */
public CloseableHttpClient buildHttpClient() {
    try {
        RegistryBuilder<ConnectionSocketFactory> builder = RegistryBuilder.create();

        // Plain sockets for http.
        ConnectionSocketFactory factory = new PlainConnectionSocketFactory();
        builder.register("http", factory);

        // TLS sockets for https, with a strategy that trusts any certificate.
        KeyStore trustStore = KeyStore.getInstance(KeyStore.getDefaultType());
        SSLContext context = SSLContexts.custom().useTLS().loadTrustMaterial(trustStore, new TrustStrategy() {
            @Override
            public boolean isTrusted(X509Certificate[] chain, String authType)
                    throws CertificateException {
                return true;
            }
        }).build();
        LayeredConnectionSocketFactory sslFactory =
                new SSLConnectionSocketFactory(context, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
        builder.register("https", sslFactory);

        Registry<ConnectionSocketFactory> registry = builder.build();
        PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager(registry);

        // defaultEncoding is defined outside this method.
        ConnectionConfig connConfig =
                ConnectionConfig.custom().setCharset(Charset.forName(defaultEncoding)).build();
        SocketConfig socketConfig = SocketConfig.custom().setSoTimeout(100000).build();
        manager.setDefaultConnectionConfig(connConfig);
        manager.setDefaultSocketConfig(socketConfig);

        return HttpClientBuilder.create().setConnectionManager(manager).build();
    } catch (java.security.GeneralSecurityException e) {
        // Covers KeyStoreException, KeyManagementException and
        // NoSuchAlgorithmException. Fail loudly with the cause attached
        // instead of returning null, which callers would only hit later as
        // a NullPointerException.
        throw new IllegalStateException("Unable to initialise the SSL context for the HTTP client", e);
    }
}

/** * @描述:  获取网页内容,支持http和https * @说明: * @修改时间: 2016年7月4日 上午10:20:27 * @param url * @return * @throws IOException  * @throws ClientProtocolException  */public static String getAllContent(String url) throws ClientProtocolException, IOException {if (!url.contains("http") && !url.contains("https")) {url = "http://" + url;}String fileEncode = getFileEncode(new URL(url));if(fileEncode == null){fileEncode = "utf-8";}CloseableHttpClient buildHttpClient = new HttpUtils().buildHttpClient();//RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(5000).setConnectionRequestTimeout(3000).setSocketTimeout(3000).build();HttpGet httpGet = new HttpGet(url);//httpGet.setConfig(requestConfig);CloseableHttpResponse response = buildHttpClient.execute(httpGet);HttpEntity entity = response.getEntity();String result = "";if (entity != null) {result = EntityUtils.toString(entity, Charset.forName(fileEncode));}return result;}



0 0
原创粉丝点击