java下载网页的方法

来源：互联网 | 发布：电脑音乐录音软件 | 编辑：程序博客网 | 时间：2024/05/19 04:51

Java 下载网页的方法主要有两种：使用 Java 自带的 HttpURLConnection 类，或使用 Apache 的 HttpClient 类库，两种方法各有优点。另外，对于中文乱码的处理，本文在代码中有详细的体现和比较，能够很好地消除中文乱码问题，供大家参考。下面就让我们在代码中领悟吧！

方法1：HttpURLConnection的两种不同解码方式

package com.learn.http.impl;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

import com.learn.http.Http;
import com.learn.util.SingleMatch;

/**
 * Downloads a web page with the JDK's built-in {@link HttpURLConnection},
 * reading the response body as raw bytes and decoding it afterwards.
 *
 * <p>Advantage: no third-party library is required.
 */
public class HttpURLConnectionImp1 implements Http {

    /**
     * Fetches the body of {@code pageUrl} and decodes it with {@code encoding}.
     *
     * <p>The bytes are collected completely before being decoded. Decoding
     * each 1024-byte chunk on its own would garble any multi-byte character
     * (for example a Chinese character) that straddles a chunk boundary.
     *
     * <p>Failures are reported on {@code System.err}; the method does not
     * throw for a malformed URL, an I/O error or a non-200 response.
     *
     * @param pageUrl  absolute URL of the page to download
     * @param encoding charset name used to decode the response body
     * @return the decoded body; whatever was received before an I/O error if
     *         one occurs mid-transfer; an empty string if nothing was read or
     *         the charset is not supported
     */
    public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {
        // Allocated before the try block so a malformed URL can no longer
        // lead to a NullPointerException at the return statement.
        ByteArrayOutputStream body = new ByteArrayOutputStream();
        InputStream in = null;
        try {
            URL url = new URL(pageUrl);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
                in = conn.getInputStream();
                byte[] buf = new byte[1024];
                int len;
                while ((len = in.read(buf)) != -1) {
                    body.write(buf, 0, len);
                }
            } else {
                System.err.println("访问网络失败！" + conn.getResponseCode());
            }
        } catch (MalformedURLException e) {
            System.err.println("url格式不规范:" + e.getMessage());
        } catch (IOException e) {
            System.err.println("IO操作错误：" + e.getMessage());
        } finally {
            // Close the stream on every path, including after a read error.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    System.err.println("IO操作错误：" + e.getMessage());
                }
            }
        }

        try {
            // Decode once, over the whole body.
            return body.toString(encoding);
        } catch (UnsupportedEncodingException e) {
            System.err.println("IO操作错误：" + e.getMessage());
            return "";
        }
    }

    /**
     * Downloads {@code url}, first decoding it as UTF-8, then looks for a
     * {@code charset=...} declaration in the text and downloads the page
     * again with that charset when it differs from UTF-8.
     *
     * @param url absolute URL of the page to download
     * @return the page text, decoded with the declared charset when one was
     *         found, otherwise decoded as UTF-8
     */
    public String getHtml(String url) {
        String firstEncoding = "utf-8";
        String html = getHtmlcodeWithoutHeader(url, firstEncoding);
        String encoding = SingleMatch.match(html, "charset=\"?(\\S*?)\"|charset='?(\\S*?)'");

        // NOTE(review): SingleMatch is not visible here; null is treated the
        // same as "no charset found" in case it can return null - confirm.
        // The empty check compares content, not references (== "" only matched
        // the interned literal).
        if (encoding == null || encoding.length() == 0
                || encoding.equalsIgnoreCase(firstEncoding)) {
            return html;
        }
        return getHtmlcodeWithoutHeader(url, encoding);
    }
}

package com.learn.http.impl;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.net.HttpURLConnection;import java.net.MalformedURLException;import java.net.URL;import com.learn.http.Http;import com.learn.util.SingleMatch;public class HttpURLConnectionImp2 implements Http{/** * 该方法可以很好的解决中文乱码问题，同样采用java自带的HttpURLConnection类，方便 */public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {//Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 8580)); //设置代理服务器StringBuffer sb = new StringBuffer();try {URL url = new URL(pageUrl);//HttpURLConnection conn = (HttpURLConnection) url.openConnection(proxy);HttpURLConnection conn = (HttpURLConnection) url.openConnection();InputStream in = conn.getInputStream();BufferedReader br = new BufferedReader(new InputStreamReader(in, encoding));String line = null;while((line=br.readLine())!=null){sb.append(line);sb.append("\r\n");}br.close();in.close();} catch (MalformedURLException e) {System.err.println("url格式不规范:"+e.getMessage());} catch (IOException e) {System.err.println("IO操作错误："+e.getMessage());}return sb.toString();}public String getHtml(String url) {String html = "";String firstEncoding = "utf-8";html = getHtmlcodeWithoutHeader(url, firstEncoding);String encoding = SingleMatch.match(html, "charset=\"?(\\S*?)\"|charset='?(\\S*?)'");if(encoding.toLowerCase().equals(firstEncoding)){}else if(encoding==""){}else{html = getHtmlcodeWithoutHeader(url, encoding);}return html;}}

方法2：HttpClient的两种访问网页方式

package com.learn.http.impl;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.UnsupportedEncodingException;import org.apache.http.Header;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.StatusLine;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.HttpClient;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.DefaultHttpClient;import com.learn.http.Http;import com.learn.util.SingleMatch;@SuppressWarnings("deprecation")public class HttpClientImp1 implements Http {/** * 好像这个方法已经不推荐了，呵呵 */public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {HttpClient client = new DefaultHttpClient();HttpGet get = new HttpGet(pageUrl);        String s = null;// HttpHost poxy = new HttpHost("127.0.0.1", 443);try {HttpResponse response = client.execute(get);StatusLine status = response.getStatusLine();System.out.println("状态行：" + status);Header[] heads = response.getAllHeaders();System.out.println("首部行：");for (Header h : heads)System.out.println("名称：" + h.getName() + " 值：" + h.getValue());HttpEntity entity = response.getEntity();InputStream in = null;if (entity != null) {in = entity.getContent();s =  inputStream2String(in, encoding);in.close();}} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}return s;}private String inputStream2String(InputStream in,String charset){BufferedReader br;StringBuffer sb = new StringBuffer();try {br = new BufferedReader(new InputStreamReader(in,charset));String line = "";while ((line = br.readLine()) != null) {sb.append(line + "\n");}br.close();} catch (UnsupportedEncodingException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}return sb.toString();}public String getHtml(String url) {String html = "";String firstEncoding = "utf-8";html = getHtmlcodeWithoutHeader(url, 
firstEncoding);String encoding = SingleMatch.match(html, "charset=\"?(\\S*?)\"|charset='?(\\S*?)'");if(encoding.toLowerCase().equals(firstEncoding)){}else if(encoding==""){}else{html = getHtmlcodeWithoutHeader(url, encoding);}return html;}}

package com.learn.http.impl;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.UnsupportedEncodingException;import javax.net.ssl.HttpsURLConnection;import org.apache.commons.httpclient.HttpClient;import org.apache.commons.httpclient.HttpException;import org.apache.commons.httpclient.methods.GetMethod;import com.learn.http.Http;import com.learn.util.SingleMatch;public class HttpClientImp2 implements Http{/** * 该方法需要导入commons-httpclient包，功能更加强大，设置代理服务器，翻墙更方便哟 *///static String PROXY_HOST = "127.0.0.1";//static int PROXY_PORT = 8580;static HttpClient client = null;static {client = new HttpClient();/*client.getHostConfiguration().setProxy(PROXY_HOST,PROXY_PORT);Credentials credentials = new Credentials() {}; //代理匿名认证 AuthScopeAuthScope authscope = new AuthScope(PROXY_HOST,PROXY_PORT);client.getState().setProxyCredentials(authscope, credentials); *///client.getParams().setAuthenticationPreemptive(true);}public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {        String response = null;        GetMethod getMethod = new GetMethod(pageUrl);        try {client.executeMethod(getMethod);if(getMethod.getStatusCode()==HttpsURLConnection.HTTP_OK){InputStream in = getMethod.getResponseBodyAsStream();    response = inputStream2String(in, encoding);    in.close();}elseSystem.err.println("访问网络失败！");} catch (HttpException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}return response;}private String inputStream2String(InputStream in,String charset){BufferedReader br;StringBuffer sb = new StringBuffer();try {br = new BufferedReader(new InputStreamReader(in,charset));String line = "";while ((line = br.readLine()) != null) {sb.append(line + "\n");}br.close();} catch (UnsupportedEncodingException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}return sb.toString();}public String getHtml(String url) {String html = "";String firstEncoding = 
"utf-8";html = getHtmlcodeWithoutHeader(url, firstEncoding);String encoding = SingleMatch.match(html, "charset=\"?(\\S*?)\"|charset='?(\\S*?)'");if(encoding.toLowerCase().equals(firstEncoding)){}else if(encoding==""){}else{html = getHtmlcodeWithoutHeader(url, encoding);}return html;}}

每个类的方法注释中都有详细的优缺点说明，希望读者能从中受益。

下载源代码：java网页下载的四种不同实现方法

0 0