解决Java从URL读取HTML源码时的乱码问题

来源:互联网 发布:飞狐软件官网 编辑:程序博客网 时间:2024/06/07 16:42

1、先用getContentType()获取响应的Content-Type,其中通常包含网页的编码方式(charset):

// Parse the address string into a URL object.
pageUrl=new URL(urlString);

// Open a connection to the page. The cast only succeeds for URLs whose
// connection type is HttpURLConnection (i.e. http/https addresses).
HttpURLConnection uc = (HttpURLConnection) pageUrl.openConnection();

// Read the Content-Type header value, e.g. "text/html; charset=gb2312".
// NOTE(review): getContentType() returns null when the type is not known.
String encoding=uc.getContentType();

2、再提取charset子串(这里直接查找"charset="。由于Content-Type中的参数名不区分大小写,而且charset可能根本不存在,实际使用时最好改用正则表达式匹配,并在找不到时回退到默认编码)

// Keep only the text that follows "charset=" (8 characters) in the Content-Type value.
// NOTE(review): the search is case-sensitive, and when "charset=" is absent
// indexOf returns -1, so substring(7) yields an arbitrary tail, not a charset name.
encoding=encoding.substring(encoding.indexOf("charset=")+8).trim();
//System.out.println("+"+encoding+"+");
// Create the network stream, decoding the bytes with the extracted charset name.
// NOTE(review): openStream() opens a new connection rather than reusing uc.
BufferedReader reader=
new BufferedReader(new InputStreamReader(pageUrl.openStream(),encoding));

3、下面是完整的源代码,注释得很清楚:

 1 import java.io.BufferedReader; 2 import java.io.IOException; 3 import java.io.InputStreamReader; 4 import java.io.UnsupportedEncodingException; 5 import java.net.HttpURLConnection; 6 import java.net.MalformedURLException; 7 import java.net.URL; 8  9 public class PageString {10     private StringBuffer strBuf=new StringBuffer();11     private URL pageUrl=null;12     public PageString(String urlString){13         try {14             //System.out.println(urlString);15             pageUrl=new URL(urlString);16             try {17                 //获取网页的编码方式,这里可以解决乱码问题18                 HttpURLConnection uc = (HttpURLConnection) pageUrl.openConnection();19                 String encoding=uc.getContentType();20                 encoding=encoding.substring(encoding.indexOf("charset=")+8).trim();21                 //System.out.println("+"+encoding+"+");22                 // 创建网络流23                 BufferedReader reader=24                 new BufferedReader(new InputStreamReader(pageUrl.openStream(),encoding));25                 String line;26                 // 读取网页内容27                 //new StringBuffer();28                 while((line=reader.readLine())!=null){29                     //System.out.println(line);30                    strBuf.append(line+"\t\n");31                 }32                 } catch (IOException e) {33                 // TODO Auto-generated catch block34                 e.printStackTrace();35                 }36         } catch (MalformedURLException e) {37             // TODO Auto-generated catch block38             e.printStackTrace();39         }40     }41     public StringBuffer getStrBuf() throws UnsupportedEncodingException {42         //System.out.println(new String(strBuf.toString().getBytes("gb2312")).toString());43         return this.strBuf;44     }45 }