简单爬虫中文乱码问题

来源:互联网 发布:旅游 知乎 编辑:程序博客网 时间:2024/06/06 04:54

对于下文的爬虫程序,创建 InputStreamReader 时指定的编码方式(原文中以红色标出的语句)必须与所下载网页的实际编码方式一致,这样才能解决下载文件中出现中文乱码的问题。


package chapter2;

import java.io.*;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.*;


/**
 * Minimal socket-based page downloader.
 *
 * <p>Sends a single {@code GET /} request to a fixed host over plain HTTP, keeps
 * everything from the first {@code '<'} of the response onwards (which drops the
 * status line and headers), prints that text to standard output and saves it to
 * {@link #Text_File_Path}.
 *
 * <p>The response is decoded and the file is written with the same charset
 * ({@link #PAGE_CHARSET}); set that constant to the encoding of the page being
 * downloaded, otherwise non-ASCII text ends up garbled.
 */
public class WebCrawler {

    /** Destination file for the downloaded markup. */
    private static final String Text_File_Path = "D:\\HtmlDownload\\htmlsrc.html";

    /** Host that is connected to and also named in the {@code Host} request header. */
    private static final String HOST = "www.bnu.edu.cn";

    /** TCP port of the web server. */
    private static final int PORT = 80;

    /** Charset used both to decode the response and to encode the saved file. */
    private static final String PAGE_CHARSET = "utf-8";

    /**
     * Downloads the page, echoes it to standard output and writes it to disk.
     * On failure an error message is printed to standard error and the JVM exits
     * with status 1.
     *
     * @param args command-line arguments; not used
     * @throws IOException declared for compatibility; I/O failures are handled here
     */
    public static void main(String[] args) throws IOException {
        try {
            // Download first so that a failed request does not truncate an existing file.
            String html = download();
            System.out.println(html);
            save(html);
        } catch (UnknownHostException e) {
            System.err.println("无法指定主机。");
            System.exit(1);
        } catch (IOException e) {
            System.err.println("下载失败,请检查输入地址是否正确!");
            System.exit(1);
        }
    }

    /**
     * Requests {@code /} from {@link #HOST} and returns the response text starting
     * at the first {@code '<'}; returns an empty string if the response contains none.
     *
     * @return the decoded response from the first {@code '<'} to end of stream
     * @throws IOException if connecting, sending or receiving fails
     */
    private static String download() throws IOException {
        // Closing the socket also closes its input and output streams.
        try (Socket socket = new Socket(HOST, PORT)) {
            Writer request = new OutputStreamWriter(socket.getOutputStream(), "US-ASCII");
            // HTTP requires CRLF line endings regardless of the platform line separator.
            request.write("GET / HTTP/1.1\r\n"
                    + "Host: " + HOST + "\r\n"
                    + "Connection: Close\r\n"
                    + "\r\n");
            request.flush();

            // NOTE(review): an HTTP/1.1 server may answer with chunked transfer
            // encoding, in which case chunk-size lines would remain in the text.
            Reader response = new BufferedReader(
                    new InputStreamReader(socket.getInputStream(), PAGE_CHARSET));
            StringBuilder page = new StringBuilder(8096);

            // read() blocks until data arrives or the stream ends, so no polling is needed.
            int ch = response.read();
            while (ch != -1 && ch != '<') {
                ch = response.read();
            }
            while (ch != -1) {
                page.append((char) ch);
                ch = response.read();
            }
            return page.toString();
        }
    }

    /**
     * Writes {@code html} to {@link #Text_File_Path} encoded as {@link #PAGE_CHARSET},
     * replacing any existing file content.
     *
     * @param html text to store
     * @throws IOException if the file cannot be opened or written
     */
    private static void save(String html) throws IOException {
        try (OutputStream fileOut = new FileOutputStream(new File(Text_File_Path));
             Writer out = new OutputStreamWriter(fileOut, PAGE_CHARSET)) {
            out.write(html);
        }
    }
}

原创粉丝点击