Getting Started with Web Crawling: Fetching Web Page Content with Java

I've recently been digging into web crawling with Java. Heh, I've gotten my feet wet, and I'd like to share what I've learned. Two methods are provided below: one uses the packages from Apache, the other uses what ships with Java itself. The code is as follows:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.PostMethod;

// Method 1
// This method uses the packages from Apache; it is simple and convenient,
// but it needs the following jars:
//     commons-codec-1.4.jar
//     commons-httpclient-3.1.jar
//     commons-logging-1.0.4.jar
public static String createhttpClient(String url, String param) {
    HttpClient client = new HttpClient();
    String response = null;
    String keyword = null;
    PostMethod postMethod = new PostMethod(url);
    // try {
    //     if (param != null)
    //         keyword = new String(param.getBytes("gb2312"), "ISO-8859-1");
    // } catch (UnsupportedEncodingException e1) {
    //     e1.printStackTrace();
    // }
    // NameValuePair[] data = { new NameValuePair("keyword", keyword) };
    // // put the form values into the postMethod
    // postMethod.setRequestBody(data);
    // The block above fetches with POST parameters; I commented it out myself.
    // Uncomment it if you want to experiment.
    try {
        int statusCode = client.executeMethod(postMethod); // HTTP status, e.g. 200
        response = new String(postMethod.getResponseBodyAsString()
                .getBytes("ISO-8859-1"), "gb2312");
        // Careful here: "gb2312" must match the encoding of the page you are fetching.
        String p = response.replaceAll("\\&[a-zA-Z]{1,10};", "")
                .replaceAll("<[^>]*>", ""); // strip HTML entities and tags from the page
        System.out.println(p);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        postMethod.releaseConnection(); // don't leak the connection
    }
    return response;
}

// Method 2
// This method uses Java's built-in URL class to fetch the site's content.
public String getPageContent(String strUrl, String strPostRequest, int maxLength) {
    // buffer for the resulting page
    StringBuffer buffer = new StringBuffer();
    System.setProperty("sun.net.client.defaultConnectTimeout", "5000");
    System.setProperty("sun.net.client.defaultReadTimeout", "5000");
    try {
        URL newUrl = new URL(strUrl);
        HttpURLConnection hConnect = (HttpURLConnection) newUrl.openConnection();
        // extra data for a POST request
        if (strPostRequest.length() > 0) {
            hConnect.setDoOutput(true);
            OutputStreamWriter out = new OutputStreamWriter(hConnect.getOutputStream());
            out.write(strPostRequest);
            out.flush();
            out.close();
        }
        // read the content, up to maxLength characters
        BufferedReader rd = new BufferedReader(new InputStreamReader(hConnect.getInputStream()));
        int ch;
        for (int length = 0; (ch = rd.read()) > -1
                && (maxLength <= 0 || length < maxLength); length++)
            buffer.append((char) ch);
        String s = buffer.toString();
        // Strings are immutable, so the result of replaceAll must be reassigned
        s = s.replaceAll("\\&[a-zA-Z]{1,10};", "").replaceAll("<[^>]*>", "");
        System.out.println(s);
        rd.close();
        hConnect.disconnect();
        return buffer.toString().trim();
    } catch (Exception e) {
        // return "Error: failed to read the page!";
        return null;
    }
}

Then write a test class (the two methods above are assumed to sit in a class named createhttpClient):

public static void main(String[] args) {
    String url = "http://www.renren.com";
    String keyword = "人人";
    createhttpClient p = new createhttpClient();
    String response = p.createhttpClient(url, keyword); // method 1
    // p.getPageContent(url, "post", 100500);           // method 2
}

Heh, take a look at the console: the page content has been fetched, hasn't it?
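As an aside: commons-httpclient 3.1 is quite dated by now. If you are on Java 11 or newer, the JDK's built-in java.net.http.HttpClient can fetch a page with no extra jars at all. Here is a minimal sketch of that alternative (the class name FetchPage is my own invention, and the target URL is just the same test URL as above):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class FetchPage {
    public static void main(String[] args) throws Exception {
        // build a client and a GET request for the target page
        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://www.renren.com"))
                .build();
        // ofString() decodes the body using the charset declared in the
        // response headers, falling back to UTF-8
        HttpResponse<String> response =
                client.send(request, HttpResponse.BodyHandlers.ofString());
        // same entity/tag stripping as the two methods above
        String text = response.body()
                .replaceAll("\\&[a-zA-Z]{1,10};", "")
                .replaceAll("<[^>]*>", "");
        System.out.println(text);
    }
}

Note that BodyHandlers.ofString() handles the character-set decoding for you when the server declares its charset, so the ISO-8859-1/gb2312 round-trip from method 1 is usually unnecessary here.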
The third method:

import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;

public class GetUrlToHtml {
    public static void main(String[] args) {
        InputStream in = null;
        OutputStream out = null;
        try {
            if ((args.length != 1) && (args.length != 2))
                throw new IllegalArgumentException("Wrong number of args");
            URL url = new URL(args[0]);
            in = url.openStream();
            if (args.length == 2)
                out = new FileOutputStream(args[1]); // save to the given file
            else
                out = System.out;                    // otherwise print to the console
            // copy the raw bytes of the page to the output in 4 KB chunks
            byte[] buffer = new byte[4096];
            int bytes_read;
            while ((bytes_read = in.read(buffer)) != -1) {
                out.write(buffer, 0, bytes_read);
            }
        } catch (Exception e) {
            System.err.println(e);
            System.err.println("Usage: java GetUrlToHtml <URL> [<filename>]");
        } finally {
            try { in.close(); out.close(); } catch (Exception e) {}
        }
    }
}
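To try the third method, compile the class and run it from the command line; with one argument the page is printed to the console, and with a second argument it is saved to that file. For example (page.html is just a sample output name):

javac GetUrlToHtml.java
java GetUrlToHtml http://www.renren.com page.html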