JAVA编写网络爬虫笔记（第二部分:httpClient下载页面）

来源：互联网 | 发布：java 内部类构造函数 | 编辑：程序博客网 | 时间：2024/06/04 20:03

接着上一部分的内容，我们看一下怎样下载和解析页面。这里用到的 Java 库主要是 Apache Commons HttpClient 3.x（包名为 org.apache.commons.httpclient），可以去 Apache 官网下载。

我们先创建一个 HttpClient 对象，再针对目标 URL 创建一个 GetMethod 对象，由 HttpClient 执行这个 GetMethod 去请求网页；服务器把 html 源代码返回给我们之后，我们就可以把它保存为一个 html 文件或者 txt 文件，然后进行下一步的信息提取了。

主要的代码实现

import java.io.DataOutputStream;import java.io.File;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.PrintStream;import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;import org.apache.commons.httpclient.HttpClient;import org.apache.commons.httpclient.HttpException;import org.apache.commons.httpclient.HttpMethod;import org.apache.commons.httpclient.HttpStatus;import org.apache.commons.httpclient.methods.GetMethod;import org.apache.commons.httpclient.methods.PostMethod;import org.apache.commons.httpclient.params.HttpMethodParams;    public class DownLoadFile {        /**         * 根据URL和网页类型生成需要保存的网页的文件名，去除URL中的非文件名字符         * */    public String getFileNameByUrl(String url,String contentType){            //移除http:            url=url.substring(7);            //text/html类型            if(contentType.indexOf("html")!=-1){                url=url.replaceAll("[\\?/:*|<>\"]","_")+".html";                return url;            }            else            {                return url.replaceAll("[\\?:*|<>\"]","_")+"."+contentType.substring(contentType.lastIndexOf("/")+1);            }        }        /**         * 保存网页字节数组到本地文件,filepath为要保存文件的相对地址         * */        private void saveToLocal(String context ,String filePath){            try {                   File file = new File(filePath);                if(!file.exists()){                    file.createNewFile();}             PrintStream ps = new PrintStream(new FileOutputStream(file));             ps.append(context);}             catch (FileNotFoundException e) {                // TODO 自动生成的 catch 块                e.printStackTrace();            } catch (IOException e) {                // TODO 自动生成的 catch 块                e.printStackTrace();            }        }        /**         * 下载url所指向的网页         * */        public String downloadFile(String url){            String filePath =null;            String filePath1=null;            
//生成HttpClient对象并设置参数            HttpClient httpClient=new HttpClient();            //2.生成GetMethod对象并设置参数            GetMethod getMethod =new GetMethod(url);              httpClient.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET,"gb2312");            //3.执行http get请求    try {                int statusCode=httpClient.executeMethod(getMethod);                //判断访问的状态码                if(statusCode!=HttpStatus.SC_OK){                    System.out.println("Method failed:"+getMethod.getStatusLine());                    filePath=null;                }                //处理HTTP内容            String  responseBody=getMethod.getResponseBodyAsString();                //根据网页url生成保存时的文件名                filePath="C:"+File.separator+"test1"+File.separator+"+getFileNameByUrl(url,getMethod.getResponseHeader('Content-Type').getValue())";                saveToLocal(responseBody,filePath);            } catch (HttpException e) {                // 发生异常，可能是协议不对或者返回的内容有问题                System.out.println("Please check your provided http address!");                e.printStackTrace();            } catch (IOException e) {                   // 发生网络异常                e.printStackTrace();            }finally{                //释放链接                getMethod.releaseConnection();            }            return filePath;        }    }

有了下载的文件，我们就可以从里面提取出url，然后放入队列，继续抓取。

0 0