JavaSpider学习

来源:互联网 发布:mac上铃声软件 编辑:程序博客网 时间:2024/06/04 18:24

javaSpider

所需jar包
jar包


1. 我的第一个测试

public class SpiderTest {/** * 爬取网站的源码 */@Testpublic void test1(){    BufferedReader bf = null;    try {        //获得URL对象        URL url = new URL("http://1483104508.55555.io/From");        //获得对应的inputStream流        InputStream input = url.openStream();        //存入BufferReader 并设置字符集        bf = new BufferedReader(new InputStreamReader(input, "utf-8"));        String str =null;        //读取数据        while((str = bf.readLine())!=null)        {            //打印            System.out.println(str);        }    } catch (Exception e) {        // TODO Auto-generated catch block        e.printStackTrace();    }finally {        //关闭BufferReader        if(bf!=null)            try {                bf.close();            } catch (IOException e) {                // TODO Auto-generated catch block                e.printStackTrace();            }    }}/** * 通过代理访问 */@Testpublic void test2(){    BufferedReader bf = null;    try {        URL url = new URL("http://1483104508.55555.io/From");        //设置代理,XX为代理服务器IP,host为端口号        Proxy proxy  =  new Proxy(Type.HTTP,new InetSocketAddress("XXX.XXX.XXX.XXX", host));        //通过代理获得URLConnection连接        URLConnection u=url.openConnection(proxy);        //参考test1        InputStream input = u.getInputStream();        bf = new BufferedReader(new InputStreamReader(input, "utf-8"));        String str =null;        while((str = bf.readLine())!=null)        {            System.out.println(str);        }    } catch (Exception e) {        // TODO Auto-generated catch block        e.printStackTrace();    }finally {        if(bf!=null)            try {                bf.close();            } catch (IOException e) {                // TODO Auto-generated catch block                e.printStackTrace();            }    }}}

2.HttpClient

public class SpiderTest1 {@Testpublic void test(){    HttpClient send = new DefaultHttpClient();    send.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost("XXX.XXX.XXX.XXX", host));    //代理方式XX为ip,host为端口    HttpContext context = new BasicHttpContext();    HttpGet get = new HttpGet("http://1483104508.55555.io/From");//  HttpGet get = new HttpGet("http://www.baidu.com");    BufferedReader bf = null;    try {//      HttpResponse response= send.execute(get);//只获取消息体        HttpResponse response= send.execute(get,context);//可以获得消息头        HttpEntity entity = response.getEntity();//获得的是请求体        Object HTTP_CONNECTION = context.getAttribute(ExecutionContext.HTTP_CONNECTION);        System.out.println(HTTP_CONNECTION);//获得http连接        Object HTTP_PROXY_HOST =  context.getAttribute(ExecutionContext.HTTP_PROXY_HOST);        System.out.println(HTTP_PROXY_HOST);//代理主机host        Object HTTP_REQ_SENT = context.getAttribute(ExecutionContext.HTTP_REQ_SENT);        System.out.println(HTTP_REQ_SENT);//        Object HTTP_REQUEST =  context.getAttribute(ExecutionContext.HTTP_REQUEST);        System.out.println(HTTP_REQUEST);//获得request对象        Object HTTP_RESPONSE =  context.getAttribute(ExecutionContext.HTTP_RESPONSE);        System.out.println(HTTP_RESPONSE);//获得response对象        Object HTTP_TARGET_HOST =  context.getAttribute(ExecutionContext.HTTP_TARGET_HOST);        System.out.println(HTTP_TARGET_HOST);//目标主机host        HttpHost hh = (HttpHost)HTTP_TARGET_HOST; // 转换为HttpHost        String host = hh.getHostName();//获得主机名        System.out.println(host);        InputStream content = entity.getContent();        String contentCharSet = EntityUtils.getContentCharSet(entity);        bf = new BufferedReader(new InputStreamReader(content, contentCharSet));        String str = null;        while ((str = bf.readLine())!=null)        {            System.out.println(str);        }    } catch (Exception e) {        e.printStackTrace();    } finally {        
if(bf!=null)        {            try {                bf.close();            } catch (IOException e) {                // TODO Auto-generated catch block                e.printStackTrace();            }        }    }}}

3.爬取网页存储

public class SpiderTest2 {    /**     * 爬到网页并存储     */@Testpublic void test3() {    BufferedReader r = null;    HttpClient sender = new DefaultHttpClient();     HttpGet get = new HttpGet("http://www.btime.com/?from=ssk2");    try {        HttpResponse response = sender.execute(get); //获得response对象        HttpEntity entity = response.getEntity(); //得到请求体        InputStream in = entity.getContent();// 得到请求内容        // 直接将所拿到的流存放到文件里,转成html        IOUtils.copy(in, new FileOutputStream("F:/c.html"));//将流写入html文件    } catch (Exception e) {        e.printStackTrace();    }finally {    if (r != null) {        try {            r.close();        } catch (IOException e) {            // TODO Auto-generated catch block            e.printStackTrace();        }    }    }}}

4.模拟登录

 public class SpiderTest3 {/** * 模拟登陆 */@Testpublic void test(){    HttpClient send = new DefaultHttpClient();//先获取CLient对象    HttpContext context = new BasicHttpContext();//获取Context对象    HttpPost post = new HttpPost("http://1483104508.55555.io/blog/AdminLogin");    //通过post连接    List<NameValuePair> parms = new ArrayList<NameValuePair>();//参数集合    parms.add(new BasicNameValuePair("user", "root"));//添加参数    parms.add(new BasicNameValuePair("pwd", "root"));    BufferedReader bf = null;    try {        post.setEntity(new UrlEncodedFormEntity(parms,"utf-8"));//请求        HttpResponse response= send.execute(post,context);//可以获得消息头        HttpEntity entity = response.getEntity();//获得请求体        InputStream content = entity.getContent();//获得内容        String contentCharSet = EntityUtils.getContentCharSet(entity);//获得字符集        bf = new BufferedReader(new InputStreamReader(content, contentCharSet));        //获取BufferReader        String str = null;        while ((str = bf.readLine())!=null)        {            System.out.println(str);        }    } catch (Exception e) {        e.printStackTrace();    } finally {        if(bf!=null)        {            try {                bf.close();            } catch (IOException e) {                // TODO Auto-generated catch block                e.printStackTrace();            }        }    }}}

模拟登录后获得的是登录后页面的信息。注意:目标 POST 路径应为登录表单所提交到的 Servlet 的路径。