JavaSpider学习
来源:互联网 发布:mac上铃声软件 编辑:程序博客网 时间:2024/06/04 18:24
javaSpider
所需jar包
1. 我的第一个测试
public class SpiderTest {/** * 爬取网站的源码 */@Testpublic void test1(){ BufferedReader bf = null; try { //获得URL对象 URL url = new URL("http://1483104508.55555.io/From"); //获得对应的inputStream流 InputStream input = url.openStream(); //存入BufferReader 并设置字符集 bf = new BufferedReader(new InputStreamReader(input, "utf-8")); String str =null; //读取数据 while((str = bf.readLine())!=null) { //打印 System.out.println(str); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); }finally { //关闭BufferReader if(bf!=null) try { bf.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }}/** * 通过代理访问 */@Testpublic void test2(){ BufferedReader bf = null; try { URL url = new URL("http://1483104508.55555.io/From"); //设置代理,XX为代理服务器IP,host为端口号 Proxy proxy = new Proxy(Type.HTTP,new InetSocketAddress("XXX.XXX.XXX.XXX", host)); //通过代理获得URLConnection连接 URLConnection u=url.openConnection(proxy); //参考test1 InputStream input = u.getInputStream(); bf = new BufferedReader(new InputStreamReader(input, "utf-8")); String str =null; while((str = bf.readLine())!=null) { System.out.println(str); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); }finally { if(bf!=null) try { bf.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }}}
2.HttpClient
public class SpiderTest1 {@Testpublic void test(){ HttpClient send = new DefaultHttpClient(); send.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost("XXX.XXX.XXX.XXX", host)); //代理方式XX为ip,host为端口 HttpContext context = new BasicHttpContext(); HttpGet get = new HttpGet("http://1483104508.55555.io/From");// HttpGet get = new HttpGet("http://www.baidu.com"); BufferedReader bf = null; try {// HttpResponse response= send.execute(get);//只获取消息体 HttpResponse response= send.execute(get,context);//可以获得消息头 HttpEntity entity = response.getEntity();//获得的是请求体 Object HTTP_CONNECTION = context.getAttribute(ExecutionContext.HTTP_CONNECTION); System.out.println(HTTP_CONNECTION);//获得http连接 Object HTTP_PROXY_HOST = context.getAttribute(ExecutionContext.HTTP_PROXY_HOST); System.out.println(HTTP_PROXY_HOST);//代理主机host Object HTTP_REQ_SENT = context.getAttribute(ExecutionContext.HTTP_REQ_SENT); System.out.println(HTTP_REQ_SENT);// Object HTTP_REQUEST = context.getAttribute(ExecutionContext.HTTP_REQUEST); System.out.println(HTTP_REQUEST);//获得request对象 Object HTTP_RESPONSE = context.getAttribute(ExecutionContext.HTTP_RESPONSE); System.out.println(HTTP_RESPONSE);//获得response对象 Object HTTP_TARGET_HOST = context.getAttribute(ExecutionContext.HTTP_TARGET_HOST); System.out.println(HTTP_TARGET_HOST);//目标主机host HttpHost hh = (HttpHost)HTTP_TARGET_HOST; // 转换为HttpHost String host = hh.getHostName();//获得主机名 System.out.println(host); InputStream content = entity.getContent(); String contentCharSet = EntityUtils.getContentCharSet(entity); bf = new BufferedReader(new InputStreamReader(content, contentCharSet)); String str = null; while ((str = bf.readLine())!=null) { System.out.println(str); } } catch (Exception e) { e.printStackTrace(); } finally { if(bf!=null) { try { bf.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }}}
3.爬取网页存储
public class SpiderTest2 { /** * 爬到网页并存储 */@Testpublic void test3() { BufferedReader r = null; HttpClient sender = new DefaultHttpClient(); HttpGet get = new HttpGet("http://www.btime.com/?from=ssk2"); try { HttpResponse response = sender.execute(get); //获得response对象 HttpEntity entity = response.getEntity(); //得到请求体 InputStream in = entity.getContent();// 得到请求内容 // 直接将所拿到的流存放到文件里,转成html IOUtils.copy(in, new FileOutputStream("F:/c.html"));//将流写入html文件 } catch (Exception e) { e.printStackTrace(); }finally { if (r != null) { try { r.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }}}
4.模拟登录
public class SpiderTest3 {/** * 模拟登陆 */@Testpublic void test(){ HttpClient send = new DefaultHttpClient();//先获取CLient对象 HttpContext context = new BasicHttpContext();//获取Context对象 HttpPost post = new HttpPost("http://1483104508.55555.io/blog/AdminLogin"); //通过post连接 List<NameValuePair> parms = new ArrayList<NameValuePair>();//参数集合 parms.add(new BasicNameValuePair("user", "root"));//添加参数 parms.add(new BasicNameValuePair("pwd", "root")); BufferedReader bf = null; try { post.setEntity(new UrlEncodedFormEntity(parms,"utf-8"));//请求 HttpResponse response= send.execute(post,context);//可以获得消息头 HttpEntity entity = response.getEntity();//获得请求体 InputStream content = entity.getContent();//获得内容 String contentCharSet = EntityUtils.getContentCharSet(entity);//获得字符集 bf = new BufferedReader(new InputStreamReader(content, contentCharSet)); //获取BufferReader String str = null; while ((str = bf.readLine())!=null) { System.out.println(str); } } catch (Exception e) { e.printStackTrace(); } finally { if(bf!=null) { try { bf.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }}}
模拟登录后获得的是登录后页面的信息。注意:目标 POST 路径应为登录表单所提交到的 Servlet 的路径。
阅读全文
1 0
- JavaSpider学习
- 关于javaspider
- 学习
- 学习
- 学习
- 学习
- 学习
- 学习
- 学习
- 学习
- 学习
- 学习
- 学习
- 学习
- 学习
- 学习
- 学习
- 学习
- UVA 208 Firetruck
- Re:从零开始的Spring Security Oauth2(三)
- 人脸识别demo
- HDU 2243 AC自动机+矩阵快速幂
- LESS 学习demo
- JavaSpider学习
- VS2015动态库编程提高篇之远程注入资源汇总
- javaseday15(String)
- 新萝卜家园windows xp 安装之后fonts中字体在 word ps中不显示
- Android Studio 解决resolve dependencies 'classpath'卡住
- 炎炎夏日,清凉办公
- 生成树协议配置与管理STP——2
- 设计模式-简单工厂模式/静态工厂方法(Static Factory Method)
- UVA1336FixingTheGreatWall