【 网络爬虫】java 使用Socket, HttpUrlConnection方式抓取数据
来源:互联网 发布:摄影网络销售技巧 编辑:程序博客网 时间:2024/05/17 04:04
结构:
公共方法
- url任务的pojo类
import com.tsj.simple.enumeration.TaskLevel;

import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

/**
 * POJO describing a single URL crawl task: the target URL plus its
 * scheduling priority (defaults to {@link TaskLevel#MIDDLE}).
 *
 * @author tsj-pc
 */
public class UrlPojo {

    private String url;
    private TaskLevel taskLevel = TaskLevel.MIDDLE;

    /** Creates a task with the default (MIDDLE) priority. */
    public UrlPojo(String url) {
        this.url = url;
    }

    /** Creates a task with an explicit priority. */
    public UrlPojo(String url, TaskLevel taskLevel) {
        this.url = url;
        this.taskLevel = taskLevel;
    }

    @Override
    public String toString() {
        return "UrlPojo [taskLevel=" + taskLevel + ", url=" + url + "]";
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public TaskLevel getTaskLevel() {
        return taskLevel;
    }

    public void setTaskLevel(TaskLevel taskLevel) {
        this.taskLevel = taskLevel;
    }

    /**
     * Opens an HTTP connection to this task's URL.
     *
     * @return the connection, or {@code null} if the URL is malformed or is
     *         not an HTTP(S) URL — callers must null-check.
     */
    public HttpURLConnection getConnection() {
        try {
            URL target = new URL(this.url);
            URLConnection connection = target.openConnection();
            if (connection instanceof HttpURLConnection) {
                return (HttpURLConnection) connection;
            }
            // e.g. a file: or ftp: URL — not usable by the HTTP crawlers
            throw new IllegalStateException("connection is not an HttpURLConnection: " + this.url);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * @return the host part of this task's URL, or {@code null} if the URL
     *         cannot be parsed — callers must null-check.
     */
    public String getHost() {
        try {
            URL target = new URL(this.url);
            return target.getHost();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
- 抓取结果的pojo类
package com.tsj.simple.pojos;

/**
 * Encapsulates the outcome of one crawl attempt: whether it succeeded,
 * the raw page content, and the HTTP status code.
 *
 * @author tsj-pc
 */
public class CrawlResultPojo {

    private boolean isSuccess;
    private String pageContent;
    private int httpStatuCode;

    /** @return {@code true} when the page was fetched successfully */
    public boolean isSuccess() {
        return this.isSuccess;
    }

    public void setSuccess(boolean isSuccess) {
        this.isSuccess = isSuccess;
    }

    /** @return the fetched page body, or {@code null} on failure */
    public String getPageContent() {
        return this.pageContent;
    }

    public void setPageContent(String pageContent) {
        this.pageContent = pageContent;
    }

    /** @return the HTTP status code of the response */
    public int getHttpStatuCode() {
        return this.httpStatuCode;
    }

    public void setHttpStatuCode(int httpStatuCode) {
        this.httpStatuCode = httpStatuCode;
    }
}
- 枚举类
/**
 * Priority level of a crawl task.
 */
public enum TaskLevel {
    /** Fetch as soon as possible. */
    HIGH,
    /** Default priority. */
    MIDDLE,
    /** Background / best-effort fetch. */
    LOW
}
- 接口类
/**
 * Strategy interface implemented by each page-fetching technique
 * (raw socket, HttpURLConnection, ...).
 */
public interface ICrawler {

    /**
     * Fetches the page described by the given task.
     *
     * @param urlPojo the task to fetch; implementations must tolerate
     *                {@code null} input
     * @return the crawl result (success flag + page content)
     */
    CrawlResultPojo crawl(UrlPojo urlPojo);
}
- 实现类
(1)Socket方式抓取数据
/**
 * Fetches a page by hand-writing an HTTP GET request over a raw TCP socket
 * on port 80 and reading the full response (headers + body) as text.
 */
public class SocketCrawlerImpl implements ICrawler {

    /**
     * Crawls the given task over a raw socket.
     *
     * @param urlPojo the task; may be {@code null}
     * @return a result whose success flag is {@code false} (and content
     *         {@code null}) on any failure — never returns {@code null}
     */
    @Override
    public CrawlResultPojo crawl(UrlPojo urlPojo) {
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        if (urlPojo == null || urlPojo.getUrl() == null) {
            crawlResultPojo.setSuccess(false);
            crawlResultPojo.setPageContent(null);
            return crawlResultPojo;
        }
        // getHost() returns null when the URL cannot be parsed
        String host = urlPojo.getHost();
        if (host == null) {
            crawlResultPojo.setSuccess(false);
            crawlResultPojo.setPageContent(null);
            return crawlResultPojo;
        }
        // try-with-resources: the socket and both streams are always closed.
        // (The original closed only the streams and leaked the Socket.)
        try (Socket socket = new Socket(host, 80);
             BufferedWriter bw = new BufferedWriter(
                     new OutputStreamWriter(socket.getOutputStream()));
             BufferedReader br = new BufferedReader(
                     new InputStreamReader(socket.getInputStream()))) {
            bw.write("GET " + urlPojo.getUrl() + " HTTP/1.1\r\n");
            bw.write("Host: " + host + "\r\n");
            // HTTP/1.1 defaults to keep-alive; without this header the server
            // may hold the connection open and readLine() below blocks forever.
            bw.write("Connection: close\r\n");
            bw.write("\r\n"); // blank line terminates the request head
            bw.flush();

            StringBuilder stringBuilder = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                stringBuilder.append(line).append("\n");
            }
            crawlResultPojo.setSuccess(true);
            crawlResultPojo.setPageContent(stringBuilder.toString());
        } catch (Exception e) {
            // The original returned null here, which made main() throw an NPE
            // on getPageContent(); report failure through the result instead.
            e.printStackTrace();
            crawlResultPojo.setSuccess(false);
            crawlResultPojo.setPageContent(null);
        }
        return crawlResultPojo;
    }

    public static void main(String[] args) {
        SocketCrawlerImpl socketCrawlerImpl = new SocketCrawlerImpl();
        UrlPojo urlPojo = new UrlPojo("http://www.baidu.com");
        CrawlResultPojo crawlResultPojo = socketCrawlerImpl.crawl(urlPojo);
        System.out.println(crawlResultPojo.getPageContent());
        System.out.println("done!");
    }
}
(2) HttpUrlConnection方式抓取数据
public class HttpUrlConnectionCrawlerImpl implements ICrawler { @Override public CrawlResultPojo crawl(UrlPojo urlPojo) { CrawlResultPojo crawlResultPojo=new CrawlResultPojo(); StringBuilder stringBulider=new StringBuilder(); if (urlPojo ==null || urlPojo.getUrl()==null) { crawlResultPojo.setSuccess(false); crawlResultPojo.setPageContent(null); } HttpURLConnection httpUrlConnection=urlPojo.getConnection(); BufferedReader br=null; String line=null; try { br=new BufferedReader(new InputStreamReader(httpUrlConnection.getInputStream(),"utf-8")); //br=new BufferedReader(new InputStreamReader(httpUrlConnection.getInputStream(),"gb2312 ")); while ((line = br.readLine())!=null) { //System.out.print(line); stringBulider.append(line + "\n"); } crawlResultPojo.setSuccess(true); crawlResultPojo.setPageContent(stringBulider.toString()); } catch (Exception e) { e.printStackTrace(); }finally{ try { if (br!=null) { br.close(); } } catch (Exception e) { e.printStackTrace(); System.out.println("流最终为关闭"); } } return crawlResultPojo; } public static void main(String[] args) { HttpUrlConnectionCrawlerImpl httpUrlConnectionCrawlerImpl = new HttpUrlConnectionCrawlerImpl(); UrlPojo urlPojo = new UrlPojo("http://www.baidu.com"); // UrlPojo urlPojo = new UrlPojo("http://www.qq.com"); // UrlPojo urlPojo = new UrlPojo( // "http://www.hao123.com/?tn=97961594_hao_pg"); CrawlResultPojo crawlResultPojo=httpUrlConnectionCrawlerImpl.crawl(urlPojo); System.out.println(crawlResultPojo.getPageContent()); System.out.println("done!"); }}
- 包含业务逻辑的抓取管理器
整合两种方法
/**
 * Facade that selects a crawl implementation (raw socket vs.
 * HttpURLConnection) at construction time and delegates to it.
 */
public class CrawlerManager {

    private ICrawler crawler;

    /**
     * @param isSocket {@code true} for the raw-socket crawler,
     *                 {@code false} for the HttpURLConnection crawler
     */
    public CrawlerManager(boolean isSocket) {
        this.crawler = isSocket
                ? new SocketCrawlerImpl()
                : new HttpUrlConnectionCrawlerImpl();
    }

    /** Delegates the crawl to the chosen implementation. */
    public CrawlResultPojo crawl(UrlPojo urlPojo) {
        return this.crawler.crawl(urlPojo);
    }

    public static void main(String[] args) {
        CrawlerManager manager = new CrawlerManager(false);
        UrlPojo task = new UrlPojo("http://www.qq.com");
        CrawlResultPojo result = manager.crawl(task);
        System.out.println("CrawlResultPojo---" + result.getPageContent());
    }
}
结果:
0 0
- 【 网络爬虫】java 使用Socket, HttpUrlConnection方式抓取数据
- 【网络爬虫】使用HttpClient4.3.5抓取数据
- java实现网络爬虫--抓取网站数据
- java 开发用到网络爬虫,抓取某某网站数据经历
- java抓取网页 --- 网络爬虫
- java爬虫实现数据抓取
- 使用HttpURLConnection和使用HttpClient方式请求网络采用get方式和post方式请求数据
- Jsoup实现网络爬虫抓取数据
- 【网络爬虫】HttpClient抓取+解析+存储数据
- HttpURLConnection抓取数据
- 【网络爬虫】【java】微博爬虫(二):如何抓取HTML页面及HttpClient使用
- HttpURLConnection GET 方式 请求网络数据
- 网络爬虫,使用NodeJs抓取RSS新闻
- java爬虫抓取网络上的图片
- java 网络爬虫之多线程抓取文件
- Java实现网络爬虫001-抓取网页
- 网络爬虫/数据抓取,反爬虫(更新版)
- 分布式爬虫:使用Scrapy抓取数据
- 秒杀多线程第八篇 经典线程同步 信号量Semaphore
- 利用TortoiseGit(小乌龟)将项目上传至GitHub网站
- 输入1个数输出其二进制表示中1的个数
- Node js 项目启动报错,错误码,events.js:141,throw er; //Unhandled 'error' event,解决办法
- 秒杀多线程第九篇 经典线程同步总结 关键段 事件 互斥量 信号量
- 【 网络爬虫】java 使用Socket, HttpUrlConnection方式抓取数据
- 最近在学泛型,头有点大
- 集合框架-Map集合功能概述
- 秒杀多线程第十篇 生产者消费者问题
- CodeForces.158A Next Round (水模拟)
- python字符串索引
- wp-auto post pro插件【自动采集】
- POJ 2186 popular cows (tarjan + 缩点)
- [LeetCode]405. Convert a Number to Hexadecimal