【网络爬虫】Java 使用 Socket、HttpURLConnection 方式抓取数据

来源:互联网 发布:摄影网络销售技巧 编辑:程序博客网 时间:2024/05/17 04:04

结构:

(此处原有工程结构示意图,图片未能随文本导出)

公共方法

  • url任务的pojo类
import com.tsj.simple.enumeration.TaskLevel;

// Fix: the original snippet used these java.net types without importing them.
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

/**
 * POJO describing one crawl task: the target URL plus its scheduling priority.
 *
 * @author tsj-pc
 */
public class UrlPojo {

    private String url;
    // Default priority when the single-argument constructor is used.
    private TaskLevel taskLevel = TaskLevel.MIDDLE;

    public UrlPojo(String url) {
        this.url = url;
    }

    public UrlPojo(String url, TaskLevel taskLevel) {
        this.url = url;
        this.taskLevel = taskLevel;
    }

    @Override
    public String toString() {
        return "UrlPojo [taskLevel=" + taskLevel + ", url=" + url + "]";
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public TaskLevel getTaskLevel() {
        return taskLevel;
    }

    public void setTaskLevel(TaskLevel taskLevel) {
        this.taskLevel = taskLevel;
    }

    /**
     * Opens an HTTP connection to this task's URL.
     *
     * @return the connection, or null when the URL is malformed or is not
     *         served over an HTTP-based protocol
     */
    public HttpURLConnection getConnection() {
        try {
            URL url = new URL(this.url);
            URLConnection connection = url.openConnection();
            if (connection instanceof HttpURLConnection) {
                return (HttpURLConnection) connection;
            }
            // Fix: was `throw new Exception("connection is errr!")` — garbled
            // message and an over-broad raw Exception type.
            throw new IllegalStateException("not an HTTP connection: " + this.url);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * @return the host part of this task's URL, or null when the URL is malformed
     */
    public String getHost() {
        try {
            URL url = new URL(this.url);
            return url.getHost();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
  • 抓取结果的封装类
package com.tsj.simple.pojos;

/**
 * Holder for the outcome of one crawl: whether it succeeded, the raw page
 * text, and the HTTP status code of the response.
 *
 * @author tsj-pc
 */
public class CrawlResultPojo {

    // True when the page was fetched and read without error.
    private boolean isSuccess;
    // Raw response text; null when the crawl failed.
    private String pageContent;
    // HTTP status code of the response (note: original spelling kept for callers).
    private int httpStatuCode;

    public boolean isSuccess() {
        return isSuccess;
    }

    public void setSuccess(boolean isSuccess) {
        this.isSuccess = isSuccess;
    }

    public String getPageContent() {
        return pageContent;
    }

    public void setPageContent(String pageContent) {
        this.pageContent = pageContent;
    }

    public int getHttpStatuCode() {
        return httpStatuCode;
    }

    public void setHttpStatuCode(int httpStatuCode) {
        this.httpStatuCode = httpStatuCode;
    }
}
  • 枚举类
/**
 * Priority levels for a crawl task; MIDDLE is the default priority.
 */
public enum TaskLevel {
    HIGH,
    MIDDLE,
    LOW
}
  • 接口类
/**
 * Contract for a page fetcher: given a crawl task, fetch the page and
 * report the outcome.
 */
public interface ICrawler {

    /**
     * Fetches the page addressed by the given task.
     *
     * @param urlPojo the task describing what to fetch
     * @return the crawl outcome (success flag plus page content)
     */
    CrawlResultPojo crawl(UrlPojo urlPojo);
}
  • 实现类
    (1)Socket方式抓取数据
public class SocketCrawlerImpl  implements ICrawler  {    @Override    public CrawlResultPojo crawl(UrlPojo urlPojo) {        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();        if (urlPojo == null || urlPojo.getUrl() == null) {            crawlResultPojo.setSuccess(false);            crawlResultPojo.setPageContent(null);            return crawlResultPojo;        }        //抓取的ip 端口号        String host = urlPojo.getHost();        if (host == null) {            crawlResultPojo.setSuccess(false);            crawlResultPojo.setPageContent(null);            return crawlResultPojo;        }        //为了提高写入的效率,使用了字符流的缓冲区。        //BufferedReader 缓冲方式文本读取        BufferedWriter bw = null;        BufferedReader br = null;        try {            Socket socket = new Socket(host, 80);            //设置过期时间            socket.setKeepAlive(true);            // socket.setSoTimeout(1000);            bw = new BufferedWriter(new OutputStreamWriter(socket                    .getOutputStream()));            //协议地址            //bw.write("GET " + urlPojo.getUrl() + " HTTP/1.0\r\n");            bw.write("GET " + urlPojo.getUrl() + " HTTP/1.1\r\n");            bw.write("HOST:" + host + "\r\n");            bw.write("\r\n");// 在行的结束符\r\n之前没有任何数据,说明这时候http head输出给服务器端完成            bw.flush();// 清空缓存流            br = new BufferedReader(new InputStreamReader(socket                    .getInputStream()));            StringBuilder stringBuilder = new StringBuilder();            String line = null;            while ((line = br.readLine()) != null) {                // System.out.println(line);                stringBuilder.append(line + "\n");            }            crawlResultPojo.setSuccess(true);            crawlResultPojo.setPageContent(stringBuilder.toString());            return crawlResultPojo;        } catch (Exception e) {            e.printStackTrace();        } finally {            try {                if (bw != null) {                    bw.close();                }           
     if (br != null) {                    br.close();                }            } catch (Exception e) {                e.printStackTrace();                System.out.println("流最终未关闭,请检查!");            }        }        return null;    }    public static void main(String[] args) {        SocketCrawlerImpl socketCrawlerImpl = new SocketCrawlerImpl();        UrlPojo urlPojo = new UrlPojo("http://www.baidu.com");        // UrlPojo urlPojo = new UrlPojo("http://www.qq.com");        // UrlPojo urlPojo = new UrlPojo(        // "http://www.hao123.com/?tn=97961594_hao_pg");        CrawlResultPojo crawlResultPojo=socketCrawlerImpl.crawl(urlPojo);        System.out.println(crawlResultPojo.getPageContent());               System.out.println("done!");    }}

(2) HttpUrlConnection方式抓取数据

public class HttpUrlConnectionCrawlerImpl implements ICrawler {    @Override    public CrawlResultPojo crawl(UrlPojo urlPojo) {        CrawlResultPojo crawlResultPojo=new CrawlResultPojo();        StringBuilder stringBulider=new StringBuilder();        if (urlPojo ==null || urlPojo.getUrl()==null) {            crawlResultPojo.setSuccess(false);            crawlResultPojo.setPageContent(null);        }        HttpURLConnection httpUrlConnection=urlPojo.getConnection();        BufferedReader br=null;        String line=null;        try {            br=new BufferedReader(new InputStreamReader(httpUrlConnection.getInputStream(),"utf-8"));            //br=new BufferedReader(new InputStreamReader(httpUrlConnection.getInputStream(),"gb2312 "));            while ((line = br.readLine())!=null) {                //System.out.print(line);                               stringBulider.append(line + "\n");            }            crawlResultPojo.setSuccess(true);            crawlResultPojo.setPageContent(stringBulider.toString());        } catch (Exception e) {            e.printStackTrace();        }finally{            try {                if (br!=null) {                    br.close();                                 }            } catch (Exception e) {                e.printStackTrace();                System.out.println("流最终为关闭");            }        }        return crawlResultPojo;    }    public static void main(String[] args) {        HttpUrlConnectionCrawlerImpl httpUrlConnectionCrawlerImpl = new HttpUrlConnectionCrawlerImpl();        UrlPojo urlPojo = new UrlPojo("http://www.baidu.com");        // UrlPojo urlPojo = new UrlPojo("http://www.qq.com");        // UrlPojo urlPojo = new UrlPojo(        // "http://www.hao123.com/?tn=97961594_hao_pg");        CrawlResultPojo crawlResultPojo=httpUrlConnectionCrawlerImpl.crawl(urlPojo);        System.out.println(crawlResultPojo.getPageContent());               System.out.println("done!");    }}
  • 包含业务逻辑的抓取管理器
    整合两种方法
/**
 * Facade that hides which crawler implementation is in use; the choice is
 * made once at construction time.
 */
public class CrawlerManager {

    private ICrawler crawler;

    /**
     * @param isSocket true selects the raw-socket crawler, false the
     *                 HttpURLConnection-based one
     */
    public CrawlerManager(boolean isSocket) {
        this.crawler = isSocket ? new SocketCrawlerImpl()
                                : new HttpUrlConnectionCrawlerImpl();
    }

    /** Delegates the fetch to the configured implementation. */
    public CrawlResultPojo crawl(UrlPojo urlPojo) {
        return this.crawler.crawl(urlPojo);
    }

    public static void main(String[] args) {
        CrawlerManager crawlerManager = new CrawlerManager(false);
        UrlPojo urlPojo = new UrlPojo("http://www.qq.com");
        CrawlResultPojo crawlResultPojo = crawlerManager.crawl(urlPojo);
        System.out.println("CrawlResultPojo---" + crawlResultPojo.getPageContent());
    }
}

结果:
(此处原有运行结果截图,图片未能随文本导出)

0 0
原创粉丝点击