java爬虫之基于httpclient的简单Demo(二)

来源:互联网 发布:种子文件下载软件 编辑:程序博客网 时间:2024/05/22 15:41

延续demo1的 java爬虫的2种爬取方式(HTTP||Socket)简单Demo(一),demo2出炉啦,大家想学爬虫都可以从这个网盘学习哦:https://pan.baidu.com/s/1pJJrcqJ#list/path=%2F

免费课程,非常不错。其实还是主要学习一个httpclient,httpclient全是英文文档,看得我心累啊



package com.simple.crawImpl;

import com.simple.Icrawl.ICrawl;
import com.simple.pojos.CrawlResultPojo;
import com.simple.pojos.UrlPojo;
import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

/**
 * HttpClient-based crawler: fetches a page via HTTP GET ({@link #crawl})
 * or via HTTP POST with form parameters ({@link #crawl4Post}).
 *
 * Created by lewis on 2016/10/16.
 */
public class HttpClientCrawlerImpl implements ICrawl {

    // Shared client instance; CloseableHttpClient is safe to reuse across requests.
    public CloseableHttpClient httpClient = HttpClients.custom().build();

    /**
     * Fetches the page at the pojo's URL with an HTTP GET request.
     *
     * @param urlpojo wrapper holding the target URL; may be null
     * @return a result pojo carrying the page content and a success flag,
     *         or {@code null} when the input or its URL is null
     */
    @Override
    public CrawlResultPojo crawl(UrlPojo urlpojo) {
        // Also reject a null URL, consistent with crawl4Post; HttpGet(null) would NPE.
        if (urlpojo == null || urlpojo.getUrl() == null) {
            return null;
        }
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();        // result holder
        CloseableHttpResponse response = null;                          // full HTTP response (status line, headers, entity)
        BufferedReader br = null;
        try {
            HttpGet httpGet = new HttpGet(urlpojo.getUrl());
            response = httpClient.execute(httpGet);
            HttpEntity entity = response.getEntity();
            // Decode explicitly as UTF-8; StandardCharsets avoids UnsupportedEncodingException
            // and any dependence on the platform default charset.
            InputStreamReader isr = new InputStreamReader(entity.getContent(), StandardCharsets.UTF_8);
            br = new BufferedReader(isr);
            String line;
            StringBuilder context = new StringBuilder();
            while ((line = br.readLine()) != null) {
                context.append(line).append('\n');  // chained append avoids a temporary String per line
            }
            crawlResultPojo.setSuccess(true);
            crawlResultPojo.setPageContent(context.toString());
            return crawlResultPojo;
        } catch (IOException e) {
            e.printStackTrace();
            crawlResultPojo.setSuccess(false);
        } finally {
            try {
                if (br != null) {
                    br.close();
                }
                if (response != null) {
                    response.close();   // releases the connection back to the pool
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return crawlResultPojo;
    }

    /**
     * Fetches a page with an HTTP POST, sending the pojo's parameter map
     * as form parameters.
     *
     * @param urlPojo wrapper holding the target URL and optional parameters; may be null
     * @return a result pojo carrying the page content and a success flag,
     *         or {@code null} when the input or its URL is null
     */
    public CrawlResultPojo crawl4Post(UrlPojo urlPojo) {
        if (urlPojo == null || urlPojo.getUrl() == null) {
            return null;
        }
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        CloseableHttpResponse response = null;
        BufferedReader br = null;
        try {
            RequestBuilder rb = RequestBuilder.post().setUri(new URI(urlPojo.getUrl()));
            Map<String, Object> parasMap = urlPojo.getParasMap();
            if (parasMap != null) {
                for (Entry<String, Object> entry : parasMap.entrySet()) {
                    rb.addParameter(entry.getKey(), entry.getValue().toString());
                }
            }
            HttpUriRequest httpUriRequest = rb.build();
            // Keep a reference to the response so it can be closed in finally;
            // the original leaked it (execute(...).getEntity() never released the connection).
            response = httpClient.execute(httpUriRequest);
            HttpEntity entity = response.getEntity();
            InputStreamReader isr = new InputStreamReader(entity.getContent(), StandardCharsets.UTF_8);
            br = new BufferedReader(isr);
            String line;
            StringBuilder stringBuilder = new StringBuilder();
            while ((line = br.readLine()) != null) {
                stringBuilder.append(line).append('\n');
            }
            crawlResultPojo.setPageContent(stringBuilder.toString());
            crawlResultPojo.setSuccess(true);
            return crawlResultPojo;
        } catch (URISyntaxException e) {
            e.printStackTrace();
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (br != null) {
                    br.close();
                }
                if (response != null) {
                    response.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        crawlResultPojo.setSuccess(false);
        return crawlResultPojo;
    }

    /** Demo entry point: POSTs then GETs the same URL and prints both results. */
    public static void main(String[] args) {
        HttpClientCrawlerImpl httpClientCrawlerImpl = new HttpClientCrawlerImpl();
        String url = "http://www.wangdaizhijia.com/front_select-plat";
        UrlPojo urlPojo = new UrlPojo(url);
        Map<String, Object> parasMap = new HashMap<String, Object>();
        parasMap.put("currPage", 30);
        parasMap.put("params", "");
        parasMap.put("sort", 0);
        urlPojo.setParasMap(parasMap);
        CrawlResultPojo resultPojo = httpClientCrawlerImpl.crawl4Post(urlPojo);
        print(resultPojo);
        resultPojo = httpClientCrawlerImpl.crawl(urlPojo);
        print(resultPojo);
    }

    /** Convenience wrapper around System.out.println. */
    public static void print(Object s) {
        System.out.println(s);
    }
}


0 0
原创粉丝点击