HttpClient学习笔记

来源:互联网 发布:淘宝卖家7天不发货 编辑:程序博客网 时间:2024/06/05 04:43

HttpClient 是 Apache Jakarta Commons 下的子项目,用来提供高效的、最新的、功能丰富的支持 HTTP 协议的客户端编程工具包,并且它支持 HTTP 协议最新的版本和建议。HttpClient 已经应用在很多的项目中。爬虫主要是用 HttpClient 模拟浏览器向第三方站点的 URL 发起请求,拿到响应后获取网页数据,再用 Jsoup 来提取我们需要的信息。

jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

package com.httpclient.jsoup;import java.io.IOException;import org.apache.http.HttpEntity;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.util.EntityUtils;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class JsoupDemo {    public static void main(String[] args) throws ClientProtocolException, IOException {        CloseableHttpClient HttpClient = HttpClients.createDefault();        HttpGet get = new HttpGet("http://www.xicidaili.com/");        /*HttpHost proxy = new HttpHost("182.112.228.38",80);        RequestConfig config = RequestConfig.custom().setProxy(proxy).setConnectTimeout(10000).setSocketTimeout(10000).build();        get.setConfig(config );*/        get.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0");        CloseableHttpResponse respose = HttpClient.execute(get);         System.out.println("状态:"+respose.getStatusLine().getStatusCode());        HttpEntity entity = respose.getEntity();        System.out.println("Content-Type:"+entity.getContentType().getValue());        String content = EntityUtils.toString(entity, "utf-8");        System.out.println("内容:"+content);        respose.close();        Document doc = Jsoup.parse(content);        Elements elements= doc.getElementsByTag("title");        System.out.println("网页标题:"+elements.get(0).text());        Element element2=doc.getElementById("header");        String navTop=element2.text();        System.out.println("id 为header的内容:"+navTop);        Elements postItemElements=doc.getElementsByClass("site_name"); // 根据样式名称来查询DOM        for(Element e:postItemElements){            System.out.println("根据样式名称来查询DOM"+e.html());        }        
Elements widthElements=doc.getElementsByAttribute("width"); // 根据属性名来查询DOM        for(Element e:widthElements){            System.out.println("根据属性名来查询DOM"+e.toString());        }        Elements targetElements=doc.getElementsByAttributeValue("width", "11%");  // 根据属性名和属性值来查询DOM        for(Element e:targetElements){            System.out.println("根据属性名和属性值来查询DOM"+e.toString());        }        Elements linkElements=doc.select(".subtitle .country");        for(Element e:linkElements){            System.out.println("根据class:"+e.text());        }        Elements hrefElements=doc.select("a[href]"); //         for(Element e:hrefElements){            System.out.println("带有href属性的a元素"+e.toString());        }        Elements imgElements=doc.select("img[src$=.png]"); //         for(Element e:imgElements){            System.out.println("查找扩展名为.png的图片DOM节点"+e.toString());        }    }}
原创粉丝点击