Jsoup学习

来源:互联网 发布:java 工作流设计器 编辑:程序博客网 时间:2024/05/17 21:51

Jsoup

jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

主要功能:
1. 从一个URL,文件或字符串中解析HTML;
2. 使用DOM或CSS选择器来查找、取出数据;
3. 可操作HTML元素、属性、文本。

        /**         * 根据属性选择         */        public class JsoupDemo02 {            public static void main(String[] args) throws Exception{                //新建HttpClient                CloseableHttpClient httpClient = HttpClients.createDefault();                HttpGet httpGet = new HttpGet("https://www.cnblogs.com/");                //返回响应                CloseableHttpResponse response = null;                //执行请求                try {                    response = httpClient.execute(httpGet);                    //获取实体                    HttpEntity entity = response.getEntity();                    //获取内容                    String content = EntityUtils.toString(entity,"utf-8");                    //获取Dom元素                    Document document = Jsoup.parse(content);                    //获取tag为标题的元素                    Elements elementsByTag = document.getElementsByTag("title");                    //打印标题内容                    System.out.println("网页第一个标题: "+elementsByTag.get(0).text());                    //获取网页id为site_nav_top的文档对象                    Element elementById = document.getElementById("site_nav_top");                    //打印该id对应的内容                    System.out.println("id为site_nav_top的内容:"+elementById.html());                    //获取class为post_item的元素                    Elements elementsByClass = document.getElementsByClass("post_item");                    for (Element element : elementsByClass) {                        System.out.println("class属性为post_item: " + element.html());                    }                    //获取属性是height的元素                    Elements elementsByAttribute = document.getElementsByAttribute("height");                    for (Element element : elementsByAttribute) {                        System.out.println("属性为height: " + element.toString());                    }                    //获取属性target的值为_blank的元素                    Elements elementsByAttributeValue = document.getElementsByAttributeValue("target", "_blank");               
     for (Element element : elementsByAttributeValue) {                        System.out.println("属性target的值为_blank: "+ element.toString());                    }                } catch (ClientProtocolException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                } finally {                    //关闭资源                    if (response != null) {                        try {                            response.close();                        } catch (IOException e) {                            e.printStackTrace();                        }                    }                }                //关闭资源                httpClient.close();            }        }        /**         * 根据选择器选择         */        public class JsoupDemo03 {            public static void main(String[] args) throws Exception{                //新建HttpClient                CloseableHttpClient httpClient = HttpClients.createDefault();                HttpGet httpGet = new HttpGet("https://www.cnblogs.com/");                //返回响应                CloseableHttpResponse response = null;                //执行请求                try {                    response = httpClient.execute(httpGet);                    //获取实体                    HttpEntity entity = response.getEntity();                    //获取内容                    String content = EntityUtils.toString(entity,"utf-8");                    //获取Dom元素                    Document document = Jsoup.parse(content);                    //使用id选择器                    Elements selectElements = document.select("#post_list .post_item .post_item_body h3 a");                    for (Element element : selectElements) {                        System.out.println("标题: "+element.text());                        System.out.println("博客地址: " + element.attr("href"));                        System.out.println("target: " + element.attr("target"));                    }                    
System.out.println("------------");                    //选择带有href属性的a标签元素                    Elements hrefElements = document.select("a[href]");                    for (Element element : hrefElements) {                        System.out.println("a标签的链接: "+element.toString());                    }                    //选择扩展名为.png的图片                    Elements imgElements = document.select("img[src$=.png]");                    for (Element element : imgElements) {                        System.out.println("图片: " + element.toString() );                    }                    //获取第一个标题                    Element title = document.getElementsByTag("title").first();                    System.out.println("网页标题: " + title.text());                } catch (ClientProtocolException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                } finally {                    //关闭资源                    if (response != null) {                        try {                            response.close();                        } catch (IOException e) {                            e.printStackTrace();                        }                    }                }                //关闭资源                httpClient.close();            }        }
0 0
原创粉丝点击