Java爬虫(二)--解析一个HTML字符串

来源:互联网 发布:antlr sql解析 github 编辑:程序博客网 时间:2024/06/03 18:58

使用 Jsoup 的静态方法 Jsoup.parse(String html) 把 html 字符串解析为一个新的 Document 文档,然后通过 Document 自身的方法,或者其父类 Element 和 Node 中的方法,取得相关的数据。

下面以获取
http://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient/4.5.3 这个页面中所有的 table 为例子。

1、原网页界面:

这里写图片描述

2、源码:

package com.makerspace.html.jsoup;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Demonstrates fetching a web page with Apache HttpClient and extracting
 * every {@code <table>} element from it with Jsoup.
 *
 * <p>Company: www.makerspace.com
 *
 * @author guanqin_li
 * @since 2017-08-25
 */
public class JsoupTest {

    /** Page whose tables are printed by {@link #Test()}. */
    private static final String PAGE_URL =
            "http://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient/4.5.3";

    /** Line printed before each table to visually separate the results. */
    private static final String SEPARATOR =
            "===================================================";

    /**
     * Downloads {@link #PAGE_URL}, parses the body as HTML and prints every
     * {@code <table>} element to standard output, each preceded by a
     * separator line.
     *
     * <p>If the response has no body, or the body is blank, the message
     * {@code "html str is blank !!"} is printed instead. Any failure is
     * reported via its stack trace and the method returns normally.
     */
    public void Test() {
        HttpGet get = new HttpGet(PAGE_URL);

        // HttpClientBuilder replaces the deprecated DefaultHttpClient. Both the
        // client and the response hold a pooled connection, so they are closed
        // via try-with-resources.
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
                CloseableHttpResponse resp = httpClient.execute(get)) {

            // getEntity() may be null when the response carries no body; treat
            // that the same as blank content.
            HttpEntity entity = resp.getEntity();
            String html = (entity == null) ? "" : convertStreamToString(entity.getContent());

            if (StringUtil.isBlank(html)) {
                System.out.println("html str is blank !!");
                return;
            }

            // Parse the HTML string into a new Document, then select by tag name.
            Document doc = Jsoup.parse(html);
            Elements elements = doc.getElementsByTag("table");
            for (Element element : elements) {
                System.out.println(SEPARATOR);
                System.out.println(element.toString());
            }
        } catch (Exception e) {
            // Demo entry point: report the failure and return instead of crashing.
            e.printStackTrace();
        }
    }

    /**
     * Reads the whole input stream as UTF-8 text and returns it as a string.
     *
     * <p>Every line read is terminated with a single {@code '\n'} in the
     * result, regardless of the line terminator used in the input. The stream
     * is always closed before this method returns. If an {@link IOException}
     * occurs while reading, its stack trace is printed and the text read so
     * far is returned.
     *
     * @param is the stream to read; {@code null} yields an empty string
     * @return the decoded text, never {@code null}
     * @author guanqin_li
     */
    public static String convertStreamToString(InputStream is) {
        if (is == null) {
            return "";
        }

        StringBuilder sb = new StringBuilder();
        // An explicit charset avoids depending on the platform default;
        // closing the reader also closes the underlying stream.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } catch (IOException e) {
            // Best effort: keep whatever was read before the failure.
            e.printStackTrace();
        }
        return sb.toString();
    }

    public static void main(String[] args) {
        new JsoupTest().Test();
    }
}

3、获取该页面所有 table 的结果:

===================================================<table class="grid" width="100%"> <tbody>  <tr>   <th>License</th>   <td><span class="b lic">Apache 2.0</span></td>  </tr>  <tr>   <th>Categories</th>   <td><a href="/open-source/http-clients" class="b c">HTTP Clients</a></td>  </tr>  <tr>   <th>HomePage</th>   <td><a href="http://hc.apache.org/httpcomponents-client" rel="nofollow"> http://hc.apache.org/httpcomponents-client </a></td>  </tr>  <tr>   <th>Date</th>   <td>(Jan 21, 2017) </td>  </tr>  <tr>   <th style="width: 12em;">Files</th>   <td><a class="vbtn" href="http://central.maven.org/maven2/org/apache/httpcomponents/httpclient/4.5.3/httpclient-4.5.3.jar">Download (JAR)</a><span style="color: black; font-weight: bold;"> (696 KB) </span></td>  </tr>  <tr>   <th style="width: 12em;">Repositories</th>   <td><a class="b lic" href="/repos/central">Central</a></td>  </tr>  <tr>   <th>Used By</th>   <td><a href="/artifact/org.apache.httpcomponents/httpclient/usages"><b>5,584 artifacts</b></a></td>  </tr> </tbody></table>===================================================<table class="grid" style="vertical-align: middle" width="100%"> <thead>  <tr>   <th style="width: 12em; text-align: center">Category/License</th>   <th style="width: 28px"></th>   <th> Group / Artifact</th>   <th style="text-align: center">Version</th>   <th style="text-align: center">Updates</th>  </tr> </thead> <tbody>  <tr>   <td style="text-align: right"><a href="/open-source/base64-libraries" class="b c">Base64</a><br><span class="b lic">Apache 2.0</span></td>   <td>    <picture>     <source type="image/webp" srcset="https://d2j3q9yua85jt3.cloudfront.net/img/a117c37ecc0ed0750c48bd4755638e06">     <img class="im-logo" width="28" height="28" src="https://d2j3q9yua85jt3.cloudfront.net/img/c44e3998569145e628d7d13a288ba5a">    </picture></td>   <td><a href="/artifact/commons-codec">commons-codec</a> » <a href="/artifact/commons-codec/commons-codec">commons-codec</a></td>   <td style="text-align: 
center"><a class="vbtn release" href="/artifact/commons-codec/commons-codec/1.9"> 1.9</a></td>   <td style="text-align: center"><a class="vbtn release" href="/artifact/commons-codec/commons-codec/1.10">1.10</a></td>  </tr>  <tr>   <td style="text-align: right"><a href="/open-source/logging-frameworks" class="b c">Logging</a><br><span class="b lic">Apache 2.0</span></td>   <td>    <picture>     <source type="image/webp" srcset="https://d2j3q9yua85jt3.cloudfront.net/img/a117c37ecc0ed0750c48bd4755638e06">     <img class="im-logo" width="28" height="28" src="https://d2j3q9yua85jt3.cloudfront.net/img/bb2b8886770c45c0e0d07cc97851a6f1">    </picture></td>   <td><a href="/artifact/commons-logging">commons-logging</a> » <a href="/artifact/commons-logging/commons-logging">commons-logging</a></td>   <td style="text-align: center"><a class="vbtn release" href="/artifact/commons-logging/commons-logging/1.2"> 1.2</a></td>   <td style="text-align: center"></td>  </tr>  <tr>   <td style="text-align: right"><a href="/open-source/http-clients" class="b c">HTTP Clients</a><br><span class="b lic">Apache 2.0</span></td>   <td>    <picture>     <source type="image/webp" srcset="https://d2j3q9yua85jt3.cloudfront.net/img/d0a90db48344c7126a6b54f1f3089347">     <img class="im-logo" width="28" height="28" src="https://d2j3q9yua85jt3.cloudfront.net/img/e722b6263952f18e235cd8e0985546a1">    </picture></td>   <td><a href="/artifact/org.apache.httpcomponents">org.apache.httpcomponents</a> » <a href="/artifact/org.apache.httpcomponents/httpcore">httpcore</a></td>   <td style="text-align: center"><a class="vbtn release" href="/artifact/org.apache.httpcomponents/httpcore/4.4.6"> 4.4.6</a></td>   <td style="text-align: center"></td>  </tr> </tbody></table>===================================================<table class="grid" style="vertical-align: middle" width="100%"> <thead>  <tr>   <th style="width: 12em; text-align: center">Category/License</th>   <th style="width: 28px"></th>   <th> Group / 
Artifact</th>   <th style="text-align: center">Version</th>   <th style="text-align: center">Updates</th>  </tr> </thead> <tbody>  <tr>   <td style="text-align: right"><a href="/open-source/testing-frameworks" class="b c">Testing</a><br><span class="b lic">EPL 1.0</span></td>   <td>    <picture>     <source type="image/webp" srcset="https://d2j3q9yua85jt3.cloudfront.net/img/d0f73ecf7a2ecdc8f1082e67dbb5617d">     <img class="im-logo" width="28" height="28" src="https://d2j3q9yua85jt3.cloudfront.net/img/7cb2d4617d97415f562bd5711c429a95">    </picture></td>   <td><a href="/artifact/junit">junit</a> » <a href="/artifact/junit/junit">junit</a></td>   <td style="text-align: center"><a class="vbtn release" href="/artifact/junit/junit/4.11"> 4.11</a></td>   <td style="text-align: center"><a class="vbtn release" href="/artifact/junit/junit/4.12">4.12</a></td>  </tr>  <tr>   <td style="text-align: right"><a href="/open-source/mocking" class="b c">Mocking</a><br><span class="b lic">MIT</span></td>   <td>    <picture>     <source type="image/webp" srcset="https://d2j3q9yua85jt3.cloudfront.net/img/883ab1fd45b7243b298b18790e70cf3e">     <img class="im-logo" width="28" height="28" src="https://d2j3q9yua85jt3.cloudfront.net/img/ab8655889ce452e2fa4713aa13b8428f">    </picture></td>   <td><a href="/artifact/org.mockito">org.mockito</a> » <a href="/artifact/org.mockito/mockito-core">mockito-core</a></td>   <td style="text-align: center"><a class="vbtn release" href="/artifact/org.mockito/mockito-core/1.8.5"> 1.8.5</a></td>   <td style="text-align: center"><a class="vbtn release" href="/artifact/org.mockito/mockito-core/2.9.0">2.9.0</a></td>  </tr> </tbody></table>===================================================<table class="grid" width="100%"> <thead>  <tr>   <th style="width: 16em;">License</th>   <th>URL</th>  </tr> </thead> <tbody>  <tr>   <td>The Apache Software License, Version 2.0</td>   <td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt" rel="nofollow"> 
http://www.apache.org/licenses/LICENSE-2.0.txt </a></td>  </tr>  <tr>   <td>Apache License, Version 2.0</td>   <td><a href="http://www.apache.org/licenses/LICENSE-2.0.txt" rel="nofollow"> http://www.apache.org/licenses/LICENSE-2.0.txt </a></td>  </tr> </tbody></table>===================================================<table class="grid" width="100%"> <thead>  <tr>   <th style="width: 16em;">Name</th>   <th>Email</th>   <th>Dev Id</th>   <th>Roles</th>   <th>Organization</th>  </tr> </thead> <tbody>  <tr>   <td>Ortwin Glueck</td>   <td> oglueck -at- apache.org</td>   <td>oglueck</td>   <td>Emeritus PMC</td>   <td></td>  </tr>  <tr>   <td>Oleg Kalnichevski</td>   <td> olegk -at- apache.org</td>   <td>olegk</td>   <td>Committer, PMC</td>   <td></td>  </tr>  <tr>   <td>Asankha C. Perera</td>   <td> asankha -at- apache.org</td>   <td>asankha</td>   <td>Committer, PMC Chair</td>   <td></td>  </tr>  <tr>   <td>Sebastian Bazley</td>   <td> sebb -at- apache.org</td>   <td>sebb</td>   <td>Committer, PMC</td>   <td></td>  </tr>  <tr>   <td>Erik Abele</td>   <td> erikabele -at- apache.org</td>   <td>erikabele</td>   <td>Committer, PMC</td>   <td></td>  </tr>  <tr>   <td>Ant Elder</td>   <td> antelder -at- apache.org</td>   <td>antelder</td>   <td>Committer, PMC</td>   <td></td>  </tr>  <tr>   <td>Paul Fremantle</td>   <td> pzf -at- apache.org</td>   <td>pzf</td>   <td>Committer, PMC</td>   <td></td>  </tr>  <tr>   <td>Roland Weber</td>   <td> rolandw -at- apache.org</td>   <td>rolandw</td>   <td>Emeritus PMC</td>   <td></td>  </tr>  <tr>   <td>Sam Berlin</td>   <td> sberlin -at- apache.org</td>   <td>sberlin</td>   <td>Committer</td>   <td></td>  </tr>  <tr>   <td>Sean C. 
Sullivan</td>   <td> sullis -at- apache.org</td>   <td>sullis</td>   <td>Committer</td>   <td></td>  </tr>  <tr>   <td>Jonathan Moore</td>   <td> jonm -at- apache.org</td>   <td>jonm</td>   <td>Committer, PMC</td>   <td></td>  </tr>  <tr>   <td>Gary Gregory</td>   <td> ggregory -at- apache.org</td>   <td>ggregory</td>   <td>Committer</td>   <td></td>  </tr>  <tr>   <td>William Speirs</td>   <td> wspeirs at apache.org</td>   <td>wspeirs</td>   <td>Committer</td>   <td></td>  </tr>  <tr>   <td>Karl Wright</td>   <td> kwright -at- apache.org</td>   <td>kwright</td>   <td>Committer</td>   <td></td>  </tr>  <tr>   <td>Francois-Xavier Bonnet</td>   <td> fx -at- apache.org</td>   <td>fx</td>   <td>Committer</td>   <td></td>  </tr> </tbody></table>
原创粉丝点击