httpclient解析网页,htmlparser获取指定元素

来源:互联网 发布:公众号排版软件 编辑:程序博客网 时间:2024/05/25 08:13

前几天遇到一个解析网页的问题,以前未接触过,查了好多资料,最后决定使用httpclient获取网页内容。

先把网页内容读取成字符串,然后再用htmlparser解析这个字符串形式的html,从而找到相应的元素。

首先下载:

httpclient-4.4.3.jar

httpclient-cache-4.4.3.jar

httpmime-4.4.3.jar

httpcore-4.4.2.jar

commons-logging-1.1.3.jar

htmlparser.jar

下面是我的java类:

package com.test;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Downloads a web page with Apache HttpClient and extracts the text of one
 * {@code td} element from it with HTMLParser.
 */
public class MyHttpClient {

    /** Value returned whenever the page cannot be fetched or the expected data is missing. */
    private static final String NO_RESULT = "无";

    /** Zero-based index, among all {@code td} elements of the page, of the cell that is read. */
    private static final int TARGET_TD_INDEX = 22;

    /** Number of trailing characters dropped from the second colon-separated segment. */
    private static final int TRAILING_CHARS_TO_DROP = 5;

    /** Encoding name used both for decoding the response body and for the HTML parser. */
    private static final String PAGE_CHARSET = "UTF-8";

    /**
     * Fetches {@code url}, reads the text of the {@code td} element at index
     * {@value #TARGET_TD_INDEX} and condenses it into a {@code |}-separated string.
     *
     * <p>The cell text is split on {@code ":"}. The first two space-separated tokens of
     * the second segment (after dropping its last {@value #TRAILING_CHARS_TO_DROP}
     * characters) and the first two tokens of the third segment are joined as
     * {@code a|b|c|d}. A segment with fewer than two tokens contributes an empty string.
     *
     * @param url address of the page to download
     * @return the condensed string, or {@code "无"} when the request fails, the cell is
     *         missing or empty, or the cell text contains no {@code ":"} separator
     */
    public String getContext(String url) {
        try {
            String html = fetchHtml(url);
            if (html == null) {
                return NO_RESULT;
            }
            String cellText = getTagContent(html, "td", TARGET_TD_INDEX);
            // getTagContent signals "not found" with null, so test for null before emptiness.
            if (cellText == null || cellText.isEmpty()) {
                return NO_RESULT;
            }
            String summary = summarize(cellText);
            if (summary != null) {
                return summary;
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return NO_RESULT;
    }

    /**
     * Returns the plain text of the {@code tdnumber}-th element named {@code tagName}.
     *
     * @param content  HTML document as a string
     * @param tagName  tag name to look for, e.g. {@code "td"}
     * @param tdnumber zero-based index into the list of matching elements
     * @return the element's plain text, or {@code null} when {@code content} is
     *         {@code null}, the index is out of range, or the HTML cannot be parsed
     */
    public String getTagContent(String content, String tagName, int tdnumber) {
        if (content == null) {
            return null;
        }
        Parser myParser = Parser.createParser(content, PAGE_CHARSET);
        NodeFilter tagFilter = new TagNameFilter(tagName);
        try {
            NodeList matches = myParser.parse(tagFilter);
            // Also covers the empty list: there is no element to return for any index.
            if (tdnumber < 0 || tdnumber >= matches.size()) {
                return null;
            }
            TagNode node = (TagNode) matches.elementAt(tdnumber);
            return node.toPlainTextString();
        } catch (ParserException e) {
            return null;
        }
    }

    /**
     * Downloads the body of {@code url} and returns it as one string with the line
     * terminators removed. All HTTP resources are released before returning.
     *
     * @return the page content, or {@code null} when the response carries no entity
     * @throws IOException if the request or reading the body fails
     */
    private String fetchHtml(String url) throws IOException {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        CloseableHttpResponse response = null;
        BufferedReader reader = null;
        try {
            response = httpclient.execute(new HttpGet(url));
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            InputStream inputstream = entity.getContent();
            // Decode with an explicit charset instead of the platform default.
            reader = new BufferedReader(new InputStreamReader(inputstream, PAGE_CHARSET));
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line);
            }
            return html.toString();
        } finally {
            closeQuietly(reader);
            closeQuietly(response);
            closeQuietly(httpclient);
        }
    }

    /**
     * Condenses the cell text into {@code a|b|c|d} as described on
     * {@link #getContext(String)}.
     *
     * @return the condensed string, or {@code null} when the text has fewer than two
     *         colon-separated segments
     */
    private static String summarize(String cellText) {
        String[] segments = cellText.split(":");
        if (segments.length < 2) {
            return null;
        }
        String second = segments[1];
        // Clamp at zero so a short segment yields an empty string instead of an exception.
        int keep = Math.max(0, second.length() - TRAILING_CHARS_TO_DROP);
        String third = segments.length > 2 ? segments[2] : "";
        return firstTwoTokens(second.substring(0, keep)) + "|" + firstTwoTokens(third);
    }

    /**
     * Joins the first two non-empty space-separated tokens of {@code text} with
     * {@code "|"}; returns an empty string when there are fewer than two tokens.
     */
    private static String firstTwoTokens(String text) {
        List<String> tokens = new ArrayList<String>();
        for (String token : text.split(" ")) {
            if (token.isEmpty()) {
                continue;
            }
            tokens.add(token);
            if (tokens.size() == 2) {
                return tokens.get(0) + "|" + tokens.get(1);
            }
        }
        return "";
    }

    /** Closes {@code resource} if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if releasing the resource fails.
        }
    }
}


0 0
原创粉丝点击