Java

来源:互联网 发布:2016淘宝刷单技巧 编辑:程序博客网 时间:2024/05/21 11:14

Maven 依赖

<!-- Maven dependencies for the example below: jsoup downloads the page, and htmlcleaner cleans the HTML and serializes it to a W3C DOM that XPath can query. NOTE(review): jaxp-api presumably supplies the javax.xml.xpath classes; current JDKs already bundle them, so confirm this entry is still needed. -->
<dependencies><dependency><groupId>org.jsoup</groupId><artifactId>jsoup</artifactId><version>1.10.2</version></dependency><dependency><groupId>javax.xml</groupId><artifactId>jaxp-api</artifactId><version>1.4.2</version></dependency><dependency><groupId>net.sourceforge.htmlcleaner</groupId><artifactId>htmlcleaner</artifactId><version>2.9</version></dependency></dependencies>
代码

import java.io.IOException;import javax.xml.xpath.XPath;import javax.xml.xpath.XPathConstants;import javax.xml.xpath.XPathFactory;import org.htmlcleaner.CleanerProperties;import org.htmlcleaner.DomSerializer;import org.htmlcleaner.HtmlCleaner;import org.htmlcleaner.TagNode;import org.jsoup.Connection;import org.jsoup.Jsoup;import org.w3c.dom.Document;import org.w3c.dom.Node;import org.w3c.dom.NodeList;/** * @author Crunchify.com * */public class CrunchifyXMLXPathParser {public static void main(String[] args) throws Exception {String url = "http://tv.youku.com/?spm=a2hww.20023042.m_223471.5~5~H2~A";String exp = "//*[@id='nav-second']/div/ul//li";String html = null;try {Connection connect = Jsoup.connect(url);html = connect.get().body().html();} catch (IOException e) {e.printStackTrace();}HtmlCleaner hc = new HtmlCleaner();TagNode tn = hc.clean(html);Document dom = new DomSerializer(new CleanerProperties()).createDOM(tn);XPath xPath = XPathFactory.newInstance().newXPath();Object result;result = xPath.evaluate(exp, dom, XPathConstants.NODESET);if (result instanceof NodeList) {NodeList nodeList = (NodeList) result;System.out.println(nodeList.getLength());for (int i = 0; i < nodeList.getLength(); i++) {Node node = nodeList.item(i);/** * Node.getTextContent() 此属性返回此节点及其后代的文本内容。 * Node.getFirstChild()  此节点的第一个子节点。 * Node.getAttributes() 包含此节点的属性的 NamedNodeMap(如果它是 Element);否则为 null * 如果想获取相应对象的相关属性,可以调用  getAttributes().getNamedItem("属性名") 方法 */System.out.println(node.getNodeValue() == null ? node.getFirstChild().getAttributes().getNamedItem("href") : node.getNodeValue());}}}}
运行结果



解析的 HTML




1 0