java jsoup解析html实例
来源:互联网 发布:科学冷知识知乎 编辑:程序博客网 时间:2024/05/16 19:30
package jsoup;import java.io.File;import java.util.HashMap;import java.util.Map;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.safety.Whitelist;import org.jsoup.select.Elements;//http://www.open-open.com/jsoup/public class TestDocument {public static void main(String[] args) throws Exception {// parseBodyFragment();// parserHTML();// parseGmail();// download();// parserFromFile();// parseLink();// visitDom();// select();// parserURL();//Cleaner();setContent();}private static void setContent() {String html = "<p>An <a href='http://example.com/'><b>example</b><div>test</div></a><span>字体</span> <li><li>link.</p>";Document doc = Jsoup.parse(html);Element div = doc.select("div").first(); // <div></div>div.html("<p>lorem ipsum</p>"); // <div><p>lorem ipsum</p></div>div.prepend("<p>First</p>");// 在div前添加html内容div.append("<p>Last</p>");// 在div之后添加html内容// 添完后的结果: <div><p>First</p><p>lorem ipsum</p><p>Last</p></div>Element span = doc.select("span").first(); // <span>One</span>span.wrap("<li><a href='http://example.com/'></a></li>");// 添完后的结果: <li><a href="http://example.com"><span>One</span></a></li>Element div2 = doc.select("li").first(); // <div></div>div2.text("five > four"); // <div>five > four</div>div2.prepend("First ");div2.append(" Last");doc.select("div.masthead").attr("title", "jsoup").addClass("round-box");System.out.println(doc);}private static void Cleaner() {String unsafe = "<p><a href='http://example.com/' onclick='stealCookies()'>Link</a></p>";String safe = Jsoup.clean(unsafe, Whitelist.basic());System.out.println(safe);// now: <p><a href="http://example.com/" rel="nofollow">Link</a></p>f}private static void parserURL() throws Exception {Document doc = Jsoup.connect("http://www.open-open.com/").get();Element link = doc.select("a").first();String relHref = link.attr("href"); // == "/"String absHref = link.attr("abs:href"); // "http://www.open-open.com/"System.out.println(relHref);System.out.println(absHref);}private static void select() {String html = "<p>An <a href='http://example.com/'><b>example</b></a> link.</p>";Document doc = Jsoup.parse(html);// 解析HTML字符串返回一个Document实现Element link = doc.select("a").first();// 查找第一个a元素String text = doc.body().text(); // "An example link"//取得字符串中的文本String linkHref = link.attr("href"); // "http://example.com/"//取得链接地址String linkText = link.text(); // "example""//取得链接地址中的文本String linkOuterH = link.outerHtml();// "<a href="http://example.com"><b>example</b></a>"String linkInnerH = link.html(); // "<b>example</b>"//取得链接内的html内容System.out.println(text);System.out.println(linkHref);System.out.println(linkText);System.out.println(linkInnerH);System.out.println(linkOuterH);}private static void visitDom() throws Exception {File input = new File("d:/login.html");Document doc = Jsoup.parse(input, "UTF-8", "http://www.baidu.com/");Element content = doc.getElementById("body");Elements links = content.getElementsByTag("a");for (Element link : links) {String linkHref = link.attr("href");String linkText = link.text();System.out.println(linkHref);System.out.println(linkText);}}private static void parseLink() {String html = "<p>An <a href='http://example.com/'><b>example</b></a> link.</p>";Document doc = Jsoup.parse(html);// 解析HTML字符串返回一个Document实现Element link = doc.select("a").first();// 查找第一个a元素String text = doc.body().text(); // "An example link"//取得字符串中的文本String linkHref = link.attr("href"); // "http://example.com/"//取得链接地址String linkText = link.text(); // "example""//取得链接地址中的文本String linkOuterH = link.outerHtml();// "<a href="http://example.com"><b>example</b></a>"String linkInnerH = link.html(); // "<b>example</b>"//取得链接内的html内容System.out.println(text);System.out.println(linkHref);System.out.println(linkText);System.out.println(linkOuterH);System.out.println(linkInnerH);}private static void parserFromFile() throws Exception {File input = new File("d:/login.html");Document doc = Jsoup.parse(input, "UTF-8", "http://www.baidu.com/");System.err.println(doc);}private static void download() throws Exception {Document doc = Jsoup.connect("http://www.baidu.com/").data("query","Java").userAgent("Mozilla").cookie("auth", "token").timeout(3000).get();System.out.println(doc);}private static void parserHTML() {String html = "<html><head><title>First parse</title></head>"+ "<body><p>Parsed HTML into a doc.</p></body></html>";Document doc = Jsoup.parse(html);System.out.println(doc);}private static void parseGmail() throws Exception {Document doc = Jsoup.connect("https://accounts.google.com/ServiceLogin").get();Element content = doc.getElementById("gaia_loginform");// System.out.println(content);Elements inputs = content.select("input[name]");// StringBuffer sb=new StringBuffer();Map<String, String> maps = new HashMap<String, String>();for (Element element : inputs) {// System.out.println(element);String name = element.attr("name");String value = element.attr("value");// System.out.println(name+"="+value);if (value != null && !"".equals(value)) {maps.put(name, value);}}// Email= Passwd=System.out.println(maps);}// 解析body片段private static void parseBodyFragment() {String html = "<div><p>Lorem ipsum.</p>";Document doc = Jsoup.parseBodyFragment(html);Element body = doc.body();System.out.println(body);}}
package jsoup;import org.jsoup.Jsoup;import org.jsoup.helper.Validate;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import java.io.IOException;//http://www.open-open.com/jsoup/public class ListLinks { public static void main(String[] args) throws IOException { Validate.isTrue(args.length == 1, "usage: supply url to fetch"); String url = args[0]; print("Fetching %s...", url); Document doc = Jsoup.connect(url).get(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); Elements imports = doc.select("link[href]"); print("\nMedia: (%d)", media.size()); for (Element src : media) { if (src.tagName().equals("img")) print(" * %s: <%s> %sx%s (%s)", src.tagName(), src.attr("abs:src"), src.attr("width"), src.attr("height"), trim(src.attr("alt"), 20)); else print(" * %s: <%s>", src.tagName(), src.attr("abs:src")); } print("\nImports: (%d)", imports.size()); for (Element link : imports) { print(" * %s <%s> (%s)", link.tagName(),link.attr("abs:href"), link.attr("rel")); } print("\nLinks: (%d)", links.size()); for (Element link : links) { print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35)); } } private static void print(String msg, Object... args) { System.out.println(String.format(msg, args)); } private static String trim(String s, int width) { if (s.length() > width) return s.substring(0, width-1) + "."; else return s; }}
0 0
- java jsoup解析html实例
- Jsoup 解析Html源码实例
- Jsoup 解析Html源码实例
- Jsoup 解析Html源码实例
- jsoup解析html文件实例
- Java HTML 解析器:jsoup
- jsoup: Java HTML 解析器
- java利用JSOUP解析html
- jsoup java html解析器
- java-jsoup-解析html文本
- Jsoup解析HTML实例(1)
- Jsoup解析HTML实例(2)
- jsoup解析HTML,爬取小说实例
- Jsoup学习笔记3:Jsoup 解析Html源码实例
- Jsoup学习笔记4:Jsoup 解析Html源码实例
- Java 的HTML 解析器-jsoup
- Java Jsoup从Html中解析歌词
- java-jsoup解析html页面的内容
- Boost::bimap
- Cairo 图形指南 (4) —— 基本绘图
- ubuntu10/12安装vmtools
- boost智能指针小结
- 兴业银行托管主代销国内首只互联网大数据基金
- java jsoup解析html实例
- 创建数据库表默认字段封装SQL
- Mdi窗体客户区无边框显示
- boost::regex
- VC++6.0不能显示MSDN解决办法
- Cairo 图形指南 (5) —— 形状与填充
- Linux(CentOS)终端类型及相互转换方式
- hdu2188悼念512汶川大地震遇难同胞——选拔志愿者
- Cairo 图形指南 (6) —— 透明