Java网页资源抓取例子(使用第三方包Jsoup解析HTML)

来源:互联网 发布:红色管弦乐 知乎 编辑:程序博客网 时间:2024/05/17 23:38

直接上代码:

import java.io.IOException;import java.util.ArrayList;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class Test {public static void main(String[] args) throws IOException {ArrayList<String> imageUrlBuilder = new ArrayList<String>();ArrayList<String> httpUrlBuilder = new ArrayList<String>();httpUrlBuilder.add("http://news.baidu.com/");int i = 0;String url = null;while((url = httpUrlBuilder.get(i))!= null){try{Document doc = Jsoup.connect(url).get();System.out.println("==============当前url"+url+"下有图片链接===============");Elements imgLinks = doc.getElementsByTag("img");for (Element link : imgLinks) {  String linkHref = link.attr("src");    if(linkHref.startsWith("http")){  imageUrlBuilder.add(linkHref);  System.out.println(linkHref);  }}Elements links = doc.getElementsByTag("a");for (Element link : links) {  String linkHref = link.attr("href");  //如果数组中含有此链接字符串就不添加  if(linkHref.startsWith("http")&&!httpUrlBuilder.contains(linkHref)){  httpUrlBuilder.add(linkHref);  }}}catch(Exception e){continue;}System.out.println();System.out.println("httpUrl数目"+httpUrlBuilder.size());i++;}}}
第三方包下载地址:https://jsoup.org/download


0 0
原创粉丝点击