Jsoup之网络爬虫

来源:互联网 发布:杭州规划美工设计招聘 编辑:程序博客网 时间:2024/05/17 17:54

使用 Jsoup 简单地批量抓取网页图片。

package com.wh.web;

import com.wh.util.HttpUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

/**
 * Created by John on 2017/7/21.
 * A simple web crawler implemented with Jsoup: it fetches one listing page,
 * walks the matching links, and hands every thumbnail URL to {@code HttpUtil}
 * together with a target path derived from the link text.
 *
 * <p>NOTE: this class shares its simple name with {@code org.jsoup.Jsoup}, so the
 * library class has to be referenced by its fully qualified name below.
 */
public class Jsoup {

    /** Listing page that is crawled. */
    private static final String LIST_URL = "http://www.mm131.com/qingchun/";

    /** Selects the entry links of the listing, excluding the pagination row. */
    private static final String LINK_SELECTOR = ".main .list-left dd:not(.page)>a";

    /** Directory prefix that downloaded images are stored under. */
    private static final String TARGET_DIR = "H:/upload/";

    /**
     * Crawls {@link #LIST_URL} and requests a download for every entry that has an image.
     *
     * @param args unused
     * @throws IOException if the listing page cannot be fetched (and presumably if a
     *                     download fails; {@code HttpUtil} is not visible here)
     */
    public static void main(String[] args) throws IOException {
        Document document = org.jsoup.Jsoup.connect(LIST_URL).get();
        Elements elements = document.select(LINK_SELECTOR);
        for (Element ele : elements) {
            String href = ele.attr("href");
            System.out.println(href);

            // "abs:" resolves a relative src against the page URL; the result is
            // empty when the link contains no <img> or the src cannot be resolved.
            String imgSrc = ele.select("img").attr("abs:src");
            System.out.println(imgSrc);

            String title = ele.text();
            System.out.println(title);

            if (imgSrc.isEmpty()) {
                // Nothing to download for this entry.
                continue;
            }
            HttpUtil.getRequestStream(imgSrc, TARGET_DIR + toSafeFileName(title) + ".jpg");
        }
    }

    /**
     * Replaces characters that are not allowed in Windows file names
     * ({@code \ / : * ? " < > |}) with underscores so the page title can be used
     * as a file name without changing the target directory or failing on write.
     */
    private static String toSafeFileName(String title) {
        return title.trim().replaceAll("[\\\\/:*?\"<>|]", "_");
    }
}
package com.kaishengit.test;

import com.kaishengit.util.HttpUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.io.IOException;

/**
 * Crawls a few listing pages, follows each entry to its detail page and hands the
 * full-size image URL to {@code HttpUtil} for download.
 */
public class JSoupTestCase {

    /**
     * Walks listing pages 2 through 4, opens every entry's detail page and requests
     * a download of the image linked from it. Entries whose link or image link
     * cannot be determined are reported and skipped instead of aborting the run.
     *
     * @throws IOException if a page cannot be fetched (and presumably if a download
     *                     fails; {@code HttpUtil} is not visible here)
     */
    @Test
    public void testGetImage() throws IOException {
        for (int i = 2; i < 5; i++) {
            Document document = Jsoup.connect("http://www.topit.me/pop?p=" + i)
                    .cookie("is_click", "1")
                    .get();
            Elements elements = document.select("#content .catalog .e>a");
            for (Element element : elements) {
                // absUrl resolves relative links against the page URL and returns
                // an empty string when no absolute URL can be built.
                String href = element.absUrl("href");
                System.out.println("href:" + href);
                if (href.isEmpty()) {
                    continue;
                }

                Document bigImageDoc = Jsoup.connect(href).cookie("is_click", "1").get();
                // first() returns null when the selector matches nothing.
                Element imgElement = bigImageDoc.select("#content>a").first();
                if (imgElement == null) {
                    System.out.println("no image link found on " + href);
                    continue;
                }

                String imgSrc = imgElement.absUrl("href");
                if (imgSrc.isEmpty()) {
                    System.out.println("no image link found on " + href);
                    continue;
                }
                // Everything after the last '/' is used as the local file name.
                String fileName = imgSrc.substring(imgSrc.lastIndexOf("/") + 1);
                System.out.println(imgSrc);
                HttpUtil.getRequestStream(imgSrc, "D:/upload/" + fileName);
            }
        }
    }
}
原创粉丝点击