利用Jsoup爬取网站的图片，保存到本地

来源：互联网　发布：河北工业大学知乎　编辑：程序博客网　时间：2024/06/05 04:16

jsoup 是一款Java 的HTML解析器，可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API，可通过DOM，CSS以及类似于jQuery的操作方法来取出和操作数据。

学习的过程中可能会用到其他的API，在此附上链接：http://www.open-open.com/jsoup/parsing-a-document.htm 。本文参考了：http://blog.csdn.net/withiter/article/details/15339579 和 http://blog.csdn.net/csh159/article/details/7310009 。欢迎小伙伴们一起来学习、讨论。

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Minimal image crawler demo: fetches an HTML page with jsoup, selects every
 * {@code <img>} whose {@code src} ends in {@code .jpg}, and downloads each one
 * into a local directory as {@code haha<N>.jpg}.
 */
public class JsoupTest {

    /** Directory the downloaded images are written to. */
    private static final String OUTPUT_DIR = "f://img";

    /** Connect/read timeout in milliseconds, so a stalled server cannot hang the crawl. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Size of the stream copy buffer in bytes. */
    private static final int BUFFER_SIZE = 1024;

    /** Number of download attempts so far; gives every saved file a distinct name. */
    private static int count = 0;

    public static void main(String[] args) throws IOException {
        JsoupTest jsoupTest = new JsoupTest();
        String url = "http://tieba.baidu.com/p/4549504175";
        // 1. Simple use of jsoup
        jsoupTest.getHtmlElements(url);
    }

    /**
     * Downloads one image over HTTP and stores it as {@code haha<N>.jpg} inside
     * {@link #OUTPUT_DIR}, where {@code N} is the running attempt counter.
     *
     * <p>Failures are reported on {@code System.err} and never propagate, so one
     * bad image does not stop the remaining downloads.
     *
     * @param destUrl absolute URL of the image; {@code null} or empty is skipped
     */
    public void saveToFile(String destUrl) {
        // Claim the sequence number up front: every attempt advances the counter,
        // whether or not the download succeeds.
        final int index = count++;

        if (destUrl == null || destUrl.length() == 0) {
            System.err.println("Skipping image: empty URL");
            return;
        }

        HttpURLConnection httpUrl = null;
        BufferedInputStream bis = null;
        FileOutputStream fos = null;
        try {
            URLConnection connection = new URL(destUrl).openConnection();
            // Check the type explicitly instead of relying on a ClassCastException.
            if (!(connection instanceof HttpURLConnection)) {
                System.err.println("Skipping non-HTTP image URL: " + destUrl);
                return;
            }
            httpUrl = (HttpURLConnection) connection;
            httpUrl.setConnectTimeout(TIMEOUT_MILLIS);
            httpUrl.setReadTimeout(TIMEOUT_MILLIS);
            httpUrl.connect();
            bis = new BufferedInputStream(httpUrl.getInputStream());

            System.out.println(imageName(destUrl));

            File dir = new File(OUTPUT_DIR);
            if (!dir.isDirectory() && !dir.mkdirs()) {
                System.err.println("Cannot create output directory " + dir.getAbsolutePath());
                return;
            }
            // Build the file from the directory so both always refer to the same location.
            File file = new File(dir, "haha" + index + ".jpg");
            System.out.println(file.getAbsolutePath());

            fos = new FileOutputStream(file);
            byte[] buf = new byte[BUFFER_SIZE];
            int size;
            while ((size = bis.read(buf)) != -1) {
                fos.write(buf, 0, size);
            }
            fos.flush();
        } catch (IOException e) {
            System.err.println("Failed to download " + destUrl + ": " + e);
        } finally {
            // Each resource is released independently, so a failure (or a null)
            // on one cannot prevent the others from being cleaned up.
            safeClose(fos);
            safeClose(bis);
            if (httpUrl != null) {
                httpUrl.disconnect();
            }
        }
    }

    /**
     * Returns the URL without its scheme prefix and without its file extension,
     * for logging only. Never throws for URLs that lack a scheme or a dot.
     */
    private static String imageName(String destUrl) {
        int schemeEnd = destUrl.indexOf("://");
        int start = schemeEnd >= 0 ? schemeEnd + 3 : 0;
        int dot = destUrl.lastIndexOf('.');
        int end = dot >= start ? dot : destUrl.length();
        return destUrl.substring(start, end);
    }

    /** Closes the resource if it is non-null, reporting instead of propagating any failure. */
    private static void safeClose(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            System.err.println("Failed to close resource: " + e);
        }
    }

    /**
     * Fetches the page at {@code url} and downloads every image whose
     * {@code src} attribute ends in {@code .jpg}.
     *
     * @param url address of the HTML page to scan
     */
    private void getHtmlElements(String url) {
        try {
            Document doc = Jsoup.connect(url).get();
            // Select the img elements whose src ends with ".jpg"
            Elements jpgs = doc.select("img[src$=.jpg]");
            for (Element element : jpgs) {
                // Resolve relative and protocol-relative src values against the page URL;
                // absUrl yields an empty string when no absolute URL can be built.
                String src = element.absUrl("src");
                if (src.length() == 0) {
                    System.err.println("Skipping image with unresolvable src: " + element.attr("src"));
                    continue;
                }
                saveToFile(src);
            }
        } catch (IOException e) {
            System.err.println("Failed to fetch page " + url + ": " + e);
        }
    }
}

1 0