Jsoup做的网络爬虫

来源：互联网 | 发布：mac截图后存在哪了 | 编辑：程序博客网 | 时间：2024/05/16 12:51

package com.html;

import java.io.*;
import java.net.*;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
//import java.net.URLConnection;

/**
 * A minimal web crawler built on Jsoup: fetches a page's HTML, finds every
 * {@code <img>} tag in it and saves the referenced images to a local directory.
 *
 * @author Administrator
 */
public class HtmlJsoup {

    /** Connect and read timeout applied to every network request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;

    /** Size of the buffer used when copying image bytes to disk. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /**
     * Downloads the resource at {@code url} and returns its content as text.
     *
     * <p>The content is read line by line and every line is terminated with
     * {@code '\n'} in the result. This method never throws: on any failure the
     * stack trace and a short message are printed, and whatever was read before
     * the failure (possibly the empty string) is returned.
     *
     * @param url      address of the page to fetch
     * @param encoding name of the charset used to decode the response bytes
     * @return the decoded page source, or a partial/empty string on failure
     */
    public static String getHtmlResourceByURL(String url, String encoding) {
        // Accumulates the page source.
        StringBuilder buffer = new StringBuilder();

        try {
            URLConnection uc = new URL(url).openConnection();
            // Without explicit timeouts a stalled server blocks the crawler forever.
            uc.setConnectTimeout(TIMEOUT_MILLIS);
            uc.setReadTimeout(TIMEOUT_MILLIS);

            // The raw stream is its own resource so that it is still closed when
            // the reader cannot be created (e.g. unsupported encoding name).
            try (InputStream raw = uc.getInputStream();
                 BufferedReader reader =
                         new BufferedReader(new InputStreamReader(raw, encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    buffer.append(line).append('\n');
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("connection timeout...");
        }

        return buffer.toString();
    }

    /**
     * Downloads the image at {@code imgURL} into the directory {@code filePath}.
     *
     * <p>The directory is created if it does not exist. The local file name is
     * the last segment of the URL path (query string and fragment excluded).
     * This method never throws; failures are reported on the console.
     *
     * @author Administrator
     * @param imgURL   absolute URL of the image to download
     * @param filePath directory in which the downloaded image is stored
     */
    public static void downImages(String imgURL, String filePath) {
        saveImage(imgURL, filePath);
    }

    /**
     * Does the work of {@link #downImages(String, String)} and reports the outcome.
     *
     * @return {@code true} only if the image was completely written to disk
     */
    private static boolean saveImage(String imgURL, String filePath) {
        if (imgURL == null || imgURL.trim().isEmpty()) {
            return false;
        }

        try {
            URL url = new URL(imgURL);
            String fileName = fileNameFor(url);
            if (fileName == null) {
                System.out.println("no usable file name in image URL: " + imgURL);
                return false;
            }

            File dir = new File(filePath);
            if (!dir.isDirectory() && !dir.mkdirs()) {
                throw new IOException("cannot create directory " + dir);
            }

            URLConnection uc = url.openConnection();
            uc.setConnectTimeout(TIMEOUT_MILLIS);
            uc.setReadTimeout(TIMEOUT_MILLIS);

            // Open the remote stream first so that an unreachable image does not
            // create or truncate a local file.
            try (InputStream in = uc.getInputStream()) {
                copyToFile(in, new File(dir, fileName));
            }
            return true;
        } catch (Exception e) {
            // Best effort: one broken image must not abort the whole crawl.
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Derives a local file name from the last segment of the URL path.
     *
     * @return the file name, or {@code null} if the path has no usable last segment
     */
    private static String fileNameFor(URL url) {
        String path = url.getPath();
        String name = path.substring(path.lastIndexOf('/') + 1);
        // The name comes from a remote page: neutralise characters that are path
        // separators or are illegal in Windows file names.
        name = name.replaceAll("[\\\\:*?\"<>|]", "_");
        if (name.isEmpty() || name.equals(".") || name.equals("..")) {
            return null;
        }
        return name;
    }

    /**
     * Copies all bytes of {@code in} into {@code target}; a partially written
     * file is removed if the copy fails.
     */
    private static void copyToFile(InputStream in, File target) throws IOException {
        boolean complete = false;
        try (OutputStream out = new FileOutputStream(target)) {
            byte[] chunk = new byte[COPY_BUFFER_SIZE];
            int read;
            while ((read = in.read(chunk)) != -1) {
                out.write(chunk, 0, read);
            }
            complete = true;
        } finally {
            if (!complete) {
                // Runs after the stream is closed; failure to delete is ignored
                // because the original error is already propagating.
                target.delete();
            }
        }
    }

    /**
     * Program entry point: fetches one page, prints its source and downloads
     * every image it references.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String pageUrl = "http://finance.qq.com/";
        String imageDir = "D:\\Java_Tanzhou\\MySearch\\images\\";

        // Fetch the page source using the page's character encoding.
        String htmlResource = getHtmlResourceByURL(pageUrl, "gb2312");
        System.out.println(htmlResource);

        // Parse with the page URL as base URI so that relative and
        // protocol-relative image addresses can be resolved.
        Document document = Jsoup.parse(htmlResource, pageUrl);

        // Image tags look like <img src="" alt="" width="" height="" />
        Elements elements = document.getElementsByTag("img");
        for (Element element : elements) {
            String imgSrc = element.absUrl("src");
            if (imgSrc.isEmpty()) {
                // No src attribute, or it could not be made absolute.
                continue;
            }
            System.out.println("正在下载的图片地址： " + imgSrc);
            if (saveImage(imgSrc, imageDir)) {
                System.out.println("图片下载成功！-----------");
            } else {
                System.out.println("图片下载失败！-----------");
            }
        }
    }

}

运行这个程序需要 Jsoup 库：把 Jsoup 的 jar 包复制到项目 WEB-INF 目录下的 lib 文件夹中即可。

0 0