java 网络爬虫之多线程抓取文件

来源:互联网 发布:php显示图片缩略图 编辑:程序博客网 时间:2024/05/17 07:11

记得这个是去年的东西了,今天重新拿出来重温,一些知识都模糊了很多。

一共六个类文件加上一个jar包,Demo文件是主文件;DownloadFile文件的作用是从网络URL上下载文件下来,别人已经封装好了拿来用;DownloadThread文件作用是多线程爬取文件下来,速度快;HttpUtils文件作用是将URL网页转换为可操作的document文件,也是别人已经封装好的;MD5不用我说了吧;Task是处理文件的类;

1 Demo.java

import java.util.ArrayList;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class Demo {/** * @param args */public static ArrayList<Task> arr = new ArrayList<Task>();public static void main(String[] args) {GeiALLimgUrl("http://www.csdn.net"); // 封装目标urlint maxindex = 2; // 设置的多线程个数,修改多少个随你DownloadThread[] d = new DownloadThread[maxindex];for (int i = 0; i < maxindex; i++) {d[i] = new DownloadThread(i);d[i].start();}}public static void GeiALLimgUrl(String url) {try {String result = HttpUtils.doGet(url);Document doc = Jsoup.parse(result);Elements links = doc.select("img");for (Element imgs : links) {System.out.println(imgs.attr("src")); // 抓取的当前URL页面上的图片imgarr.add(new Task(imgs.attr("src"))); // 先存放在集合里,后续再操作}} catch (Exception e) {e.printStackTrace();}}public static Task getTask() {for (Task s : arr) {if (!s.hasDownloaded) {s.hasDownloaded = true;return s;}}return null;}}

2 Task.java

public class Task {//图片地址public String imageUrl="";//图片是否被下载了?public  boolean hasDownloaded=false;//图片的名字public String filename;//构造函数,提供图片的URL就可以了public Task(String url){imageUrl=url;filename=MD5.string2MD5(url);  //对图片加密,利于爬取的各种操作int last=imageUrl.lastIndexOf(".");String ext=imageUrl.substring(last+1);filename=filename +"."+ext;System.out.println("文件名:"+filename);}}

3 DownloadThread.java

import java.io.IOException;public class DownloadThread extends Thread{//当前ID号 public int ID;public boolean exit=false;public DownloadThread(int id){ID=id;}@Overridepublic void run() {// TODO Auto-generated method stub//super.run();DownloadFile download=new DownloadFile();while(!exit){//从任务列表中读取一个没有被下载的任务Task target=Demo.getTask();if(target!=null){//下载System.out.println(ID);try {download.downLoadFromUrl(target.imageUrl, target.filename, "c:\\images");    } catch (IOException e) {e.printStackTrace();}}else{System.out.println("我是第"+ID+"个线程,我现在没有任务");//没有任务,休息一下try {Thread.sleep(1000);} catch (InterruptedException e) {e.printStackTrace();}}}}}

4 DownloadFile.java

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Downloads a single file from a network URL to a local directory.
 */
public class DownloadFile {

    /**
     * Downloads {@code urlStr} and saves it as {@code savePath/fileName}.
     * Skips the download if the target file already exists.
     *
     * Bug fixes vs. the original:
     * - the "file exists" early return happened AFTER opening the
     *   connection, leaking the open InputStream; the check now runs first;
     * - streams were closed only on the success path; try-with-resources
     *   now closes them on every path, and the connection is disconnected;
     * - mkdirs() instead of mkdir(), so nested save paths work.
     *
     * @param urlStr   source URL
     * @param fileName target file name
     * @param savePath target directory (created if missing)
     * @throws IOException on connection or file errors
     */
    public void downLoadFromUrl(String urlStr, String fileName, String savePath) throws IOException {
        File saveDir = new File(savePath);
        if (!saveDir.exists()) {
            saveDir.mkdirs();
        }
        File file = new File(saveDir + File.separator + fileName);
        if (file.exists()) {
            System.out.println("文件已存在,不用重复下载");
            return;
        }

        URL url = new URL(urlStr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setConnectTimeout(3 * 1000); // 3-second connect timeout
        // Some sites answer 403 to clients without a browser-like User-Agent.
        conn.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        try (InputStream inputStream = conn.getInputStream();
             FileOutputStream fos = new FileOutputStream(file)) {
            fos.write(readInputStream(inputStream));
        } finally {
            conn.disconnect();
        }
        System.out.println("info:" + url + " download success");
    }

    /**
     * Reads an InputStream fully into a byte array.
     *
     * @param inputStream source stream (not closed by this method)
     * @return all bytes read until end of stream
     * @throws IOException if reading fails
     */
    public byte[] readInputStream(InputStream inputStream) throws IOException {
        byte[] buffer = new byte[1024];
        int len;
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        while ((len = inputStream.read(buffer)) != -1) {
            bos.write(buffer, 0, len);
        }
        return bos.toByteArray();
    }
}

5 HttpUtils.java

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.zip.GZIPInputStream;

/**
 * Small HTTP helper: GET a page as text, and resolve URLs found in pages.
 */
public class HttpUtils {

    /**
     * Fetches {@code url} with a GET request and returns the response body
     * as text; gzip-encoded responses are transparently decompressed.
     *
     * NOTE(review): the body is always decoded as GBK — fine for the
     * Chinese sites this demo targets, but ideally the charset should be
     * taken from the Content-Type header.
     *
     * @throws Exception if the response code is 300 or higher
     */
    public static String doGet(String url) throws Exception {
        URL localURL = new URL(url);
        // Bug fix: removed the hard-coded 127.0.0.1:8888 proxy (a leftover
        // from debugging through Fiddler) — it made every request fail
        // unless a local proxy happened to be running.
        URLConnection connection = localURL.openConnection();
        HttpURLConnection httpURLConnection = (HttpURLConnection) connection;
        // Browser-like User-Agent to avoid being blocked by some servers.
        httpURLConnection.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E");

        InputStream inputStream = null;
        InputStreamReader inputStreamReader = null;
        BufferedReader reader = null;
        StringBuffer resultBuffer = new StringBuffer();
        String tempLine;

        // Treat redirects and errors (>= 300) as failure.
        if (httpURLConnection.getResponseCode() >= 300) {
            throw new Exception("HTTP Request is not success, Response code is "
                    + httpURLConnection.getResponseCode());
        }

        try {
            inputStream = httpURLConnection.getInputStream();
            String encoding = httpURLConnection.getHeaderField("Content-Encoding");
            if (encoding != null && encoding.equals("gzip")) {
                System.out.println("这是一个压缩的HTML\n");
                GZIPInputStream gzin = new GZIPInputStream(inputStream);
                inputStreamReader = new InputStreamReader(gzin, "gbk");
            } else {
                inputStreamReader = new InputStreamReader(inputStream, "gbk");
            }
            reader = new BufferedReader(inputStreamReader);
            while ((tempLine = reader.readLine()) != null) {
                resultBuffer.append(tempLine + "\n");
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
            if (inputStreamReader != null) {
                inputStreamReader.close();
            }
            if (inputStream != null) {
                inputStream.close();
            }
        }
        return resultBuffer.toString();
    }

    /**
     * Resolves a URL extracted from a page ({@code targetUrl}, e.g. an
     * href/src value) against the page's own URL ({@code currentUrl}).
     *
     * Bug fixes vs. the original:
     * - absolute "http..." targets returned "" instead of the URL itself;
     * - the "../" branch was commented out and returned "";
     * - site-root paths ("/x") were resolved against the current directory
     *   instead of the host root;
     * - removed a leftover debug println of currentBase.
     */
    public static String getURL(String currentUrl, String targetUrl) {
        String temp = targetUrl;
        // Directory of the current page:
        // http://host/a/b/index.html -> http://host/a/b/
        String currentBase;
        if (currentUrl.endsWith("/")) {
            currentBase = currentUrl;
        } else {
            int lastPos = currentUrl.lastIndexOf("/");
            currentBase = currentUrl.substring(0, lastPos + 1);
        }
        if (temp.startsWith("http")) {
            // Already absolute.
            return temp;
        } else if (temp.startsWith("//")) {
            // Protocol-relative.
            return "http:" + temp;
        } else if (temp.startsWith("../")) {
            // Climb one directory per leading "../".
            String base = currentBase;
            while (temp.startsWith("../")) {
                int parent = base.lastIndexOf('/', base.length() - 2);
                if (parent >= 0) {
                    base = base.substring(0, parent + 1);
                }
                temp = temp.substring(3);
            }
            return base + temp;
        } else if (temp.startsWith("./")) {
            return currentBase + temp.substring(2);
        } else if (temp.startsWith("/")) {
            // Site-root-relative: resolve against the host root.
            int schemeEnd = currentUrl.indexOf("//");
            int rootEnd = (schemeEnd >= 0)
                    ? currentUrl.indexOf('/', schemeEnd + 2)
                    : currentUrl.indexOf('/');
            String root = (rootEnd >= 0) ? currentUrl.substring(0, rootEnd) : currentUrl;
            return root + temp;
        } else {
            // Plain relative path.
            return currentBase + temp;
        }
    }
}

6 MD5.java

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

/**
 * MD5 helper: produces the 32-character lowercase hex MD5 of a string.
 */
public class MD5 {

    /**
     * Returns the 32-char lowercase hex MD5 digest of {@code inStr}, or ""
     * if the MD5 algorithm is unavailable.
     *
     * Bug fix: the original cast each char to a byte, silently corrupting
     * any character above 0xFF. Encoding as UTF-8 instead gives identical
     * output for ASCII input (such as the URLs this crawler hashes) and
     * correct, well-defined output for everything else.
     */
    public static String string2MD5(String inStr) {
        MessageDigest md5;
        try {
            md5 = MessageDigest.getInstance("MD5");
        } catch (Exception e) {
            System.out.println(e.toString());
            e.printStackTrace();
            return "";
        }
        byte[] md5Bytes = md5.digest(inStr.getBytes(StandardCharsets.UTF_8));
        StringBuilder hexValue = new StringBuilder();
        for (byte b : md5Bytes) {
            int val = b & 0xff;
            if (val < 16) {
                hexValue.append("0"); // keep each byte two hex digits wide
            }
            hexValue.append(Integer.toHexString(val));
        }
        return hexValue.toString();
    }
}

 jar包 jsoup-1.9.2.jar

这里是爬取网络上指定url的图片,其他的比如爬取兼职信息,天气信息等也可以,当然,爬取过多随时会被墙掉,而且一些网页会使用get 或者post来获取信息,这时就要适当修改爬取的方式了,还有一些网页是异步加载,就留给你们自己尝试了。


0 0