多线程批量图片下载器

来源:互联网 发布:bladed软件用户手册 编辑:程序博客网 时间:2024/06/02 07:05

近来因为找到了一个非常好的图片网站,心痒难熬,想把网站上的图片一网打尽,但又疲于一张张下载,于是折腾了几天,弄出这么一个东西来。哈,这下可以一网打尽了,足足下了30多G。

很普通的东西,图片地址用正则表达式解析。

用了多个线程同时开动下载。

工作流程:先下载搜索结果页面的HTML源代码,从中寻找符合条件的帖子地址;再下载这些地址对应页面的HTML源代码,从中寻找符合条件的图片最终地址;最后下载图片。

/*
 * file name: Downloader.java
 * version:
 * time: 2011-10-29
 * Copyright nenglong Corporation 2011
 */
package download;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import config.Config;

/**
 * Downloads every picture reachable from a listing page.
 *
 * <p>Workflow: fetch the listing page, extract post links with
 * {@code postUrlRegex}, fetch each post page, extract picture URLs with
 * {@code picUrlRegex}, then save each picture into the first configured folder
 * unless a file with the same derived name already exists in any configured
 * folder.
 *
 * <p>project name: HttpSms; class name: Downloader;
 * author: Administrator Allen.Chen; build time: 2011-10-29 08:45:11 AM;
 * modified by: Administrator Allen.Chen; modify time: 2011-10-29 08:45:11 AM.
 *
 * @version
 */
public class Downloader {

    /** Pause between two attempts of a failed network operation, in milliseconds. */
    private static final long RETRY_DELAY_MS = 5000L;

    /**
     * Number of leading characters dropped from the sanitised URL when deriving a file name.
     * NOTE(review): presumably skips the site-specific URL prefix - confirm against the target site.
     */
    private static final int NAME_START = 27;

    /** Exclusive end index applied to over-long sanitised URLs when deriving a file name. */
    private static final int NAME_END_LIMIT = 234;

    /** Size of the buffer used when copying a picture to disk. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /** Project configuration; every setting below is read from it. */
    Config conf = new Config();

    /** Regex matching picture URLs inside a post page (config key "picUrlRegex"). */
    String picUrlRegex = conf.getProperty("picUrlRegex");

    /** Regex matching post links inside a listing page (config key "postUrlRegex"). */
    String postUrlRegex = conf.getProperty("postUrlRegex");

    /** Prefix prepended to every extracted post link (config key "domainName"). */
    String domainName = conf.getProperty("domainName");

    /** Sink for progress messages. */
    PrintStream ps = null;

    /** Picture folders: index 0 receives new files, the rest are only checked for duplicates. */
    String[] folders = conf.getProperty("folders").split(",");

    /**
     * Creates a downloader that reports progress to the given stream.
     *
     * @param ps stream that receives progress messages
     */
    public Downloader(PrintStream ps) {
        this.ps = ps;
    }

    /**
     * Fetches the HTML source of a page, decoded as GBK, with line terminators removed.
     *
     * <p>An I/O failure is retried after a 5 second pause, indefinitely. A
     * malformed URL can never succeed, so it is reported once and an empty
     * string is returned instead of spinning forever. If the thread is
     * interrupted while waiting, the interrupt flag is restored and an empty
     * string is returned.
     *
     * @param url address of the page
     * @return the page source, or an empty string if it could not be fetched
     */
    public String getHtmlCode(String url) {
        while (true) {
            ps.print("网页" + "源代码要开始获取了哦..网页地址为:" + url);
            try {
                return readPage(new URL(url));
            } catch (MalformedURLException e) {
                // Retrying cannot fix a syntactically invalid URL.
                e.printStackTrace();
                return "";
            } catch (IOException e) {
                ps.print("网页" + "源代码获取失败..不过,不要灰心,5秒后我们再来....网页地址为:" + url);
                if (!pauseBeforeRetry()) {
                    return "";
                }
            }
        }
    }

    /** Reads the whole page behind {@code pageUrl} as GBK text, always closing the stream. */
    private String readPage(URL pageUrl) throws IOException {
        InputStream in = null;
        try {
            URLConnection connection = pageUrl.openConnection();
            in = connection.getInputStream();
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "GBK"));
            // A fresh buffer per attempt, so a retry never appends to half-read content.
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line);
            }
            return html.toString();
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Collects every distinct, non-empty match of {@code regex} in {@code htmlCode}.
     *
     * <p>Matches are compared as plain strings; order of first occurrence is kept.
     *
     * @param htmlCode text to search
     * @param regex regular expression describing the wanted URLs
     * @return the distinct matches joined with ",", or an empty string when nothing matches
     */
    public String getspecifiedUrl(String htmlCode, String regex) {
        Matcher matcher = Pattern.compile(regex).matcher(htmlCode);
        Set<String> distinct = new LinkedHashSet<String>();
        while (matcher.find()) {
            String match = matcher.group();
            if (match.length() > 0) {
                distinct.add(match);
            }
        }
        StringBuilder joined = new StringBuilder();
        for (String match : distinct) {
            if (joined.length() > 0) {
                joined.append(',');
            }
            joined.append(match);
        }
        return joined.toString();
    }

    /**
     * Downloads every picture linked from the posts listed on the given page.
     *
     * <p>The page URL is first appended to the file named by the "infoPath"
     * configuration entry. Processing stops early if the thread is interrupted.
     *
     * @param pageUrl address of the listing page
     */
    public void downloadPagePic(String pageUrl) {
        writeInfo(pageUrl, conf.getProperty("infoPath"));
        String listingHtml = getHtmlCode(pageUrl);
        String[] postPaths = getspecifiedUrl(listingHtml, getPostUrlRegex()).split(",");
        for (String postPath : postPaths) {
            if (Thread.currentThread().isInterrupted()) {
                return;
            }
            if (postPath.length() == 0) {
                // Splitting an empty result yields one empty entry; there is nothing to fetch.
                continue;
            }
            String postHtml = getHtmlCode(domainName + postPath);
            String picUrlList = getspecifiedUrl(postHtml, getPicUrlRegex());
            for (String picUrl : getAllUnique(picUrlList, ",")) {
                if (picUrl.length() > 0) {
                    saveImage(picUrl, folders);
                }
            }
        }
    }

    /**
     * Appends a line break followed by {@code pageUrl} to the file at {@code path}.
     *
     * <p>I/O failures are printed to standard error and otherwise ignored.
     *
     * @param pageUrl text to record
     * @param path file to append to
     */
    public void writeInfo(String pageUrl, String path) {
        OutputStreamWriter writer = null;
        try {
            writer = new OutputStreamWriter(new FileOutputStream(new File(path), true));
            writer.write("\r\n" + pageUrl);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(writer);
        }
    }

    /**
     * Downloads one picture unless a file with the same derived name already exists.
     *
     * <p>The file name is derived from the URL (see {@link #buildFileName(String)}).
     * A failed download is retried after a 5 second pause, indefinitely; the
     * partial file is removed before each retry. A malformed URL or an
     * interrupted wait ends the attempt.
     * NOTE(review): a permanent server error (for example a missing picture)
     * is still retried forever, as in the original design.
     *
     * @param imageUrl address of the picture
     * @param folders folders[0] is where the picture is saved; all entries are
     *            checked for an already downloaded copy
     */
    public void saveImage(String imageUrl, String[] folders) {
        String fileName = buildFileName(imageUrl);
        if (fileName == null) {
            System.err.println("Image URL is too short to derive a file name, skipped: " + imageUrl);
            return;
        }
        for (String folder : folders) {
            if (new File(folder + File.separator + fileName).exists()) {
                ps.print("这一张已经有相同的了,不能再下了,下过的图片地址为:" + imageUrl);
                return;
            }
        }
        File target = new File(folders[0] + File.separator + fileName);
        while (true) {
            ps.print("开始下载图片了哦,Let's go!图片地址为:" + imageUrl);
            try {
                downloadTo(new URL(imageUrl), target);
                return;
            } catch (MalformedURLException e) {
                // Retrying cannot fix a syntactically invalid URL.
                e.printStackTrace();
                return;
            } catch (IOException e) {
                // Drop the partial file so it is never mistaken for a finished download.
                target.delete();
                ps.print("图片" + "下载失败....,不过不要灰心哦,5秒后我们再来..图片地址为:" + imageUrl);
                if (!pauseBeforeRetry()) {
                    return;
                }
            }
        }
    }

    /**
     * Derives the local file name for a picture URL: ':', '/' and '.' become
     * '_', the first {@value #NAME_START} characters are dropped, the result is
     * cut at index {@value #NAME_END_LIMIT}, and "." plus the text after the
     * URL's last '.' is appended. The scheme (including the literal "null"
     * extension when the URL has none) is kept unchanged so that duplicate
     * checks still match files saved earlier.
     *
     * @return the file name, or {@code null} when the URL is shorter than the dropped prefix
     */
    private String buildFileName(String imageUrl) {
        String extension = null;
        int lastDot = imageUrl.lastIndexOf('.');
        if (lastDot > 0 && lastDot < imageUrl.length() - 1) {
            extension = imageUrl.substring(lastDot + 1);
        }
        String sanitised = imageUrl.replaceAll("[:/\\.]", "_");
        if (sanitised.length() < NAME_START) {
            return null;
        }
        int end = Math.min(sanitised.length(), NAME_END_LIMIT);
        return sanitised.substring(NAME_START, end) + "." + extension;
    }

    /** Copies the content behind {@code source} into {@code target}, always closing both streams. */
    private void downloadTo(URL source, File target) throws IOException {
        InputStream in = null;
        BufferedOutputStream out = null;
        try {
            // Open the connection first so no file is created when the server is unreachable.
            in = new BufferedInputStream(source.openConnection().getInputStream());
            out = new BufferedOutputStream(new FileOutputStream(target));
            byte[] buffer = new byte[COPY_BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
            // Flush here so a write failure is reported to the caller, not swallowed on close.
            out.flush();
        } finally {
            closeQuietly(out);
            closeQuietly(in);
        }
    }

    /**
     * Sleeps for the retry delay.
     *
     * @return {@code false} if the thread was interrupted (the flag is restored), {@code true} otherwise
     */
    private boolean pauseBeforeRetry() {
        try {
            Thread.sleep(RETRY_DELAY_MS);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /** Closes {@code resource} if it is non-null, printing (not propagating) any failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Splits {@code strList} and removes duplicate entries, keeping first-occurrence order.
     *
     * @param strList separator-joined list
     * @param seperator separator, interpreted as a regular expression by {@link String#split(String)}
     * @return the distinct entries; an empty input yields a list holding one empty string
     */
    public ArrayList<String> getAllUnique(String strList, String seperator) {
        Set<String> distinct = new LinkedHashSet<String>();
        for (String entry : strList.split(seperator)) {
            distinct.add(entry);
        }
        return new ArrayList<String>(distinct);
    }

    /**
     * @return the regex used to find picture URLs
     */
    public String getPicUrlRegex() {
        return picUrlRegex;
    }

    /**
     * @param picUrlRegex the regex used to find picture URLs
     */
    public void setPicUrlRegex(String picUrlRegex) {
        this.picUrlRegex = picUrlRegex;
    }

    /**
     * @return the regex used to find post links
     */
    public String getPostUrlRegex() {
        return postUrlRegex;
    }

    /**
     * @param postUrlRegex the regex used to find post links
     */
    public void setPostUrlRegex(String postUrlRegex) {
        this.postUrlRegex = postUrlRegex;
    }
}

这是下载的代码。

图片的文件名是由图片地址处理得来的,可以保证不会重复下载相同的图片。

另外:用多线程下载的话,如果同时请求的线程超过7个,就会被网站拒绝,弹出一堆下载失败的提示。有没有什么好的解决方法?还望不吝指教。

原创粉丝点击