多线程批量图片下载器
来源:互联网 发布:bladed软件用户手册 编辑:程序博客网 时间:2024/06/02 07:05
近来找到了一个非常好的图片网站,心痒难耐,想把网站上的图片一网打尽,但又疲于一张张手动下载,于是折腾了几天,弄出这么一个东西来。哈,可以一网打尽了,足足下了30多G。
很普通的东西,图片地址用正则表达式解析。
用了多个线程同时开动下载。
工作流程:先下载搜索结果页面的HTML源代码,从中寻找符合条件的帖子地址;再下载这些地址对应页面的HTML源代码,从中寻找符合条件的图片最终地址,然后就可以下载图片了。
/*
 * file name: Downloader.java
 * version:
 * time: 2011-10-29
 * Copyright nenglong Corporation 2011
 */
package download;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import config.Config;

/**
 * Downloads every picture reachable from a listing page.
 *
 * <p>Work flow: fetch the listing page, extract post links with
 * {@code postUrlRegex}, fetch every post page, extract picture links with
 * {@code picUrlRegex}, then save each picture into {@code folders[0]} unless a
 * file with the same derived name already exists in any configured folder.
 *
 * <p>Network failures are retried every five seconds until they succeed or the
 * calling thread is interrupted.
 *
 * <p>Original header: project name: HttpSms; class name: Downloader;
 * author: Administrator Allen.Chen; build time: 2011-10-29 08:45:11 AM;
 * modified by: Administrator Allen.Chen; modify time: 2011-10-29 08:45:11 AM.
 *
 * <p>Regex fragments noted by the original author:
 * {@code (?<=(<a[^<+]>))} and {@code (<?=(.*?Download.*?</a>))}.
 */
public class Downloader {

    /** Pause between two attempts of a failed network operation. */
    private static final long RETRY_DELAY_MILLIS = 5000L;

    /** Number of leading characters of the sanitized URL dropped from the file name. */
    private static final int NAME_PREFIX_SKIP = 27;

    /** End index (exclusive) at which the sanitized URL is cut to build the file name. */
    private static final int NAME_MAX_END = 234;

    /** Buffer size used when copying picture bytes to disk. */
    private static final int COPY_BUFFER_SIZE = 8192;

    Config conf = new Config();
    String picUrlRegex = conf.getProperty("picUrlRegex");
    String postUrlRegex = conf.getProperty("postUrlRegex");
    String domainName = conf.getProperty("domainName");
    PrintStream ps = null;
    String[] folders = conf.getProperty("folders").split(",");

    /**
     * @param ps stream that receives all progress messages
     */
    public Downloader(PrintStream ps) {
        this.ps = ps;
    }

    /**
     * Fetches the HTML source of a page, decoded as GBK, with line terminators
     * removed (lines are concatenated directly, as before).
     *
     * <p>I/O failures are retried every five seconds. A malformed URL can never
     * succeed, so it is reported once and an empty string is returned instead
     * of retrying forever. An interrupt during the retry pause also ends the
     * retry and yields an empty string.
     *
     * @param url address of the page
     * @return the page source, or {@code ""} when the URL is malformed or the
     *         thread was interrupted
     */
    public String getHtmlCode(String url) {
        while (true) {
            BufferedReader reader = null;
            try {
                ps.print("网页" + "源代码要开始获取了哦..网页地址为:" + url);
                URLConnection connection = new URL(url).openConnection();
                reader = new BufferedReader(
                        new InputStreamReader(connection.getInputStream(), "GBK"));
                // Fresh buffer per attempt so a half-read page is never mixed
                // with the content of the retry.
                StringBuilder html = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    html.append(line);
                }
                return html.toString();
            } catch (MalformedURLException e) {
                e.printStackTrace();
                return "";
            } catch (IOException e) {
                ps.print("网页" + "源代码获取失败..不过,不要灰心,5秒后我们再来....网页地址为:" + url);
            } finally {
                closeQuietly(reader);
            }
            if (!pauseBeforeRetry()) {
                return "";
            }
        }
    }

    /**
     * Collects every distinct, non-empty match of {@code regex} in
     * {@code htmlCode}, in order of first appearance.
     *
     * <p>Note: the result is comma-joined, so a match that itself contains a
     * comma will be split apart by callers.
     *
     * @param htmlCode text to search
     * @param regex    regular expression describing one address
     * @return the matches joined with {@code ","}, or {@code ""} when nothing matched
     */
    public String getspecifiedUrl(String htmlCode, String regex) {
        Matcher matcher = Pattern.compile(regex).matcher(htmlCode);
        // A set compares the matches literally; compiling a URL as a regex
        // would misinterpret characters such as '?', '+' or '('.
        Set<String> unique = new LinkedHashSet<String>();
        while (matcher.find()) {
            String match = matcher.group();
            if (match.length() > 0) {
                unique.add(match);
            }
        }
        StringBuilder joined = new StringBuilder();
        for (String match : unique) {
            if (joined.length() > 0) {
                joined.append(',');
            }
            joined.append(match);
        }
        return joined.toString();
    }

    /**
     * Downloads every picture linked from the posts listed on {@code pageUrl}.
     * The page address is first appended to the file named by the
     * {@code infoPath} property.
     *
     * @param pageUrl address of the listing page
     */
    public void downloadPagePic(String pageUrl) {
        writeInfo(pageUrl, conf.getProperty("infoPath"));
        String listingHtml = getHtmlCode(pageUrl);
        String[] posts = getspecifiedUrl(listingHtml, getPostUrlRegex()).split(",");
        for (int i = 0; i < posts.length; i++) {
            if (Thread.currentThread().isInterrupted()) {
                return;
            }
            // "".split(",") yields one empty element; skip it instead of
            // fetching the bare domain.
            if (posts[i].length() == 0) {
                continue;
            }
            String postHtml = getHtmlCode(domainName + posts[i]);
            String picUrlList = getspecifiedUrl(postHtml, getPicUrlRegex());
            ArrayList<String> pictures = getAllUnique(picUrlList, ",");
            for (int j = 0; j < pictures.size(); j++) {
                String pictureUrl = pictures.get(j);
                if (pictureUrl.length() > 0) {
                    saveImage(pictureUrl, folders);
                }
            }
        }
    }

    /**
     * Appends {@code "\r\n" + pageUrl} to the file at {@code path}, using the
     * platform default charset. Failures are printed and otherwise ignored.
     *
     * @param pageUrl text to append
     * @param path    file to append to
     */
    public void writeInfo(String pageUrl, String path) {
        OutputStreamWriter writer = null;
        try {
            writer = new OutputStreamWriter(new FileOutputStream(new File(path), true));
            writer.write("\r\n" + pageUrl);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(writer);
        }
    }

    /**
     * Downloads one picture into {@code folders[0]} unless a file with the same
     * derived name already exists in any of {@code folders}.
     *
     * <p>The file name is the URL with {@code ':'}, {@code '/'} and {@code '.'}
     * replaced by {@code '_'}, minus its first 27 characters, cut at index 234,
     * followed by {@code "."} and the text after the URL's last dot. This
     * scheme is intentionally unchanged so earlier downloads are still
     * recognised.
     *
     * <p>I/O failures are retried every five seconds. A malformed URL or an
     * interrupt ends the attempt; a partially written file is removed on
     * interrupt.
     *
     * @param imageUrl address of the picture
     * @param folders  {@code folders[0]} is the folder pictures are saved to
     *                 now; the others are folders used by earlier runs
     */
    public void saveImage(String imageUrl, String[] folders) {
        if (folders == null || folders.length == 0) {
            System.err.println("No target folder configured, skipping: " + imageUrl);
            return;
        }

        String imageExtension = null;
        int dot = imageUrl.lastIndexOf('.');
        if (dot > 0 && dot < imageUrl.length() - 1) {
            imageExtension = imageUrl.substring(dot + 1);
        }

        String sanitized = imageUrl.replaceAll("[:/\\.]", "_");
        if (sanitized.length() < NAME_PREFIX_SKIP) {
            // Previously signalled by catching StringIndexOutOfBoundsException.
            System.err.println("URL too short to derive a file name, skipping: " + imageUrl);
            return;
        }
        String imageName = sanitized.substring(
                NAME_PREFIX_SKIP, Math.min(sanitized.length(), NAME_MAX_END));
        String fileName = imageName + "." + imageExtension;

        for (int j = 0; j < folders.length; j++) {
            if (new File(folders[j] + File.separator + fileName).exists()) {
                ps.print("这一张已经有相同的了,不能再下了,下过的图片地址为:" + imageUrl);
                return;
            }
        }

        File target = new File(folders[0] + File.separator + fileName);
        File parent = target.getParentFile();
        if (parent != null && !parent.exists()) {
            // Without the directory every attempt would fail and retry forever.
            parent.mkdirs();
        }

        while (true) {
            InputStream in = null;
            BufferedOutputStream out = null;
            boolean done = false;
            try {
                ps.print("开始下载图片了哦,Let's go!图片地址为:" + imageUrl);
                URLConnection connection = new URL(imageUrl).openConnection();
                in = new BufferedInputStream(connection.getInputStream());
                out = new BufferedOutputStream(new FileOutputStream(target));
                byte[] buffer = new byte[COPY_BUFFER_SIZE];
                int read;
                while ((read = in.read(buffer)) != -1) {
                    out.write(buffer, 0, read);
                }
                // Flush here so a write error triggers a retry instead of
                // being hidden by the quiet close below.
                out.flush();
                done = true;
            } catch (MalformedURLException e) {
                // Retrying cannot fix a malformed address.
                e.printStackTrace();
                return;
            } catch (IOException e) {
                // Reported through ps like every other progress message.
                ps.print("图片" + "下载失败....,不过不要灰心哦,5秒后我们再来..图片地址为:" + imageUrl);
            } finally {
                closeQuietly(out);
                closeQuietly(in);
            }
            if (done) {
                return;
            }
            if (!pauseBeforeRetry()) {
                // Do not leave a truncated file that would later be mistaken
                // for a finished download.
                target.delete();
                return;
            }
        }
    }

    /**
     * Splits {@code strList} and removes duplicate entries, keeping the first
     * occurrence of each in its original position.
     *
     * @param strList   separated list; {@code null} yields an empty result
     * @param seperator separator, interpreted as a regular expression by
     *                  {@link String#split(String)}
     * @return the distinct entries in order of first appearance
     */
    public ArrayList<String> getAllUnique(String strList, String seperator) {
        Set<String> unique = new LinkedHashSet<String>();
        if (strList != null) {
            String[] items = strList.split(seperator);
            for (int k = 0; k < items.length; k++) {
                unique.add(items[k]);
            }
        }
        return new ArrayList<String>(unique);
    }

    /**
     * @return the regular expression used to find picture addresses
     */
    public String getPicUrlRegex() {
        return picUrlRegex;
    }

    /**
     * @param picUrlRegex the regular expression used to find picture addresses
     */
    public void setPicUrlRegex(String picUrlRegex) {
        this.picUrlRegex = picUrlRegex;
    }

    /**
     * @return the regular expression used to find post addresses
     */
    public String getPostUrlRegex() {
        return postUrlRegex;
    }

    /**
     * @param postUrlRegex the regular expression used to find post addresses
     */
    public void setPostUrlRegex(String postUrlRegex) {
        this.postUrlRegex = postUrlRegex;
    }

    /**
     * Sleeps for the retry delay.
     *
     * @return {@code false} when the thread was interrupted (the interrupt
     *         flag is restored) and the caller should stop retrying
     */
    private boolean pauseBeforeRetry() {
        try {
            Thread.sleep(RETRY_DELAY_MILLIS);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /** Closes {@code resource} if it is non-null, printing any close failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
这是下载的代码。
图片的文件名是由图片的地址处理得来的,可以保证不会重复下载相同的图片。
另外:用多线程下载的话,如果同时请求的线程超过7个,就会被网站拒绝,弹出一堆下载失败的提示。有没有什么好的解决方法?请不吝指教。
- 多线程批量图片下载器
- 多线程图片下载
- 多线程图片下载
- 批量图片下载器(整站下载)
- python批量图片下载
- 多线程:图片下载案例
- 多线程:图片下载案例
- 网络图片下载 多线程
- 网络爬虫之批量图片下载
- 全网页批量图片下载办法
- Python:批量编写图片下载程序
- 图片下载器类
- android图片下载器
- 百度图片下载器
- 多线程图片下载程序的修改心得.
- Android多线程方式处理图片下载及显示
- iOS多线程编程及简单封装图片下载
- 图片下载
- A*算法
- python_bisect模块的使用
- js调用谷歌地图
- 加速 MySQL 导入导出的方法
- .net中调用VC6生成的dll问题之操作系统无法运行
- 多线程批量图片下载器
- 深入剖析Isolate-user-VLAN工作原理
- SilverLight:使用MVVM实现View层在程序运行时自动生成控件并且取得其值
- linux的常用命令
- poj-2567 prufer 编码
- windows vista/win7/2008 CVSNT打不开的问题
- String.Format 源字符串包含大括号的小问题
- 创建联系人、短信的桌面快捷方式
- jquery ajax的使用