java 网络爬虫之多线程抓取文件
来源:互联网 发布:php显示图片缩略图 编辑:程序博客网 时间:2024/05/17 07:11
记得这个是去年的东西了,今天重新拿出来重温,一些知识都模糊了很多。
一共六个类文件加上一个jar包:Demo文件是主文件;DownloadFile文件的作用是从网络URL上下载文件,别人已经封装好了,直接拿来用;DownloadThread文件的作用是用多线程爬取文件,速度快;HttpUtils文件的作用是将URL对应的网页转换为可操作的document文件,也是别人已经封装好的;MD5用来根据URL生成文件名,不用多说了吧;Task是描述下载任务(待处理文件)的类。
1 Demo.java
import java.util.ArrayList;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class Demo {/** * @param args */public static ArrayList<Task> arr = new ArrayList<Task>();public static void main(String[] args) {GeiALLimgUrl("http://www.csdn.net"); // 封装目标urlint maxindex = 2; // 设置的多线程个数,修改多少个随你DownloadThread[] d = new DownloadThread[maxindex];for (int i = 0; i < maxindex; i++) {d[i] = new DownloadThread(i);d[i].start();}}public static void GeiALLimgUrl(String url) {try {String result = HttpUtils.doGet(url);Document doc = Jsoup.parse(result);Elements links = doc.select("img");for (Element imgs : links) {System.out.println(imgs.attr("src")); // 抓取的当前URL页面上的图片imgarr.add(new Task(imgs.attr("src"))); // 先存放在集合里,后续再操作}} catch (Exception e) {e.printStackTrace();}}public static Task getTask() {for (Task s : arr) {if (!s.hasDownloaded) {s.hasDownloaded = true;return s;}}return null;}}
2 Task.java
public class Task {//图片地址public String imageUrl="";//图片是否被下载了?public boolean hasDownloaded=false;//图片的名字public String filename;//构造函数,提供图片的URL就可以了public Task(String url){imageUrl=url;filename=MD5.string2MD5(url); //对图片加密,利于爬取的各种操作int last=imageUrl.lastIndexOf(".");String ext=imageUrl.substring(last+1);filename=filename +"."+ext;System.out.println("文件名:"+filename);}}
3 DownloadThread.java
import java.io.IOException;public class DownloadThread extends Thread{//当前ID号 public int ID;public boolean exit=false;public DownloadThread(int id){ID=id;}@Overridepublic void run() {// TODO Auto-generated method stub//super.run();DownloadFile download=new DownloadFile();while(!exit){//从任务列表中读取一个没有被下载的任务Task target=Demo.getTask();if(target!=null){//下载System.out.println(ID);try {download.downLoadFromUrl(target.imageUrl, target.filename, "c:\\images"); } catch (IOException e) {e.printStackTrace();}}else{System.out.println("我是第"+ID+"个线程,我现在没有任务");//没有任务,休息一下try {Thread.sleep(1000);} catch (InterruptedException e) {e.printStackTrace();}}}}}
4 DownloadFile.java
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Downloads the content behind an HTTP URL into a local file.
 */
public class DownloadFile {

    /**
     * Downloads {@code urlStr} and stores it as {@code savePath/fileName}.
     *
     * <p>If the target file already exists nothing is downloaded. The existence
     * check happens before the connection is opened, so an already-present file
     * costs no network traffic. The response is read completely into memory
     * before the file is created, so a failed transfer does not leave a
     * truncated file behind that a later run would mistake for a finished one.
     *
     * @param urlStr   address of the file to download
     * @param fileName name of the local file
     * @param savePath directory to store the file in; created (including
     *                 missing parents) when absent
     * @throws IOException if the URL is malformed, the directory cannot be
     *                     created, or the transfer or the write fails
     */
    public void downLoadFromUrl(String urlStr, String fileName, String savePath) throws IOException {
        URL url = new URL(urlStr);

        File saveDir = new File(savePath);
        if (!saveDir.exists() && !saveDir.mkdirs() && !saveDir.isDirectory()) {
            throw new IOException("Cannot create directory " + saveDir);
        }

        File file = new File(saveDir, fileName);
        if (file.exists()) {
            System.out.println("文件已存在,不用重复下载");
            return;
        }

        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // Give up connecting after 3 seconds.
        conn.setConnectTimeout(3 * 1000);
        // Present a browser User-Agent so the server does not answer 403 to a bot.
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

        byte[] getData;
        try (InputStream inputStream = conn.getInputStream()) {
            getData = readInputStream(inputStream);
        }

        try (FileOutputStream fos = new FileOutputStream(file)) {
            fos.write(getData);
        }

        System.out.println("info:"+url+" download success");
    }

    /**
     * Reads {@code inputStream} to its end and returns everything that was read.
     * The stream is not closed by this method.
     *
     * @param inputStream stream to drain
     * @return all bytes read from the stream, possibly an empty array
     * @throws IOException if reading fails
     */
    public byte[] readInputStream(InputStream inputStream) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        int len;
        while ((len = inputStream.read(buffer)) != -1) {
            bos.write(buffer, 0, len);
        }
        return bos.toByteArray();
    }
}
5 HttpUtils.java
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.zip.GZIPInputStream;

/**
 * Small HTTP helpers for the crawler: fetching a page as text and resolving
 * links found on a page against that page's address.
 */
public class HttpUtils {

    /**
     * Performs a GET request and returns the response body as text.
     *
     * <p>The body is decoded as GBK, transparently un-gzipped when the server
     * announces {@code Content-Encoding: gzip}, and returned with every line
     * terminated by {@code '\n'}.
     *
     * @param url address to fetch
     * @return the response body
     * @throws Exception if the response status is 300 or above, or on any
     *                   connection or read failure
     */
    public static String doGet(String url) throws Exception {
        URL localURL = new URL(url);

        // NOTE(review): this routes every HTTP connection of the whole JVM
        // through a proxy on 127.0.0.1:8888. It looks like a leftover from
        // debugging with a local proxy; confirm it is still wanted, because all
        // requests fail when nothing listens on that port.
        System.setProperty("http.proxyHost", "127.0.0.1");
        System.setProperty("http.proxyPort", "8888");

        URLConnection connection = localURL.openConnection();
        HttpURLConnection httpURLConnection = (HttpURLConnection) connection;
        // Present a browser User-Agent to the server.
        httpURLConnection.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E");

        // Redirects (3xx) are not followed here; anything from 300 up is an error.
        int status = httpURLConnection.getResponseCode();
        if (status >= 300) {
            throw new Exception("HTTP Request is not success, Response code is " + status);
        }

        StringBuilder resultBuffer = new StringBuilder();
        try (InputStream inputStream = httpURLConnection.getInputStream()) {
            InputStream body = inputStream;
            String encoding = httpURLConnection.getHeaderField("Content-Encoding");
            if ("gzip".equals(encoding)) {
                System.out.println("这是一个压缩的HTML\n");
                body = new GZIPInputStream(inputStream);
            }
            // The page is decoded as GBK regardless of what the server declares.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(body, "gbk"))) {
                String tempLine;
                while ((tempLine = reader.readLine()) != null) {
                    resultBuffer.append(tempLine).append('\n');
                }
            }
        }
        return resultBuffer.toString();
    }

    /**
     * Resolves a link found on a page into an absolute URL.
     *
     * <p>Examples with {@code currentUrl = http://www.gdmec.cn/cs/csnew/index.html}:
     * <ul>
     *   <li>{@code http://other.com/a.png} stays as it is</li>
     *   <li>{@code a.png} and {@code ./a.png} become {@code http://www.gdmec.cn/cs/csnew/a.png}</li>
     *   <li>{@code ../a.png} becomes {@code http://www.gdmec.cn/cs/a.png}</li>
     *   <li>{@code /a.png} becomes {@code http://www.gdmec.cn/a.png}</li>
     *   <li>{@code //cdn.example.com/a.png} becomes {@code http://cdn.example.com/a.png}</li>
     * </ul>
     *
     * @param currentUrl address of the page the link was found on
     * @param targetUrl  link as written in the page (for example an
     *                   {@code href} or {@code src} value)
     * @return the absolute URL, or {@code ""} when {@code targetUrl} is
     *         {@code null} or cannot be resolved into a valid URL
     */
    public static String getURL(String currentUrl, String targetUrl) {
        if (targetUrl == null) {
            return "";
        }
        try {
            if (currentUrl != null) {
                return new URL(new URL(currentUrl), targetUrl).toString();
            }
        } catch (MalformedURLException baseUnusable) {
            // Fall through: the target may still be a complete URL on its own.
        }
        try {
            return new URL(targetUrl).toString();
        } catch (MalformedURLException unresolved) {
            return "";
        }
    }
}
6 MD5.java
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

/**
 * Computes MD5 digests of strings as lowercase hexadecimal text.
 */
public class MD5 {

    /**
     * Returns the MD5 digest of {@code inStr} as 32 lowercase hex characters.
     *
     * <p>The string is hashed over its UTF-8 bytes. Narrowing each character to
     * a single byte instead would drop the high bits of every character above
     * U+00FF and make distinct strings hash identically. For pure ASCII input
     * both approaches give the same digest.
     *
     * @param inStr text to hash
     * @return the 32-character hex digest, or {@code ""} if the MD5 algorithm
     *         is not available in this runtime
     */
    public static String string2MD5(String inStr) {
        MessageDigest md5;
        try {
            md5 = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e) {
            System.out.println(e.toString());
            e.printStackTrace();
            return "";
        }

        byte[] md5Bytes = md5.digest(inStr.getBytes(StandardCharsets.UTF_8));

        StringBuilder hexValue = new StringBuilder(md5Bytes.length * 2);
        for (byte b : md5Bytes) {
            int val = b & 0xff;
            if (val < 16) {
                hexValue.append('0'); // keep two hex digits per byte
            }
            hexValue.append(Integer.toHexString(val));
        }
        return hexValue.toString();
    }
}
jar包 jsoup-1.9.2.jar
这里是爬取网络上指定url的图片,其他的比如爬取兼职信息,天气信息等也可以,当然,爬取过多随时会被墙掉,而且一些网页会使用get 或者post来获取信息,这时就要适当修改爬取的方式了,还有一些网页是异步加载,就留给你们自己尝试了。
0 0
- java 网络爬虫之多线程抓取文件
- java抓取网页 --- 网络爬虫
- java实现网络爬虫--抓取网站数据
- java爬虫抓取网络上的图片
- Java实现网络爬虫001-抓取网页
- java 网络数据传输之多线程下载
- Java网络编程之多线程Client-Server
- 网络编程--JAVA之多线程下载
- 网络爬虫, Java爬虫,信息抓取的实现
- 网络爬虫内容抓取
- 网络爬虫-视频抓取
- python爬虫进阶之多线程
- iOS网络之多线程
- java抓取、java网络爬虫实例项目jnc
- java 学习:网络爬虫--中国人才热线邮箱抓取
- WebCollector2.X 网络JAVA爬虫入门(抓取百度百科)
- java算法-网络爬虫抓取网页并保存
- 【 网络爬虫】java 使用Socket, HttpUrlConnection方式抓取数据
- Android 多线程handler,runnable,asyctask使用
- strtok的实现——用于按给定符号分割字符串的函数
- 【OpenCV】大牛博客
- 二叉树的创建、先序、中序、后序、层序的递归与非递归算法(java)
- javaString常用方法
- java 网络爬虫之多线程抓取文件
- Android synchronized()同步线程
- Unity琐碎(3) UGUI 图文混排解决方案和优化
- Python标准库05 存储对象 (pickle包,cPickle包)
- 工作常用的git指令
- Android 缓存流(BufferedInputStream和BufferedOutputStream)
- 对队列各种操作的实现(C语言)
- bash预定义变量+read(获取键盘输入)
- java 1916 字符串扩展