Image Crawler Program in Java

package com.liyiwen.Crawler;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Collection;
import java.util.concurrent.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by dell on 2015/7/18.
 */
/**
 * In a regular expression, \ marks an escape; in a Java string literal, \ is also an
 * escape, so a regex backslash has to be written as \\ in source code.
 */
public class ImgCrawler implements Runnable {

    public static void main(String[] urls) throws InterruptedException, IOException {
        ImgCrawler imgCrawler = new ImgCrawler();
        ArrayList<Thread> workers = new ArrayList<Thread>();
        for (int i = 0; i < ImgCrawler.threads; ++i) {
            Thread thread = new Thread(imgCrawler);
            thread.start();
            workers.add(thread);
        }
        // Let the workers crawl for ten seconds, then ask them to stop.
        Thread.sleep(10000);
        for (Thread t : workers) {
            t.interrupt();
        }
        System.out.println(imgCrawler.getWantedUrls().toString() + "李意文");
        printToFile(imgCrawler.getWantedUrls());
        System.out.println("succeed");
    }

    public static void printToFile(Collection<String> strs) throws IOException {
        File file = new File("crawler.html");
        if (!file.exists()) {
            file.createNewFile();
        }
        Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"));
        try {
            for (String str : strs) {
                writer.append(str + "\r\n");
            }
            writer.flush();
        } finally {
            writer.close();
        }
    }

    // One worker thread per available processor core.
    static private int threads;
    static {
        threads = Runtime.getRuntime().availableProcessors();
    }

    public BlockingQueue<String> getWantedUrls() {
        return wantedUrls;
    }

    // <img> tags found so far (the crawl results).
    private BlockingQueue<String> wantedUrls;
    // Pages still waiting to be fetched (the crawl frontier).
    private BlockingQueue<String> handledUrls;

    public ImgCrawler() {
        wantedUrls = new LinkedBlockingQueue<String>();
        handledUrls = new LinkedBlockingQueue<String>();
        // Seed URL the crawl starts from.
        handledUrls.add("http://www.dedeshe.com/html/article/2015-7/index26785.html");
    }

    @Override
    public void run() {
        try {
            for (int i = 0; i < 1000; i++) {
                String url = handledUrls.take();
                crawl(url);
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
            }
        } catch (Throwable throwable) {
            // take() throws InterruptedException when main() interrupts us; treat it as shutdown.
            System.out.println(throwable.getMessage());
        }
    }

    private void crawl(String url) throws InterruptedException {
        try {
            URL crawledUrl = new URL(url);
            URLConnection urlConnection = crawledUrl.openConnection();
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(urlConnection.getInputStream(), "utf-8"));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    // Collect the <img> tags on this line.
                    ArrayList<String> matchUrls = matchWantedUrl(line);
                    for (String matchUrl : matchUrls) {
                        wantedUrls.put(matchUrl);
                    }
                    // Collect the links on this line and enqueue them for crawling.
                    ArrayList<String> hrefs = matchHref(line, url);
                    for (String href : hrefs) {
                        handledUrls.put(href);
                    }
                }
            } finally {
                reader.close();
            }
        } catch (MalformedURLException e) {
            System.out.println(e.getMessage());
            System.out.println("malformed URL");
        } catch (IOException e) {
            System.out.println("could not open connection");
            System.out.println(e.getMessage());
        }
    }

    // Returns every <img> tag on the line whose src is an absolute http URL.
    private ArrayList<String> matchWantedUrl(String line) {
        ArrayList<String> matches = new ArrayList<String>();
        Pattern pattern = Pattern.compile("<img .*?src=\"http.*?\".*?>");
        Matcher matcher = pattern.matcher(line);
        while (matcher.find()) {
            matches.add(matcher.group(0)); // keep the whole tag so crawler.html renders the images
        }
        return matches;
    }

    // Returns the links on the line, prefixing relative hrefs with the site root.
    private ArrayList<String> matchHref(String line, String url) {
        String rootURL = null;
        Pattern rootURLPattern = Pattern.compile("(http.+com).*");
        Matcher rootURLMatcher = rootURLPattern.matcher(url);
        if (rootURLMatcher.find()) {
            rootURL = rootURLMatcher.group(1);
            System.out.println("root URL: " + rootURL);
        }
        ArrayList<String> wantedHrefs = new ArrayList<String>();
        Pattern pattern = Pattern.compile("<a href=\"(.+?)\"");
        Matcher matcher = pattern.matcher(line);
        while (matcher.find()) {
            if (!matcher.group(1).startsWith("http") && rootURL != null) {
                wantedHrefs.add(rootURL + matcher.group(1));
                System.out.println(rootURL + matcher.group(1));
            } else {
                wantedHrefs.add(matcher.group(1));
                System.out.println(matcher.group(1));
            }
        }
        return wantedHrefs;
    }
}
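matchWantedUrl stores the whole <img> tag on purpose, so that crawler.html shows the pictures when opened in a browser. If you only want the image addresses themselves, a capture group around the src value is enough. A minimal sketch; the class and helper name extractImgSrc are mine, not part of the original program:

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ImgSrcExtractor {
    // Hypothetical helper: capture group 1 holds only the URL inside src="...".
    static ArrayList<String> extractImgSrc(String line) {
        ArrayList<String> srcs = new ArrayList<String>();
        Pattern pattern = Pattern.compile("<img .*?src=\"(http.*?)\".*?>");
        Matcher matcher = pattern.matcher(line);
        while (matcher.find()) {
            srcs.add(matcher.group(1)); // group(1) is the URL; group(0) would be the whole tag
        }
        return srcs;
    }

    public static void main(String[] args) {
        String html = "<p><img class=\"pic\" src=\"http://example.com/a.jpg\" alt=\"a\"></p>";
        System.out.println(extractImgSrc(html)); // prints [http://example.com/a.jpg]
    }
}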
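The (http.+com) pattern in matchHref only recovers the site root for .com hosts and ignores the page's directory. java.net.URL can resolve a relative href against the page it came from for any host; a minimal sketch, where the HrefResolver class and resolve helper are illustrative rather than part of the original crawler:

import java.net.MalformedURLException;
import java.net.URL;

public class HrefResolver {
    // Hypothetical helper: resolve an href relative to the page URL it was found on.
    static String resolve(String pageUrl, String href) throws MalformedURLException {
        return new URL(new URL(pageUrl), href).toString();
    }

    public static void main(String[] args) throws MalformedURLException {
        // Absolute paths and absolute URLs are both handled correctly.
        System.out.println(resolve("http://www.dedeshe.com/html/article/page.html", "/html/index.html"));
        // prints http://www.dedeshe.com/html/index.html
        System.out.println(resolve("http://www.dedeshe.com/html/article/page.html", "http://other.net/a.html"));
        // prints http://other.net/a.html
    }
}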
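The sleep-then-interrupt shutdown in main works, but since the program already imports java.util.concurrent.*, an ExecutorService can manage the worker threads and the time budget in one place. This is a sketch of that alternative, not the author's approach, assuming the ten-second crawl budget is kept:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class CrawlerShutdownSketch {
    public static void main(String[] args) throws Exception {
        ImgCrawler imgCrawler = new ImgCrawler();
        int threads = Runtime.getRuntime().availableProcessors();
        ExecutorService pool = Executors.newFixedThreadPool(threads);
        for (int i = 0; i < threads; ++i) {
            pool.execute(imgCrawler); // each worker runs ImgCrawler.run()
        }
        pool.shutdown(); // no new tasks will be accepted
        // Give the crawl ten seconds, then interrupt workers still blocked on take().
        if (!pool.awaitTermination(10, TimeUnit.SECONDS)) {
            pool.shutdownNow();
            pool.awaitTermination(1, TimeUnit.SECONDS); // brief grace period for workers to exit
        }
        ImgCrawler.printToFile(imgCrawler.getWantedUrls());
    }
}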