A Simple Web Crawler Example

As a student, you might hear the term "web crawler" and think it sounds very impressive, yet a simple implementation is easy enough for a student to follow. A web crawler application treats the whole internet as a web, like a spider's web, and the application crawls across it like an insect, moving from page to page according to certain rules. Since http(s) is by far the most widely used protocol on the internet today, the example in this article is based on http(s). It is only a demonstration and does not cover the sophisticated algorithms, which in practice are the most important part.

Design approach:
The program starts from one or more entry URLs, fetches each URL's content over http(s), processes that content to extract the information we want to crawl, collects the URL links found in the content, and then repeats these steps for the newly discovered URLs.
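Stripped of the threading, this loop is just a breadth-first traversal with deduplication. Here is a minimal single-threaded sketch of the idea; the fetch and extractLinks stubs stand in for the real fetching and parsing code shown further below:

import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;

/** Hypothetical single-threaded sketch of the crawl loop described above. */
public class CrawlLoopSketch {

    public static void crawl(String seedUrl) {
        Queue<String> frontier = new LinkedList<String>();
        Set<String> seen = new HashSet<String>();
        frontier.add(seedUrl);
        seen.add(seedUrl);
        while (!frontier.isEmpty()) {
            String current = frontier.poll();
            String html = fetch(current);              // HTTP GET (stub)
            for (String link : extractLinks(html)) {   // parse out new URLs (stub)
                if (seen.add(link)) {                  // dedup before enqueueing
                    frontier.add(link);
                }
            }
        }
    }

    // Stubs standing in for the real fetching/parsing implemented below.
    private static String fetch(String url) { return ""; }
    private static List<String> extractLinks(String html) { return Collections.emptyList(); }
}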
The full implementation, commented throughout, follows:

import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

/**
 * Main program: bootstraps the thread pool and the work queue.
 *
 * @author hwz
 */
public class MainApp {

    private Integer corePoolSize = 10;
    private Integer maxPoolSize = 20;
    private ThreadPoolExecutor executor;
    /** Work queue shared by all spider threads */
    private SpiderQueue workQueue;

    public void start(String url) throws Exception {
        // Initialize the thread pool
        LinkedBlockingDeque<Runnable> executorQueue = new LinkedBlockingDeque<Runnable>(maxPoolSize);
        executor = new ThreadPoolExecutor(corePoolSize, maxPoolSize, 60L, TimeUnit.SECONDS,
                executorQueue);
        workQueue = new SpiderQueue(1024);
        SpiderUrl spiderUrl = new SpiderUrl(url, 0);
        try {
            workQueue.add(spiderUrl);
        }
        catch (Exception e) {
            System.out.println("insert url into workQueue error,url=" + url);
            e.printStackTrace();
        }

        // Submit the first task
        executor.submit(new SimpleSpider(workQueue, "thread-" + "main"));

        int i = 0;
        int idle = 0;
        while (true) {
            // Decide whether to add more worker threads
            if (workQueue.size() > 20 && executor.getActiveCount() < maxPoolSize) {
                idle = 0;
                System.out.println("submit new thread,workQueue.size=" + workQueue.size() +
                        ",executorQueue.activeCount=" + executor.getActiveCount() + ",i=" + i);
                executor.submit(new SimpleSpider(workQueue, "thread-" + i++));
                Thread.sleep(500);
            }
            else if (workQueue.size() == 0) {
                idle++;
                System.out.println("main method, idle times=" + idle);
                // Stop after the main thread has been idle 20 times in a row
                if (idle > 20) {
                    System.out.println("main method, idle times=" + idle + ",end!");
                    break;
                }
                Thread.sleep(1000);
            }
            else {
                Thread.sleep(2000);
            }
        }
        System.out.println("End!,workQueue.size=" + workQueue.size() +
                ",executorQueue.activeCount=" + executor.getActiveCount() +
                ",executorQueue.completedTaskCount=" + executor.getCompletedTaskCount() + ",i=" + i);
        workQueue.printAll();
        executor.shutdown();
        System.exit(0);
    }

    public static void main(String[] args) throws Exception {
        MainApp app = new MainApp();
        app.start("http://www.csdn.net/");
    }
}
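One subtlety that MainApp relies on: a ThreadPoolExecutor creates threads beyond corePoolSize only when its work queue is already full, which is why executorQueue is bounded. The following small standalone demo (hypothetical sizes, not part of the crawler) makes the rule visible:

import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

/** Hypothetical demo of the pool-growth rule that MainApp depends on. */
public class PoolGrowthDemo {
    public static void main(String[] args) throws InterruptedException {
        // core=2, max=4, queue capacity=2
        ThreadPoolExecutor pool = new ThreadPoolExecutor(2, 4, 60L, TimeUnit.SECONDS,
                new LinkedBlockingDeque<Runnable>(2));
        for (int i = 1; i <= 6; i++) {
            pool.submit(() -> {
                try { Thread.sleep(2000); } catch (InterruptedException ignored) { }
            });
            // The pool grows past the core size (2) only once the queue fills:
            // tasks 1-2 start core threads, 3-4 wait in the queue, 5-6 force
            // extra threads up to the maximum of 4.
            System.out.println("after task " + i + ": poolSize=" + pool.getPoolSize()
                    + ", queued=" + pool.getQueue().size());
        }
        pool.shutdown();
    }
}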
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Custom synchronized work queue for the crawler, backed by an ArrayList.
 *
 * @author hwz
 */
public class SpiderQueue {

    /** URLs waiting to be crawled */
    private List<SpiderUrl> queue;
    /** Every URL ever enqueued; without this, a URL already polled off the
        queue could be re-added and crawled again */
    private Set<SpiderUrl> visited;

    public SpiderQueue(int size) {
        queue = new ArrayList<SpiderUrl>(size);
        visited = new HashSet<SpiderUrl>(size);
    }

    public synchronized void add(SpiderUrl spiderUrl) {
        queue.add(spiderUrl);
        visited.add(spiderUrl);
    }

    public synchronized SpiderUrl poll() {
        if (queue.isEmpty()) {
            return null;
        }
        // Print to the console for easy inspection
        SpiderUrl spiderUrl = queue.remove(0);
        System.out.println("SpiderQueue,poll,SpiderUrl=" + spiderUrl.toString() + ",remain size=" + queue.size());
        return spiderUrl;
    }

    public synchronized SpiderUrl peek() {
        if (queue.isEmpty()) {
            return null;
        }
        return queue.get(0);
    }

    public synchronized boolean isExist(SpiderUrl spiderUrl) {
        return visited.contains(spiderUrl);
    }

    public synchronized int size() {
        return queue.size();
    }

    public synchronized void printAll() {
        System.out.println("Enter printAll.");
        for (SpiderUrl spiderUrl : queue) {
            System.out.println(spiderUrl);
        }
    }
}
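Note the design choice behind isExist: polled URLs leave the list, so membership is checked against a separate visited set that remembers every URL ever enqueued; otherwise the same page could be enqueued and crawled repeatedly. A short hypothetical usage sketch:

/** Hypothetical usage sketch for SpiderQueue's dedup behavior. */
public class SpiderQueueDemo {
    public static void main(String[] args) {
        SpiderQueue queue = new SpiderQueue(16);
        queue.add(new SpiderUrl("http://www.csdn.net/", 0));
        queue.poll();                                  // crawled, leaves the list
        SpiderUrl again = new SpiderUrl("http://www.csdn.net/", 1);
        System.out.println(queue.isExist(again));      // true: the visited set remembers it
        System.out.println(queue.size());              // 0: nothing left to crawl
    }
}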
/**
 * A URL to be crawled, together with its depth from the entry URL.
 *
 * @author hwz
 */
public class SpiderUrl {

    /** http(s) url */
    private String url;
    /** Depth of this url relative to the entry url (entry = 0) */
    private int deep;

    public SpiderUrl(String url, int deep) {
        this.url = url;
        this.deep = deep;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public int getDeep() {
        return deep;
    }

    public void setDeep(int deep) {
        this.deep = deep;
    }

    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof SpiderUrl)) {
            return false;
        }
        SpiderUrl oth = (SpiderUrl) obj;
        // Two SpiderUrls are equal iff their urls match; depth is ignored
        return this.url.equals(oth.getUrl());
    }

    @Override
    public int hashCode() {
        return url.hashCode();
    }

    @Override
    public String toString() {
        return getClass().toString() + "[url:" + url + ",deep:" + deep + "]";
    }
}
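Because equals and hashCode are based on the url alone, the same page discovered at different depths compares equal, which is exactly what the deduplication check needs. A tiny hypothetical sketch:

/** Hypothetical sketch: SpiderUrl identity is the url alone, not the depth. */
public class SpiderUrlEqualityDemo {
    public static void main(String[] args) {
        SpiderUrl first = new SpiderUrl("http://www.csdn.net/", 0);
        SpiderUrl deeper = new SpiderUrl("http://www.csdn.net/", 2);
        System.out.println(first.equals(deeper));                   // true
        System.out.println(first.hashCode() == deeper.hashCode());  // true
        // This is what lets hash-based collections deduplicate a page
        // that is reachable along paths of different lengths.
    }
}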
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Spider worker: the class that does the actual crawling.
 *
 * @author hwz
 */
public class SimpleSpider implements Runnable {

    private String threadName;
    private SpiderUrl url;
    private SpiderQueue workQueue;

    public SimpleSpider(SpiderQueue workQueue, String threadName) {
        this.workQueue = workQueue;
        this.threadName = threadName;
    }

    @Override
    public void run() {
        System.out.println(threadName + " start run");
        // End the task after 10 consecutive idle polls
        int idle = 0;
        while (idle < 10) {
            url = workQueue.poll();
            if (url != null) {
                // Parse the url
                parseUrl(url);
                idle = 0;
            }
            else {
                System.out.println(threadName + " idle...,times=" + idle++);
                try {
                    Thread.sleep(1000);
                }
                catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println(threadName + " end run...");
    }

    /**
     * Fetch and parse one url.
     * @param url
     */
    private void parseUrl(SpiderUrl url) {
        if (url == null) {
            return;
        }
        try {
            int deep = url.getDeep() + 1;
            URL netUrl = new URL(url.getUrl());
            URLConnection connection = netUrl.openConnection();
            String contentType = connection.getContentType();
            // Fetch the content
            String resource = getResource(connection);
            // Extract the title
            String title = getTitle(resource);
            // Extract the links
            List<String> urls = getUrls(resource);
            System.out.println(threadName + ",parseUrl url=" + url + ",contentType=" + contentType +
                    ",title=" + title + ",urls=" + urls);
            // Limit the crawl depth: enqueueing every discovered url would grow
            // the queue exponentially and eventually kill the program.
            if (deep < 3) {
                SpiderUrl newUrl;
                for (String u : urls) {
                    newUrl = new SpiderUrl(u, deep);
                    if (!workQueue.isExist(newUrl)) {
                        workQueue.add(newUrl);
                    }
                }
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Read the content behind an http url.
     * @param connection
     * @return the page body as a String
     */
    private String getResource(URLConnection connection) {
        if (connection == null) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        // Assumes UTF-8; a robust crawler would read the charset from the
        // Content-Type header instead.
        try (InputStream inputStream = connection.getInputStream();
             InputStreamReader isr = new InputStreamReader(inputStream, "UTF-8")) {
            int input;
            while ((input = isr.read()) != -1) {
                sb.append((char) input);
            }
        }
        catch (IOException e) {
            System.out.println(threadName + ",get resource error,connection=" + connection);
        }
        return sb.toString();
    }

    /**
     * Extract the title from page content.
     * @param content
     * @return the title, or null if none was found
     */
    private String getTitle(String content) {
        if (content == null) {
            return null;
        }
        Pattern pattern = Pattern.compile("(<title>.{1,}</title>)");
        Matcher matcher = pattern.matcher(content);
        String title = null;
        if (matcher.find()) {
            title = matcher.group(0).replaceAll("<title>", "").replaceAll("</title>", "");
        }
        return title;
    }

    /**
     * Extract the absolute links contained in page content.
     * @param content
     * @return the list of links
     */
    private List<String> getUrls(String content) {
        if (content == null) {
            return null;
        }
        Pattern pattern = Pattern.compile("(<a.{1,}?href=['\"]?[a-zA-Z]+:\\/\\/[^\\s]*?[\\s>]{1})");
        Matcher matcher = pattern.matcher(content);
        String a;
        String lastChar;
        List<String> links = new ArrayList<String>();
        while (matcher.find()) {
            a = matcher.group(0).replaceAll("<a.{1,}?href=['\"]?", "");
            a = a.trim();
            lastChar = a.substring(a.length() - 1);
            // Strip the trailing quote or '>' captured by the regex
            if (lastChar.equals("'") || lastChar.equals("\"") || lastChar.equals(">")) {
                a = a.substring(0, a.length() - 1);
            }
            links.add(a);
        }
        return links;
    }
}
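Parsing HTML with regular expressions, as getTitle and getUrls do, breaks easily on real pages: unquoted attributes, relative links, and commented-out markup all defeat the patterns above. If adding a dependency is acceptable, an HTML parser such as jsoup is far more robust. A minimal sketch, assuming the jsoup library is available on the classpath:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/** Minimal sketch of title/link extraction using jsoup instead of regexes. */
public class JsoupExtractor {
    public static void main(String[] args) throws IOException {
        Document doc = Jsoup.connect("http://www.csdn.net/").get();
        System.out.println("title=" + doc.title());
        List<String> links = new ArrayList<String>();
        for (Element a : doc.select("a[href]")) {
            links.add(a.attr("abs:href")); // resolves relative links to absolute urls
        }
        System.out.println("links=" + links);
    }
}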

This code example is only meant to illustrate a simple crawler; the multithreading and HTTP handling have not been considered in depth. If you find mistakes, please point them out.
