搜索引擎爬虫,抓取url

来源:互联网 发布:淘宝店装修教程视频 编辑:程序博客网 时间:2024/04/28 16:21
package se.robot;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

015public class Robot {
016 
017    // robot url
018    private List<String> urlList;
019    // cache url
020    private Queue<String> urlQueue;
021    // define Host
022    public final static String HOST = "debugs.tk";
023 
024    // constructor
025    public Robot() {
026        super();
027        // initialization robot's member
028        setUrlList(new LinkedList<String>());
029        setUrlQueue(new LinkedList<String>());
030    }
031 
032    // url
033    public List<String> getUrlList() {
034        return urlList;
035    }
036 
037    public void setUrlList(List<String> urlList) {
038        this.urlList = urlList;
039    }
040 
041    // cache
042    public Queue<String> getUrlQueue() {
043        return urlQueue;
044    }
045 
046    public void setUrlQueue(Queue<String> urlQueue) {
047        this.urlQueue = urlQueue;
048    }
049 
050    // Legal link
051    private boolean isURL(String url) {
052        try {
053            // judge url
054            Pattern pattern = Pattern.compile("^[a-zA-z]+://[^\\s]*");
055            Matcher matcher = pattern.matcher(url);
056            if (matcher.matches()) {
057                return true;
058            else {
059                return false;
060            }
061        catch (Exception e) {
062            e.printStackTrace();
063            return false;
064        }
065    }
066 
067    // whether the url is belong to host
068    public static boolean isHost(String url) {
069        return url.contains(HOST);
070    }
071 
072    // travel all url
073    public void traverse(String seed) {
074 
075        for (this.getUrlQueue().add(seed); !this.getUrlQueue().isEmpty();) {
076            boolean flag = true;
077            Document document = null;
078            try {
079                document = Jsoup.connect(seed).timeout(5000).get();
080            catch (IOException e) {
081                e.printStackTrace();
082                // whether connect success
083                flag = false;
084            }
085            // whether connect success,then select a tag
086            // add these aTag into queue
087            if (flag) {
088        // get url
089                Elements elements = document.select("a[href]");
090                for (Element e : elements) {
091                    String s = e.attr("abs:href");
092                    // Legal link and belong host
093                    // and url not in list
094                    // then add it
095                    if (isURL(s) && s.contains(HOST)
096                            && (!getUrlQueue().contains(s))
097                            && (!getUrlList().contains(s))) {
098                        this.getUrlQueue().add(s);
099                    }
100                }
101            }
102            // get head of queue
103            // and set it seed
104            // travel seed it again
105            seed = this.getUrlQueue().poll();
106            this.getUrlList().add(seed);
107            // show information
108            // System.out.println("SIZE:"
109            // + this.getUrlQueue().size() + "---"
110            // + seed + " connect!");
111        }
112    }
113 
114    // public static void main(String[] args) {
115    // Robot robot = new Robot();
116    // robot.traverse("http://debugs.tk");
117    // List<String> list = robot.getUrlList();
118    // for (String s : list) {
119    // System.out.println(s);
120    // }
121    // }
122 
123}
注:代码中 `import org.jsoup.Jsoup;` 所依赖的 jsoup 库可以在 oschina 社区找到:
http://www.oschina.net/p/jsoup

原创粉丝点击