网页爬虫

来源:互联网 发布:owncloud php版本 编辑:程序博客网 时间:2024/06/05 22:01

程序小白,希望和大家多交流,共同学习
程序从给定的URL出发,沿着页面中的链接逐个遍历网页,并记录最先访问的100个网址。

import java.net.URL;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Queue;
import java.util.Scanner;
import java.util.Set;

/**
 * A minimal breadth-first web crawler.
 *
 * <p>Starting from a URL typed by the user, it downloads each page, extracts every
 * substring that starts with {@code http} and ends before the next double quote,
 * and keeps following those links until {@link #MAX_URLS} distinct URLs have been
 * visited or no links remain.
 */
public class WebCrawler {

    /** Maximum number of distinct URLs visited before the crawl stops. */
    private static final int MAX_URLS = 100;

    /**
     * Prompts for a starting URL on standard input and crawls from it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try (Scanner input = new Scanner(System.in)) {
            System.out.print("Enter a URL : ");
            // Nothing to crawl if standard input is already at end-of-file.
            if (input.hasNextLine()) {
                crawler(input.nextLine());
            }
        }
    }

    /**
     * Crawls breadth-first from {@code startingURL}, printing each URL the first
     * time it is visited. Stops after {@link #MAX_URLS} distinct URLs or when the
     * queue of pending links is empty.
     *
     * @param startingURL the URL to start from; {@code null} is ignored
     */
    public static void crawler(String startingURL) {
        if (startingURL == null) {
            return;
        }

        Queue<String> pendingUrls = new ArrayDeque<>();
        // Insertion-ordered so the set holds the visited URLs in visiting order.
        Set<String> visitedUrls = new LinkedHashSet<>();
        pendingUrls.add(startingURL);

        // Strictly less than: stop once exactly MAX_URLS pages have been visited.
        while (!pendingUrls.isEmpty() && visitedUrls.size() < MAX_URLS) {
            String urlString = pendingUrls.remove();
            // add() returns false for an already-visited URL, so each page is
            // fetched and parsed at most once.
            if (visitedUrls.add(urlString)) {
                System.out.println("Crawel " + urlString);
                pendingUrls.addAll(getSubURLs(urlString));
            }
        }
    }

    /**
     * Downloads the resource at {@code urlString} and returns every link found in it,
     * in document order. Duplicates are kept.
     *
     * <p>This is best-effort: if the URL is malformed or the resource cannot be read,
     * the exception message is printed and whatever was collected so far (possibly
     * nothing) is returned.
     *
     * @param urlString the address of the page to scan
     * @return the links found on the page; never {@code null}
     */
    public static ArrayList<String> getSubURLs(String urlString) {
        ArrayList<String> list = new ArrayList<>();
        // Decode as UTF-8 so results do not depend on the platform default charset;
        // closing the Scanner also closes the underlying connection stream.
        try (Scanner input = new Scanner(new URL(urlString).openStream(), "UTF-8")) {
            while (input.hasNextLine()) {
                extractLinks(input.nextLine(), list);
            }
        } catch (Exception ex) {
            // Deliberately broad: one bad page must not abort the whole crawl.
            System.out.println(ex.getMessage());
        }
        return list;
    }

    /**
     * Appends to {@code links} every substring of {@code line} that starts with
     * {@code http} and runs up to (not including) the next double quote.
     */
    private static void extractLinks(String line, List<String> links) {
        int start = line.indexOf("http");
        // >= 0 so that a link beginning at column 0 is not skipped.
        while (start >= 0) {
            int end = line.indexOf('"', start);
            if (end < 0) {
                // No closing quote on this line: the link cannot be delimited.
                return;
            }
            links.add(line.substring(start, end));
            start = line.indexOf("http", end);
        }
    }
}
原创粉丝点击