Jsoup爬数据+设置代理IP

来源:互联网 发布:地震能被预测吗 知乎 编辑:程序博客网 时间:2024/05/01 13:44

本文利用Jsoup工具从代理列表网站中爬取代理IP(格式为"主机:端口"),然后在请求时动态切换所使用的HTTP代理(而不是修改本机IP),通过这些代理去访问目标地址。
主要工作类:

public class Test {    /**     * @param args     */    public static void main(String[] args) {        // TODO Auto-generated method stub        parse();    }    public static void parse() {        // blogBody("");        List<String> list = null;        try {            list = getHtml();        } catch (IOException e) {            // TODO Auto-generated catch block            e.printStackTrace();        }        String path = "/Users/tianjia/Documents/article";        List<String> articles = FileUtil.getListFromFile(path);        ExecutorService executorService = Executors.newCachedThreadPool();        int len_article = articles.size();        for (int i = 0; i < len_article; i++) {            executorService.execute(new MyRun(articles.get(i), list));        }    }    private static List<String> getHtml() throws IOException {        Document doc = null;        try {            // doc = Jsoup.connect("http://www.baidu.com")            doc = Jsoup.connect("http://www.xicidaili.com/nt")            // .data("query", "Java")                    .userAgent("Mozilla")                    // .cookie("auth", "token")                    // .timeout(3000)                    .get();        } catch (IOException e) {            // TODO Auto-generated catch block            e.printStackTrace();        }        List<String> list = new ArrayList<String>();        Elements elements = doc.select("tr.odd");        int len = elements.size();        Element element = null;        for (int i = 0; i < len; i++) {            element = elements.get(i);            StringBuilder sBuilder = new StringBuilder(20);            sBuilder.append(element.child(1).text());            sBuilder.append(":");            sBuilder.append(element.child(2).text());            list.add(sBuilder.toString());        }        // System.out.println(doc.html());        doc = null;        elements.clear();        elements = null;        return list;    }    public static void visit(String ip, String url){        // 
prop.setProperty("http.proxyHost", "183.45.78.31");        // 设置http访问要使用的代理服务器的端口        // prop.setProperty("http.proxyPort", "8080");        String[] r = ip.split(":");        System.getProperties().setProperty("http.proxyHost", r[0]);        System.getProperties().setProperty("http.proxyPort", r[1]);        try {            // doc = Jsoup.connect("http://www.baidu.com")            Jsoup.connect(url)            // .data("query", "Java")                    .userAgent("Mozilla")                    // .cookie("auth", "token")                    // .timeout(3000)                    .get();        } catch (IOException e) {            // TODO Auto-generated catch block            e.printStackTrace();        }    }}

自定义线程类:

public class MyRun implements Runnable{    private List<String> list;    private String urlString;    public MyRun(String url,List<String> list) {        this.list =  list;        this.urlString = url;    }    @Override    public void run() {        // TODO Auto-generated method stub        int len = list.size();        for (int i = 0; i < len; i++) {            Test.visit(list.get(i), urlString);            try {                Thread.sleep(1000);            } catch (InterruptedException e) {                // TODO Auto-generated catch block                e.printStackTrace();            }        }    }}

文件操作类:

/**
 * File helpers used by the crawler.
 */
public class FileUtil {

    /**
     * Reads a text file into a list, one element per line, echoing each line
     * to standard output as it is read.
     *
     * <p>The file is decoded with the platform default charset. If the file
     * cannot be opened or read, the error is reported on standard error and
     * the lines read so far (possibly none) are returned.
     *
     * @param path location of the file to read
     * @return the lines of the file in order; never {@code null}
     */
    public static List<String> getListFromFile(String path) {
        List<String> list = new ArrayList<>();
        // try-with-resources closes the reader (and the underlying file
        // stream) on both the normal and the exceptional path.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(path)))) {
            String data;
            while ((data = br.readLine()) != null) {
                System.out.println(data);
                list.add(data);
            }
        } catch (IOException e) {
            System.err.println("Could not read " + path + ": " + e);
        }
        return list;
    }
}
1 0