Java爬虫——代理IP

来源:互联网 发布:网络登山鞋 编辑:程序博客网 时间:2024/06/05 15:30
1 核心代码-爬虫类
package cn.tyoui.httpclient;import org.apache.commons.io.FileUtils;import org.apache.http.HttpEntity;import org.apache.http.HttpHost;import org.apache.http.client.config.RequestConfig;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.util.ArrayList;import java.util.List;import java.util.Random;/** * 爬虫网页 */public class HttpCrawler {    private CloseableHttpClient httpClient = HttpClients.createDefault();    private List<ProxyIP> list = null;    //保存爬取的网页    private String dir = null;    /**     * 代理初始化     *     * @throws Exception     */    public void proxyInit(String proxyText) throws Exception {        list = new ArrayList<>();        List<String> listIP = FileUtils.readLines(new File(proxyText));        for (String str : listIP) {            String ip = str.split(":")[0];            int port = Integer.parseInt(str.split(":")[1]);            ProxyIP proxyIp = new ProxyIP(ip, port);            list.add(proxyIp);        }    }    /**     * 开始爬取     *     * @param webURL 要爬取的网址     * @throws Exception 爬取失败     */    public void startCrawler(String webURL) throws Exception {        String path = dir + File.separator + webURL.substring(webURL.lastIndexOf("/")) + ".html";        File file = new File(path);        if (file.exists() && file.length() > 20_000)            return;        if (list == null) {            crawler(webURL, path, null, 0);        } else {            int index = new Random().nextInt(list.size());            crawler(webURL, path, list.get(index), index);        }    }    /**     * 爬虫     *     * @param url   要爬的网址     * @param path  保存的路径     * @param proxy 代理ip的对象     * @param index 第几个代理ip     * @throws CloneNotSupportedException 关闭流失败     * @throws IOException                关闭流失败     */    private 
void crawler(String url, String path, ProxyIP proxy, int index) throws CloneNotSupportedException, IOException {        CloseableHttpResponse response = null;        HttpGet httpGet = null;        try {            httpGet = new HttpGet(url);            httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 " +                    "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36");            RequestConfig requestConfig = null;            if (proxy == null) {                requestConfig = RequestConfig.custom().setConnectTimeout(2000).setSocketTimeout(1000).build();            } else {                HttpHost httpHost = new HttpHost(proxy.getIp(), proxy.getPort());                requestConfig = RequestConfig.custom().setProxy(httpHost).setConnectTimeout(2000).setSocketTimeout(1000).build();            }            httpGet.setConfig(requestConfig);            response = httpClient.execute(httpGet);            int status = response.getStatusLine().getStatusCode();            if (status == 200) {                HttpEntity entity = response.getEntity();                entity.writeTo(new FileOutputStream(path));                System.out.println("下载成功!" + url);            } else {                if (list != null)                    list.remove(index);                throw new Exception("爬取到的网页非正常!");            }        } catch (Exception e) {            System.err.println(e);            System.err.println("下载失败!" 
+ url);        } finally {            if (httpGet != null)                httpGet.clone();            if (response != null)                response.close();        }    }    /**     * 保存爬取网页发的文件夹     *     * @param dir 文件夹     */    public void setDir(String dir) {        this.dir = dir;        File file=new File(dir);        if(!file.exists())            file.mkdirs();    }    /**     * 关闭爬取流     */    public void close() {        try {            httpClient.close();        } catch (IOException e) {            e.printStackTrace();        }    }    /**     * 获取代理ip链表     *     * @return     */    public List<ProxyIP> getList() {        return list;    }
/** * 测试 */  public static void main(String[] args) throws Exception {    HttpCrawler httpCrawler = new HttpCrawler();    httpCrawler.setDir("D:\\baidu");//添加保存网页文件夹    //  httpCrawler.proxyInit("E:\\IDECode\\StringUtils\\text\\代理ip.txt");//代理ip文本路径    httpCrawler.startCrawler("http://www.baidu.com");//要爬取的网址    httpCrawler.close();//关闭爬虫流  }
}
2 代理IP实体类
package cn.tyoui.httpclient;

/**
 * Mutable value object describing a single proxy endpoint: a host IP and a port.
 */
class ProxyIP {

    private String ip;
    private int port;

    /**
     * @param ip   proxy host address
     * @param port proxy port
     */
    ProxyIP(String ip, int port) {
        this.ip = ip;
        this.port = port;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public int getPort() {
        return port;
    }

    public void setPort(int port) {
        this.port = port;
    }
}
3 maven pom.xml
<properties>
    <!-- org.apache.httpcomponents:httpclient 4.x — not the legacy commons-httpclient 3.x artifact -->
    <httpclient.version>4.5.3</httpclient.version>
    <commons-io.version>2.4</commons-io.version>
</properties>
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>${httpclient.version}</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>${commons-io.version}</version>
</dependency>
</dependencies>
4 JDK 1.8
<build>    <plugins>        <plugin>            <groupId>org.apache.maven.plugins</groupId>            <artifactId>maven-resources-plugin</artifactId>            <version>2.7</version>            <configuration>                <encoding>UTF-8</encoding>            </configuration>        </plugin>        <plugin>            <groupId>org.apache.maven.plugins</groupId>            <artifactId>maven-compiler-plugin</artifactId>            <version>3.6.0</version>            <configuration>                <source>1.8</source>                <target>1.8</target>                <encoding>UTF-8</encoding>            </configuration>        </plugin>    </plugins></build>

5 代理IP格式