Java爬虫——代理IP
来源:互联网 发布:网络登山鞋 编辑:程序博客网 时间:2024/06/05 15:30
1 核心代码-爬虫类
package cn.tyoui.httpclient;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Simple page crawler that downloads URLs to local HTML files, optionally
 * routing each request through a randomly chosen proxy from a pool loaded
 * via {@link #proxyInit(String)} (one {@code ip:port} entry per line).
 */
public class HttpCrawler {

    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                    + "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36";

    private final CloseableHttpClient httpClient = HttpClients.createDefault();

    /** Proxy pool; {@code null} means "connect directly, no proxy". */
    private List<ProxyIP> list = null;

    /** Directory where downloaded pages are saved. */
    private String dir = null;

    /**
     * Loads the proxy pool from a text file containing one {@code ip:port}
     * entry per line.
     *
     * @param proxyText path of the proxy list file
     * @throws Exception if the file cannot be read or an entry is malformed
     */
    public void proxyInit(String proxyText) throws Exception {
        list = new ArrayList<>();
        // Explicit charset: the no-charset overload is deprecated and
        // platform-dependent.
        for (String line : FileUtils.readLines(new File(proxyText), "UTF-8")) {
            String[] parts = line.split(":");
            list.add(new ProxyIP(parts[0], Integer.parseInt(parts[1])));
        }
    }

    /**
     * Downloads one page into {@link #dir}, skipping pages that were already
     * fetched and look complete.
     *
     * @param webURL the URL to crawl
     * @throws Exception if crawling fails
     */
    public void startCrawler(String webURL) throws Exception {
        // File name = last path segment of the URL. The +1 drops the slash
        // itself, which the original kept in the file name by mistake.
        String path = dir + File.separator
                + webURL.substring(webURL.lastIndexOf('/') + 1) + ".html";
        File file = new File(path);
        // Heuristic "already downloaded" check: exists and is > 20 KB.
        if (file.exists() && file.length() > 20_000)
            return;
        // Guard against an empty pool: Random.nextInt(0) would throw.
        if (list == null || list.isEmpty()) {
            crawler(webURL, path, null, 0);
        } else {
            int index = new Random().nextInt(list.size());
            crawler(webURL, path, list.get(index), index);
        }
    }

    /**
     * Performs one HTTP GET — optionally through a proxy — and writes the
     * response body to {@code path}. A proxy that yields a non-200 status is
     * removed from the pool so it is not picked again. Failures are logged,
     * not rethrown.
     *
     * @param url   URL to fetch
     * @param path  destination file for the page body
     * @param proxy proxy to route through, or {@code null} for direct access
     * @param index index of {@code proxy} inside the pool
     * @throws IOException if closing the response fails
     */
    private void crawler(String url, String path, ProxyIP proxy, int index)
            throws IOException {
        CloseableHttpResponse response = null;
        HttpGet httpGet = null;
        try {
            httpGet = new HttpGet(url);
            httpGet.setHeader("User-Agent", USER_AGENT);
            RequestConfig.Builder config = RequestConfig.custom()
                    .setConnectTimeout(2000)
                    .setSocketTimeout(1000);
            if (proxy != null)
                config.setProxy(new HttpHost(proxy.getIp(), proxy.getPort()));
            httpGet.setConfig(config.build());
            response = httpClient.execute(httpGet);
            int status = response.getStatusLine().getStatusCode();
            if (status == 200) {
                HttpEntity entity = response.getEntity();
                // try-with-resources: the original never closed this stream.
                try (FileOutputStream out = new FileOutputStream(path)) {
                    entity.writeTo(out);
                }
                System.out.println("下载成功!" + url);
            } else {
                // Evict the failing proxy before reporting the error.
                if (list != null)
                    list.remove(index);
                throw new IOException("爬取到的网页非正常!");
            }
        } catch (Exception e) {
            System.err.println(e);
            System.err.println("下载失败!" + url);
        } finally {
            // The original called httpGet.clone() here — a typo for releasing
            // the connection (hence the stray CloneNotSupportedException in
            // its throws clause).
            if (httpGet != null)
                httpGet.releaseConnection();
            if (response != null)
                response.close();
        }
    }

    /**
     * Sets — and creates, if necessary — the directory pages are saved to.
     *
     * @param dir target directory
     */
    public void setDir(String dir) {
        this.dir = dir;
        File file = new File(dir);
        if (!file.exists())
            file.mkdirs();
    }

    /** Closes the underlying HTTP client. */
    public void close() {
        try {
            httpClient.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * @return the proxy pool, or {@code null} if {@link #proxyInit(String)}
     *         was never called
     */
    public List<ProxyIP> getList() {
        return list;
    }

    /** Demo entry point. */
    public static void main(String[] args) throws Exception {
        HttpCrawler httpCrawler = new HttpCrawler();
        httpCrawler.setDir("D:\\baidu"); // directory for saved pages
        // httpCrawler.proxyInit("E:\\IDECode\\StringUtils\\text\\代理ip.txt"); // proxy list file
        httpCrawler.startCrawler("http://www.baidu.com"); // URL to crawl
        httpCrawler.close(); // release the client
    }
}
2 代理IP实体类
package cn.tyoui.httpclient;

/**
 * Value object holding one proxy endpoint: a host address and a port.
 */
class ProxyIP {

    private String ip;
    private int port;

    /**
     * @param ip   proxy host address
     * @param port proxy port
     */
    ProxyIP(String ip, int port) {
        this.ip = ip;
        this.port = port;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public int getPort() {
        return port;
    }

    public void setPort(int port) {
        this.port = port;
    }
}
3 maven pom.xml
<properties>
    <commons-httpclient>4.5.3</commons-httpclient>
    <commons-io.version>2.4</commons-io.version>
</properties>

<dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>${commons-httpclient}</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>${commons-io.version}</version>
    </dependency>
</dependencies>
4 JDK 1.8
<build>
    <plugins>
        <!-- Copy resources as UTF-8 so non-ASCII files are not mangled. -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-resources-plugin</artifactId>
            <version>2.7</version>
            <configuration>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
        <!-- Compile with JDK 1.8 source/target, UTF-8 sources. -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.6.0</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
    </plugins>
</build>
5 代理IP格式
阅读全文
0 0
- Java爬虫——代理IP
- Java爬虫爬取代理ip
- Python 爬虫入门(二)—— IP代理使用
- python爬虫——构建代理ip池
- Python爬虫实战——代理IP全部抓取
- python 网络爬虫——请求头,ip代理
- 爬虫入门2——爬代理ip地址
- 爬虫代理ip设置
- 爬虫代理ip设置
- 爬虫代理IP
- Python 爬虫IP代理
- 爬虫 IP代理策略
- scrapy爬虫代理——利用crawlera神器,无需再寻找代理IP
- scrapy爬虫代理——利用crawlera神器,无需再寻找代理IP
- python IP代理爬虫,download 代理IP
- python定向爬虫——爬取某代理Ip网站上的所有ip
- Scrapy爬虫:代理IP配置
- 爬虫IP代理资源汇总
- Two Sum
- PHP获取数组中指定的一列
- vue.js 在微信公众号内实现视频播放
- 集合总结
- PowerDesigner导出Excel
- Java爬虫——代理IP
- mac电脑修改网卡mac地址
- MyBatis中order by排序无效的问题
- 读取文件内的数据(数字)并进行三种排序,1(快速排序)2(归并排序)3(希尔排序)。
- SpringBoot中启动HTTPS
- windows查询文件中是否存在字符串
- 大数据量的算法面试题
- mybatis if test 字符串判断问题
- vue.js 监控 视频播放