爬虫代理ip设置
来源:互联网 发布:mac 下的虚拟机性能差 编辑:程序博客网 时间:2024/05/01 08:31
爬虫的过程中可能会涉及到ip代理的问题。网上有很多可用的ip代理信息:
http://pachong.org/
下面就是从中选择一个最合适的ip。实现思路:
1. 获取该页面内容,提取每一行ip信息。
2. 新建一个 JavaBean,封装每一个ip的属性。
3. 将所有的 JavaBean 添加到一个 ArrayList 中。
4. 对整个 ArrayList 按优先级从高到低排序。
代码如下:
/**
 * Downloads the crawler proxies currently listed on http://pachong.org/ and
 * ranks them by {@link ProxyConfigBean#getPriority()}, best first.
 */
public class CrawlProxyIp5Net {

    /** Page whose table with class {@code tb} lists the proxies. */
    private static final String PROXY_LIST_URL = "http://pachong.org/";

    /** Cells 1..5 of a row are read, so a usable row needs at least six cells. */
    private static final int MIN_CELLS = 6;

    /**
     * Fetches the proxy list page and converts every table row into a bean.
     *
     * <p>HTTP and I/O failures are printed and result in whatever was collected
     * so far (normally an empty list); they are not rethrown.
     *
     * @return the proxies sorted by descending priority; never {@code null}
     */
    public static ArrayList<ProxyConfigBean> getProxyConfigs() {
        ArrayList<ProxyConfigBean> list = new ArrayList<ProxyConfigBean>();
        WebClient client = new WebClient(BrowserVersion.CHROME);
        try {
            // Only the static HTML table is needed, so skip script and CSS processing.
            client.getOptions().setJavaScriptEnabled(false);
            client.getOptions().setCssEnabled(false);
            HtmlPage page = client.getPage(PROXY_LIST_URL);
            List<?> bodies = page.getByXPath("//table[@class='tb']/tbody");
            // The page layout may change; a missing table yields an empty result
            // instead of an IndexOutOfBoundsException.
            if (!bodies.isEmpty()) {
                HtmlTableBody tableBody = (HtmlTableBody) bodies.get(0);
                List<HtmlTableRow> tableRows = tableBody.getRows();
                if (tableRows != null) {
                    for (HtmlTableRow tableRow : tableRows) {
                        ProxyConfigBean bean = toBean(tableRow);
                        if (bean != null) {
                            list.add(bean);
                        }
                    }
                }
            }
        } catch (FailingHttpStatusCodeException e) {
            e.printStackTrace();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the client even when fetching or parsing failed.
            client.closeAllWindows();
        }
        Collections.sort(list, new Comparator<ProxyConfigBean>() {
            @Override
            public int compare(ProxyConfigBean bean1, ProxyConfigBean bean2) {
                // Arguments are swapped to sort in descending order; Integer.compare
                // avoids the overflow risk of subtracting the two scores.
                return Integer.compare(bean2.getPriority(), bean1.getPriority());
            }
        });
        return list;
    }

    /**
     * Builds a bean from one table row.
     *
     * @param tableRow a row of the proxy table
     * @return the populated bean, or {@code null} when the row has too few cells
     */
    private static ProxyConfigBean toBean(HtmlTableRow tableRow) {
        if (tableRow.getCells().size() < MIN_CELLS) {
            return null;
        }
        ProxyConfigBean bean = new ProxyConfigBean();
        bean.setIp(cellText(tableRow.getCell(1)));
        String portValue = cellText(tableRow.getCell(2));
        // Accept only 1-5 digits: this always fits in an int, so parseInt cannot
        // throw. Anything else leaves the port at its default of 0.
        if (portValue.matches("[0-9]{1,5}")) {
            bean.setPort(Integer.parseInt(portValue));
        }
        bean.setCountry(cellText(tableRow.getCell(3)));
        bean.setType(cellText(tableRow.getCell(4)));
        bean.setStatu(cellText(tableRow.getCell(5)));
        // The priority is derived from the other fields, so it must be set last.
        bean.setPriority();
        return bean;
    }

    /**
     * Returns the cleaned text of a cell.
     *
     * <p>NOTE(review): {@code MyStringUtils.pureString} is defined elsewhere; the
     * original code null-checked its result, so {@code null} is treated as
     * possible here and mapped to an empty string.
     */
    private static String cellText(HtmlTableCell cell) {
        String text = MyStringUtils.pureString(cell.asText());
        return text == null ? "" : text;
    }

    /**
     * Picks the highest-priority proxy and wraps it in a {@code ProxyConfig}.
     *
     * @return the proxy configuration, or {@code null} when no proxy was found
     */
    public static ProxyConfig getProxyConfig() {
        ArrayList<ProxyConfigBean> list = getProxyConfigs();
        if (list == null || list.isEmpty()) {
            return null;
        }
        ProxyConfigBean bean = list.get(0);
        ProxyConfig proxyConfig = new ProxyConfig();
        proxyConfig.setProxyHost(bean.getIp());
        proxyConfig.setProxyPort(bean.getPort());
        return proxyConfig;
    }

    /** Prints country, port and IP of every proxy found, best first. */
    public static void main(String[] args) {
        ArrayList<ProxyConfigBean> list = getProxyConfigs();
        for (ProxyConfigBean bean : list) {
            System.out.println(bean.getCountry() + " " + bean.getPort() + " " + bean.getIp());
        }
        System.out.println("done.......");
    }
}
其中 JavaBean 代码:
/**
 * One proxy entry (address plus descriptive attributes) together with a
 * priority score derived from those attributes; a higher score is better.
 */
public class ProxyConfigBean {
    private String ip;
    private int port;
    private String country; // China or another country
    private String type;    // anonymity level: high, anonymous, elite, transparent
    private String statu;   // load status: idle, fairly busy, busy
    private int priority;   // ranking score computed by setPriority()

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public int getPort() {
        return port;
    }

    public void setPort(int port) {
        this.port = port;
    }

    public String getCountry() {
        return country;
    }

    public void setCountry(String country) {
        this.country = country;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getStatu() {
        return statu;
    }

    public void setStatu(String statu) {
        this.statu = statu;
    }

    public int getPriority() {
        return priority;
    }

    /**
     * Recomputes the priority from the current country, status, type and port.
     *
     * <p>The score is rebuilt from zero on every call, so calling this method
     * repeatedly gives the same result. Fields that are still {@code null}
     * contribute nothing. Call it after the other setters.
     *
     * <ul>
     *   <li>country contains "中国": +50</li>
     *   <li>status contains "空闲" (idle): +20, else contains "较忙" (fairly busy): +10</li>
     *   <li>type: anonymous +5, else high +4, else transparent +3, else elite +2</li>
     *   <li>port 80: +1</li>
     * </ul>
     */
    public void setPriority() {
        int score = 0;

        // Country: proxies located in China are strongly preferred.
        if (country != null && country.contains("中国")) {
            score += 50;
        }

        // Status: idle beats fairly busy; anything else (e.g. busy) adds nothing.
        if (statu != null) {
            if (statu.contains("空闲")) {
                score += 20;
            } else if (statu.contains("较忙")) {
                score += 10;
            }
        }

        // Type: the first matching keyword wins, so "anonymous" takes precedence
        // over "high" when both appear in the text.
        if (type != null) {
            if (type.contains("anonymous")) {
                score += 5;
            } else if (type.contains("high")) {
                score += 4;
            } else if (type.contains("transparent")) {
                score += 3;
            } else if (type.contains("elite")) {
                score += 2;
            }
        }

        // Port: 80 gets a small bonus.
        if (port == 80) {
            score += 1;
        }

        this.priority = score;
    }
}
- 爬虫代理ip设置
- 爬虫代理ip设置
- Python爬虫设置代理IP爬取知乎图片
- Python爬虫技巧---设置代理IP
- selenium+python设置爬虫代理IP
- 爬虫代理IP
- Python 爬虫IP代理
- 爬虫 IP代理策略
- python IP代理爬虫,download 代理IP
- 爬虫设置代理
- Scrapy爬虫:代理IP配置
- 爬虫IP代理资源汇总
- Jsoup使用代理ip爬虫
- 如何使用ip代理爬虫
- python3爬虫之IP代理
- python 爬虫获取代理Ip
- 代理ip的爬虫实现
- python3爬虫伪装代理IP
- 阶段自考之二
- 大数阶乘(nyoj28)
- 解压缩Android 根文件系统 ramdisk
- c中堆和栈的区别(整理别人的资料)
- GPIO 模拟红外发射(NEC)
- 爬虫代理ip设置
- vs 提示图标的含义
- 比较数据库中的日期格式
- cocos2d-x学习笔记——坐标系,锚点,文本类
- Runtime Engine Architecture
- 定点数与浮点数区别
- ORA-19573: cannot obtain exclusive enqueue for datafile 2
- rtc实时时钟
- WDK与DDK的区别