使用阻塞队列爬取代理ip实现爬虫
来源:互联网 发布:在淘宝上买微星笔记本 编辑:程序博客网 时间:2024/05/16 15:24
package com.yanshu.service;
/*import org.apache.commons.io.IOUtils;*/
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.net.ssl.HttpsURLConnection;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.*;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;
public class Test {

    /**
     * Producer: scrapes the xicidaili free-proxy listing page, extracts
     * IP/port pairs from adjacent table cells, and puts each reachable
     * proxy on the shared queue.
     *
     * @param queue blocking queue that receives validated {@code Ip} entries
     */
    public static void getIpAddress(BlockingQueue<Ip> queue) {
        try {
            Document doc = Jsoup.connect("http://www.xicidaili.com/nn/")
                    .data("query", "Java")
                    .userAgent("Netscape/5")
                    .cookie("auth", "token")
                    .timeout(3000) // fail fast instead of hanging on a slow page
                    .get();
            // Matches a dotted-quad IPv4 address where every octet is 0-255.
            String regex =
                    "((?:(?:25[0-5]|2[0-4]\\d|((1\\d{2})|([1-9]?\\d)))\\.){3}(?:25[0-5]|2[0-4]\\d|((1\\d{2})|([1-9]?\\d))))";
            // Select only the <td> cells whose text is an IPv4 address.
            Elements elements = doc.select("td:matches(" + regex + ")");
            for (Element cell : elements) {
                Element portCell = cell.nextElementSibling();
                if (portCell == null) {
                    continue; // malformed row: IP cell with no adjacent port cell
                }
                String ip = cell.text();
                String port = portCell.text();
                if (isPing(ip)) {
                    System.out.println(ip + " " + port);
                    try {
                        queue.put(new Ip(ip, port));
                    } catch (InterruptedException ie) {
                        // Restore interrupt status and stop producing.
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Best-effort reachability probe.
     *
     * @param ip host name or textual IP address; may be {@code null}
     * @return {@code true} only when the host answered within 500 ms;
     *         any resolution or I/O failure is treated as unreachable
     */
    public static boolean isPing(String ip) {
        if (ip == null) {
            return false;
        }
        try {
            return InetAddress.getByName(ip).isReachable(500);
        } catch (IOException e) {
            // UnknownHostException is an IOException: either way, unreachable.
            return false;
        }
    }

    /**
     * Wires the producer (proxy scraper) and consumer (proxy user) together
     * through a shared blocking queue and starts one thread for each.
     */
    public static void main(String[] args) {
        final BlockingQueue<Ip> queue = new LinkedBlockingDeque<>();
        Thread producer = new Thread(new Runnable() {
            @Override
            public void run() {
                getIpAddress(queue);
            }
        });
        Thread consumer = new Thread(new Runnable() {
            @Override
            public void run() {
                parse(queue);
            }
        });
        producer.start();
        consumer.start();
    }

    /**
     * Consumer: takes proxies off the queue and fetches a test URL through
     * each one repeatedly, reusing a proxy until a request through it fails,
     * then moving on to the next queued proxy.
     *
     * @param queue blocking queue of candidate proxies; blocks when empty
     */
    public static void parse(BlockingQueue<Ip> queue) {
        while (true) {
            Ip ip;
            try {
                ip = queue.take();
            } catch (InterruptedException e) {
                // Restore interrupt status and stop consuming
                // (the original would NPE on a null ip here).
                Thread.currentThread().interrupt();
                return;
            }
            // Reuse this proxy until a request through it fails.
            while (true) {
                System.out.println(ip.ip + " " + ip.port);
                SocketAddress addr = new InetSocketAddress(ip.ip, Integer.parseInt(ip.port));
                Proxy proxy = new Proxy(Proxy.Type.HTTP, addr);
                try {
                    URL url = new URL("https://api.douban.com/v2/book/isbn/7505715666");
                    HttpsURLConnection conn = (HttpsURLConnection) url.openConnection(proxy);
                    conn.setConnectTimeout(5000);
                    conn.setReadTimeout(5000); // don't block forever on a stalled proxy
                    conn.setRequestProperty("User-Agent",
                            "Mozilla/4.0 (compatible; MSIE 7.0; NT 5.1; GTB5; .NET CLR 2.0.50727; CIBA)");
                    conn.connect();
                    StringBuilder result = new StringBuilder();
                    // try-with-resources closes the reader (and stream) even on error.
                    try (BufferedReader in = new BufferedReader(
                            new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
                        String line;
                        while ((line = in.readLine()) != null) {
                            result.append(line);
                        }
                    }
                    System.out.println(result);
                } catch (Exception e) {
                    break; // this proxy failed; take the next one from the queue
                }
            }
        }
    }
}
/**
 * Immutable value holder for one proxy endpoint: the IP address and port,
 * both kept as the raw strings scraped from the listing page (the port is
 * parsed to an int only by the consumer).
 */
class Ip {
    final String ip;
    final String port;

    /**
     * @param ip   dotted-quad IPv4 address text
     * @param port TCP port as text
     */
    public Ip(String ip, String port) {
        this.ip = ip;
        this.port = port;
    }

    @Override
    public String toString() {
        return ip + ":" + port;
    }
}
/*import org.apache.commons.io.IOUtils;*/
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.net.ssl.HttpsURLConnection;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.*;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;
public class Test {

    /**
     * Producer: scrapes the xicidaili free-proxy listing page, extracts
     * IP/port pairs from adjacent table cells, and puts each reachable
     * proxy on the shared queue.
     *
     * @param queue blocking queue that receives validated {@code Ip} entries
     */
    public static void getIpAddress(BlockingQueue<Ip> queue) {
        try {
            Document doc = Jsoup.connect("http://www.xicidaili.com/nn/")
                    .data("query", "Java")
                    .userAgent("Netscape/5")
                    .cookie("auth", "token")
                    .timeout(3000) // fail fast instead of hanging on a slow page
                    .get();
            // Matches a dotted-quad IPv4 address where every octet is 0-255.
            String regex =
                    "((?:(?:25[0-5]|2[0-4]\\d|((1\\d{2})|([1-9]?\\d)))\\.){3}(?:25[0-5]|2[0-4]\\d|((1\\d{2})|([1-9]?\\d))))";
            // Select only the <td> cells whose text is an IPv4 address.
            Elements elements = doc.select("td:matches(" + regex + ")");
            for (Element cell : elements) {
                Element portCell = cell.nextElementSibling();
                if (portCell == null) {
                    continue; // malformed row: IP cell with no adjacent port cell
                }
                String ip = cell.text();
                String port = portCell.text();
                if (isPing(ip)) {
                    System.out.println(ip + " " + port);
                    try {
                        queue.put(new Ip(ip, port));
                    } catch (InterruptedException ie) {
                        // Restore interrupt status and stop producing.
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Best-effort reachability probe.
     *
     * @param ip host name or textual IP address; may be {@code null}
     * @return {@code true} only when the host answered within 500 ms;
     *         any resolution or I/O failure is treated as unreachable
     */
    public static boolean isPing(String ip) {
        if (ip == null) {
            return false;
        }
        try {
            return InetAddress.getByName(ip).isReachable(500);
        } catch (IOException e) {
            // UnknownHostException is an IOException: either way, unreachable.
            return false;
        }
    }

    /**
     * Wires the producer (proxy scraper) and consumer (proxy user) together
     * through a shared blocking queue and starts one thread for each.
     */
    public static void main(String[] args) {
        final BlockingQueue<Ip> queue = new LinkedBlockingDeque<>();
        Thread producer = new Thread(new Runnable() {
            @Override
            public void run() {
                getIpAddress(queue);
            }
        });
        Thread consumer = new Thread(new Runnable() {
            @Override
            public void run() {
                parse(queue);
            }
        });
        producer.start();
        consumer.start();
    }

    /**
     * Consumer: takes proxies off the queue and fetches a test URL through
     * each one repeatedly, reusing a proxy until a request through it fails,
     * then moving on to the next queued proxy.
     *
     * @param queue blocking queue of candidate proxies; blocks when empty
     */
    public static void parse(BlockingQueue<Ip> queue) {
        while (true) {
            Ip ip;
            try {
                ip = queue.take();
            } catch (InterruptedException e) {
                // Restore interrupt status and stop consuming
                // (the original would NPE on a null ip here).
                Thread.currentThread().interrupt();
                return;
            }
            // Reuse this proxy until a request through it fails.
            while (true) {
                System.out.println(ip.ip + " " + ip.port);
                SocketAddress addr = new InetSocketAddress(ip.ip, Integer.parseInt(ip.port));
                Proxy proxy = new Proxy(Proxy.Type.HTTP, addr);
                try {
                    URL url = new URL("https://api.douban.com/v2/book/isbn/7505715666");
                    HttpsURLConnection conn = (HttpsURLConnection) url.openConnection(proxy);
                    conn.setConnectTimeout(5000);
                    conn.setReadTimeout(5000); // don't block forever on a stalled proxy
                    conn.setRequestProperty("User-Agent",
                            "Mozilla/4.0 (compatible; MSIE 7.0; NT 5.1; GTB5; .NET CLR 2.0.50727; CIBA)");
                    conn.connect();
                    StringBuilder result = new StringBuilder();
                    // try-with-resources closes the reader (and stream) even on error.
                    try (BufferedReader in = new BufferedReader(
                            new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
                        String line;
                        while ((line = in.readLine()) != null) {
                            result.append(line);
                        }
                    }
                    System.out.println(result);
                } catch (Exception e) {
                    break; // this proxy failed; take the next one from the queue
                }
            }
        }
    }
}
/**
 * Immutable value holder for one proxy endpoint: the IP address and port,
 * both kept as the raw strings scraped from the listing page (the port is
 * parsed to an int only by the consumer).
 */
class Ip {
    final String ip;
    final String port;

    /**
     * @param ip   dotted-quad IPv4 address text
     * @param port TCP port as text
     */
    public Ip(String ip, String port) {
        this.ip = ip;
        this.port = port;
    }

    @Override
    public String toString() {
        return ip + ":" + port;
    }
}
阅读全文
0 0
- 使用阻塞队列爬取代理ip实现爬虫
- 使用阻塞队列爬取代理ip实现爬虫
- Java爬虫爬取代理ip
- java代理实现爬取代理IP
- 爬虫-爬取代理ip网页里的ip
- Java网络爬虫(七)--实现定时爬取与IP代理池
- python爬虫爬取goubanjia的代理ip
- 网络爬虫爬取全国省市区(动态ip代理的获取,实现对ip限制的突破)
- 使用scrapy爬取代理ip
- 爬取网站使用代理IP
- 代理ip的爬虫实现
- 爬取代理ip
- Jsoup使用代理ip爬虫
- 如何使用ip代理爬虫
- python简单爬虫爬取队列的实现
- Python爬虫之爬取——使用代理
- Python爬取代理IP
- Python爬取代理IP
- Maven之Tomcat
- codeforces 894B
- Python+Selenium定位不到元素原因及解决方法(报:NoSuchElementException)
- 鼠标拖动改变div容器的大小
- 明年iPhone或支持双卡双待;刘强东:若十年后还是BAT,对国家是种不幸|ServiceHot一周热闻
- 使用阻塞队列爬取代理ip实现爬虫
- SSM框架——详细整合教程(Spring+SpringMVC+MyBatis)
- 欢迎使用CSDN-markdown编辑器
- 【Scikit-Learn 中文文档】广义线性模型
- HDU
- 深度学习系列笔记之统计基础
- 使用scrapy 0.24 开发制作的小说爬虫
- CAN总线要点
- 【Scikit-Learn 中文文档 】安装 scikit-learn | ApacheCN