简单多线程爬虫+Jsoup分析
来源:互联网 发布:太阳辐射强度数据 编辑:程序博客网 时间:2024/05/21 19:47
使用简单多线程和Jsoup分析,得到CSDN的首页的所有子网页链接。
运行效果如下图
------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------
代码如下
import java.net.URL;import java.util.ArrayList;import java.util.HashMap;import org.apache.http.HttpEntity;import org.apache.http.client.HttpClient;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.HttpClients;import org.apache.http.util.EntityUtils;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;public class WebCrawler {ArrayList<String> allurlSet = new ArrayList<String>();ArrayList<String> notCrawlurlSet = new ArrayList<String>();HashMap<String, Integer> depth = new HashMap<String, Integer>();int crawDepth = 2;int threadCount = 10;int count = 0;public static final Object signal = new Object();// 线程间通信public static void main(String args[]) {final WebCrawler wc = new WebCrawler();wc.addUrl("http://www.csdn.net", 1);long start = System.currentTimeMillis();System.out.println("**************开始爬虫**************");wc.begin(); while(true){ if(wc.notCrawlurlSet.isEmpty()&& Thread.activeCount() == 1||wc.count==wc.threadCount){ long end = System.currentTimeMillis(); System.out.println("总共爬了"+wc.allurlSet.size()+"个网页"); System.out.println("总共耗时"+(end-start)/1000+"秒"); System.exit(1); // break; } }}private void begin() {for (int i = 0; i < threadCount; ++i) {new Thread(new Runnable() {public void run() {while (true) {String tmp = getAUrl();if (tmp != null) {crawler(tmp);} else {synchronized (signal) {try {count++;System.out.println(Thread.currentThread().getName() + ": 等待");signal.wait();} catch (Exception e) {e.printStackTrace();}}}}}}, "thread-" + i).start();}}public void crawler(String sUrl) {URL url;try {HttpClient client = HttpClients.createDefault();HttpGet get = new HttpGet(sUrl);get.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");CloseableHttpResponse response = (CloseableHttpResponse) client.execute(get);HttpEntity entity = 
response.getEntity();String content = EntityUtils.toString(entity);int d = depth.get(sUrl);System.out.println("爬网页" + sUrl + "成功,深度为" + d + " 是由线程" + Thread.currentThread().getName() + "来爬");if (d < crawDepth) {Document doc = Jsoup.parseBodyFragment(content);Elements es = doc.select("a");String temp = "";for (Element e : es) {temp = e.attr("href");if (temp != null) {synchronized (signal) {addUrl(temp, d + 1);if (count > 0) {signal.notify();count--;}}}}}} catch (Exception e) {e.printStackTrace();}}public synchronized String getAUrl() {if (notCrawlurlSet.isEmpty())return null;String tmpAUrl;tmpAUrl = notCrawlurlSet.get(0);notCrawlurlSet.remove(0);return tmpAUrl;}public synchronized void addUrl(String url, int d) {notCrawlurlSet.add(url);allurlSet.add(url);depth.put(url, d);}}
阅读全文
0 0
- 简单多线程爬虫+Jsoup分析
- 非常简单Jsoup爬虫
- Jsoup实现简单的爬虫
- jsoup爬虫简单使用笔记
- JAVA爬虫--Jsoup的简单运用
- 基于Jsoup实现的简单爬虫
- Jsoup-实现简单的网络爬虫
- 基于Jsoup实现的简单网络爬虫
- JAVA简单爬虫例子--Jsoup的运用
- jsoup 爬虫
- (一)多线程简单爬虫
- Java jsoup多线程爬虫(爬豆瓣图书封面)
- java写的一个简单的爬虫(jsoup)
- 简单的网络爬虫实现(Jsoup使用)
- java爬虫--jsoup简单的表单抓取案例
- 【1】用jsoup来实现简单的java爬虫
- 爬虫概念 请求方式 jsoup 分析思路 原理
- java爬虫实战简单用Jsoup框架进行网页爬虫(如抓取网页图片)
- 设计模式入门
- C++程序设计学习笔记
- AndroidStudio通过wifi连接手机进行调试
- 整合spring-mvc+mybatis记录
- KNN分类器
- 简单多线程爬虫+Jsoup分析
- enum,typedef分析
- springMVC 4.3.7 @RequestBody 报错415 Unsupported Media Type
- SSM-3 pom依赖
- 《Deep Forest: Towards an Alternative to Deep Neural Networks》理解
- webView 的onReceivedError();
- c++中字符串反转的3种方法
- 关于JavaScript 的 async/await
- [BZOJ4199][NOI2015]品酒大会-后缀数组