A Simple Multi-threaded Crawler + Jsoup Parsing

Using simple multi-threading together with Jsoup parsing, the program collects all sub-page links from the CSDN homepage.
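
Before the full multi-threaded version, the core step (fetch a page and pull out its a[href] links with Jsoup) can be sketched on its own. This is a minimal standalone example and not part of the original post; the class name is mine, and the full code below downloads pages with Apache HttpClient instead and hands the HTML string to Jsoup:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class LinkExtractDemo {
    public static void main(String[] args) throws Exception {
        // Download the CSDN homepage and print every absolute link found in it.
        Document doc = Jsoup.connect("http://www.csdn.net")
                .userAgent("Mozilla/5.0")
                .get();
        for (Element a : doc.select("a[href]")) {
            System.out.println(a.absUrl("href")); // absUrl resolves relative hrefs against the page URL
        }
    }
}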

Run output of the full program (screenshots omitted): each crawled URL is printed with its depth and the thread that fetched it, followed by the total page count and the elapsed time.
The full code is as follows (it needs jsoup and Apache HttpClient 4.x on the classpath):


import java.util.ArrayList;
import java.util.HashMap;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class WebCrawler {

    ArrayList<String> allurlSet = new ArrayList<String>();      // every URL discovered so far
    ArrayList<String> notCrawlurlSet = new ArrayList<String>(); // URLs waiting to be crawled
    HashMap<String, Integer> depth = new HashMap<String, Integer>(); // URL -> crawl depth
    int crawDepth = 2;      // maximum crawl depth
    int threadCount = 10;   // number of worker threads
    volatile int count = 0; // workers currently waiting for work (volatile so main() sees updates)

    public static final Object signal = new Object(); // monitor used for inter-thread communication

    public static void main(String args[]) {
        final WebCrawler wc = new WebCrawler();
        wc.addUrl("http://www.csdn.net", 1);
        long start = System.currentTimeMillis();
        System.out.println("************** crawler started **************");
        wc.begin();
        // The main thread polls until the queue is drained or every worker is waiting for work.
        while (true) {
            if ((wc.notCrawlurlSet.isEmpty() && Thread.activeCount() == 1)
                    || wc.count == wc.threadCount) {
                long end = System.currentTimeMillis();
                System.out.println("Crawled " + wc.allurlSet.size() + " pages in total");
                System.out.println("Total time: " + (end - start) / 1000 + " seconds");
                System.exit(1);
                // break;
            }
            try {
                Thread.sleep(100); // avoid a hot busy-wait loop
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    // Start the worker threads: each one repeatedly takes a URL and crawls it,
    // or waits on the shared monitor when no work is available.
    private void begin() {
        for (int i = 0; i < threadCount; ++i) {
            new Thread(new Runnable() {
                public void run() {
                    while (true) {
                        String tmp = getAUrl();
                        if (tmp != null) {
                            crawler(tmp);
                        } else {
                            synchronized (signal) {
                                try {
                                    count++;
                                    System.out.println(Thread.currentThread().getName() + ": waiting");
                                    signal.wait();
                                } catch (Exception e) {
                                    e.printStackTrace();
                                }
                            }
                        }
                    }
                }
            }, "thread-" + i).start();
        }
    }

    // Download one page; if it is above the depth limit, extract its links and queue them.
    public void crawler(String sUrl) {
        try {
            CloseableHttpClient client = HttpClients.createDefault();
            HttpGet get = new HttpGet(sUrl);
            get.setHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
            CloseableHttpResponse response = client.execute(get);
            HttpEntity entity = response.getEntity();
            String content = EntityUtils.toString(entity);
            response.close();
            client.close();

            int d = depth.get(sUrl);
            System.out.println("Crawled " + sUrl + " at depth " + d
                    + " by thread " + Thread.currentThread().getName());

            if (d < crawDepth) {
                // Parse with the page URL as base URI so relative links can be resolved.
                Document doc = Jsoup.parseBodyFragment(content, sUrl);
                Elements es = doc.select("a");
                for (Element e : es) {
                    // Jsoup's attr()/absUrl() return "" (never null) when the attribute is missing
                    // or cannot be resolved, so filter out empty and non-HTTP links here.
                    String temp = e.absUrl("href");
                    if (temp.startsWith("http")) {
                        synchronized (signal) {
                            addUrl(temp, d + 1);
                            if (count > 0) {
                                // A worker is waiting for work: wake one up.
                                signal.notify();
                                count--;
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Take the next URL off the work queue, or return null if the queue is empty.
    public synchronized String getAUrl() {
        if (notCrawlurlSet.isEmpty())
            return null;
        String tmpAUrl = notCrawlurlSet.get(0);
        notCrawlurlSet.remove(0);
        return tmpAUrl;
    }

    // Record a newly discovered URL and queue it for crawling, skipping duplicates.
    public synchronized void addUrl(String url, int d) {
        if (allurlSet.contains(url)) {
            return; // already seen: do not crawl the same page twice
        }
        notCrawlurlSet.add(url);
        allurlSet.add(url);
        depth.put(url, d);
    }
}
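
The hand-rolled wait()/notify() coordination above works for a demo, but it is easy to get subtly wrong (for example, a worker that finds the queue empty and then waits can miss a notification sent in between). As a point of comparison, not part of the original article, here is a sketch of the same crawl expressed with java.util.concurrent primitives; the class name, timeouts, and limits are illustrative assumptions:

import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class ConcurrentCrawlerSketch {

    public static void main(String[] args) throws InterruptedException {
        final int maxDepth = 2;
        final int threads = 10;

        // URL -> depth; also serves as the "already seen" set for de-duplication.
        final Map<String, Integer> depth = new ConcurrentHashMap<String, Integer>();
        // Thread-safe work queue shared by all workers.
        final BlockingQueue<String> queue = new LinkedBlockingQueue<String>();
        depth.put("http://www.csdn.net", 1); // same seed as the article
        queue.add("http://www.csdn.net");

        ExecutorService pool = Executors.newFixedThreadPool(threads);
        for (int i = 0; i < threads; i++) {
            pool.submit(() -> {
                try {
                    String url;
                    // Poll with a timeout; if no URL shows up for a while, assume the crawl is done.
                    while ((url = queue.poll(5, TimeUnit.SECONDS)) != null) {
                        int d = depth.get(url);
                        try {
                            Document doc = Jsoup.connect(url).userAgent("Mozilla/5.0").get();
                            System.out.println(Thread.currentThread().getName()
                                    + " crawled " + url + " at depth " + d);
                            if (d < maxDepth) {
                                for (Element a : doc.select("a[href]")) {
                                    String link = a.absUrl("href"); // resolves relative links
                                    // putIfAbsent returns null only for URLs we have not seen yet.
                                    if (link.startsWith("http") && depth.putIfAbsent(link, d + 1) == null) {
                                        queue.add(link);
                                    }
                                }
                            }
                        } catch (Exception e) {
                            // Skip pages that fail to download or parse.
                        }
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
        }

        pool.shutdown();
        pool.awaitTermination(10, TimeUnit.MINUTES);
        System.out.println("Crawled " + depth.size() + " distinct URLs");
    }
}

The five-second poll timeout is a simple (and imperfect) termination heuristic: a worker gives up once no new URL has appeared for a while, and awaitTermination then lets the pool wind down, replacing both the shared monitor and the busy-wait loop in the original main method.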