爬虫之窃取网络小说(2,多线程爬虫)-yellowcong
来源:互联网 发布:夏普电视音频输出端口 编辑:程序博客网 时间:2024/06/05 19:41
在上一篇文章的基础上,爬虫之窃取网络小说(1),然后通过多线程的方式,实现多线程爬取网络小说,核心的代码是通过BlockingQueue来实现无锁的方式来解决这个多线程问题,如果使用notify和wait的方式,就相对慢了一些。
项目的结构
代码
Constants.java
package com.yellowcong.http.common;

/**
 * Author: yellowcong
 * Date: 2017/11/24 9:00:19
 * Description: shared configuration constants for the crawler.
 */
public final class Constants {

    /** Character encoding used when reading pages and writing output files. */
    public static final String WEB_ENCODE = "UTF-8";

    /** Entry page of the novel to crawl; the chapter list is scraped from here. */
    public static final String HOME_URL = "https://www.yite.cc/book/dxjdntbb/";

    /** Directory the downloaded chapters are written to. */
    public static final String OUT_PATH = "C:\\Users\\zhangrw\\Desktop\\Demo";

    /** Constants holder — not instantiable. */
    private Constants() {
    }
}
文件下载工具类
package com.yellowcong.http.utils;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

/**
 * Author: yellowcong
 * Date: 2017/11/24 10:26:14
 * Description: file-writing helper for the crawler.
 */
public class FileUtils {

    /**
     * Writes the given text to {@code outFile}, creating parent directories
     * as needed.
     *
     * <p>UTF-8 is specified explicitly: the original used {@link java.io.FileWriter},
     * which writes in the platform-default charset and corrupts the crawled
     * UTF-8 text on e.g. GBK-configured Windows machines.
     *
     * @param str     text to write; the file is overwritten if it exists
     * @param outFile destination file
     */
    public static void copyStr2File(String str, File outFile) {
        // Guard against files with no parent (original NPE'd on getParentFile()).
        File parent = outFile.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        // try-with-resources closes the writer even on failure
        // (original swallowed close() errors in an empty catch).
        try (Writer out = new OutputStreamWriter(new FileOutputStream(outFile), "UTF-8")) {
            out.write(str);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
请求工具类
package com.yellowcong.http.utils;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.UnsupportedEncodingException;import java.util.ArrayList;import java.util.List;import java.util.Map;import org.apache.http.HeaderIterator;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.HttpStatus;import org.apache.http.NameValuePair;import org.apache.http.ParseException;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.CookieStore;import org.apache.http.client.config.RequestConfig;import org.apache.http.client.entity.UrlEncodedFormEntity;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.methods.HttpPost;import org.apache.http.client.protocol.HttpClientContext;import org.apache.http.cookie.Cookie;import org.apache.http.impl.client.BasicCookieStore;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.DefaultConnectionKeepAliveStrategy;import org.apache.http.impl.client.DefaultRedirectStrategy;import org.apache.http.impl.client.HttpClientBuilder;import org.apache.http.impl.cookie.BasicClientCookie;import org.apache.http.message.BasicNameValuePair;import org.apache.http.util.EntityUtils;import org.apache.log4j.LogManager;import org.apache.log4j.Logger;public class HttpClient { private static final Logger LOG = LogManager.getLogger(HttpClient.class); /** 请求网站的编码,这个地方,我默认 写的是GB3212*/ private static final String DEFALUT_ENCODE = "UTF-8"; public static CloseableHttpClient httpClient = null; public static HttpClientContext context = null; public static CookieStore cookieStore = null; public static RequestConfig requestConfig = null; static { init(); } private static void init() { context = HttpClientContext.create(); cookieStore = new BasicCookieStore(); // 配置超时时间(连接服务端超时1秒,请求数据返回超时2秒) requestConfig = 
RequestConfig.custom().setConnectTimeout(120000).setSocketTimeout(60000) .setConnectionRequestTimeout(60000).build(); // 设置默认跳转以及存储cookie httpClient = HttpClientBuilder.create().setKeepAliveStrategy(new DefaultConnectionKeepAliveStrategy()) .setRedirectStrategy(new DefaultRedirectStrategy()).setDefaultRequestConfig(requestConfig) .setDefaultCookieStore(cookieStore).build(); } /** * 发送get请求 * * @param url * @return response * @throws ClientProtocolException * @throws IOException */ public static String get(String url) { HttpGet httpget = new HttpGet(url); CloseableHttpResponse response = null; try { //伪装为浏览器 httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0"); //设定请求的参数 response= httpClient.execute(httpget, context); return copyResponse2Str(response); } catch(Exception e){ LOG.debug("请求失败\t"+url); }finally { try { if(response != null){ response.close(); } } catch (IOException e) { e.printStackTrace(); } } return null; } /** * 将返回的Response转化成String对象 * @param response 返回的Response * @return */ private static String copyResponse2Str(CloseableHttpResponse response){ try { int code = response.getStatusLine().getStatusCode(); //当请求的code返回值不是400的情况 if((code == HttpStatus.SC_MOVED_TEMPORARILY ) || (code == HttpStatus.SC_MOVED_PERMANENTLY) || (code == HttpStatus.SC_SEE_OTHER) || (code == HttpStatus.SC_TEMPORARY_REDIRECT)) { return null; }else{ return copyInputStream2Str(response.getEntity().getContent()); } } catch (Exception e) { e.printStackTrace(); } return null; } /** * 将InputStream转化为String类型的数据 * @param in * @return */ private static String copyInputStream2Str(InputStream in){ try { BufferedReader reader = new BufferedReader(new InputStreamReader(in,DEFALUT_ENCODE)); String line = null; StringBuffer sb = new StringBuffer(); while((line = reader.readLine()) != null){ sb.append(line); } return sb.toString(); } catch (Exception e) { LOG.debug("获取字符串失败"); } return null; } /** * 发送post请求,不带参数 的post * @param url * 
@return */ public static String post(String url){ return post(url, null); } /** * 发从post 请求 * @param url * @param parameters * @return * @throws ClientProtocolException * @throws IOException */ public static String post(String url, Map<String,Object> parameters){ HttpPost httpPost = new HttpPost(url); CloseableHttpResponse response = null; try { //设定请求的参数 setRequestParamter(parameters, httpPost); //发送请求 response = httpClient.execute(httpPost, context); return copyResponse2Str(response); }catch(Exception e){ LOG.debug("请求失败\t"+url); }finally { try { if(response != null){ response.close(); } } catch (IOException e) { e.printStackTrace(); } } return null; } /** * 设定POST请求的参数 * @param parameters * @param httpPost * @throws UnsupportedEncodingException */ private static void setRequestParamter(Map<String, Object> parameters, HttpPost httpPost) throws UnsupportedEncodingException { List<NameValuePair> nvps; //添加参数 if(parameters != null && parameters.size()>0){ nvps = new ArrayList<NameValuePair>(); for(Map.Entry<String, Object> map:parameters.entrySet()){ NameValuePair param = new BasicNameValuePair(map.getKey(), map.getValue().toString()); nvps.add(param); } httpPost.setEntity(new UrlEncodedFormEntity(nvps, DEFALUT_ENCODE)); } } /** * 将 http://www.yellowcong.com?age=7&name=8 * 这种age=7&name=8 转化为map数据 * @param parameters * @return */ @SuppressWarnings("unused") private static List<NameValuePair> toNameValuePairList(String parameters) { List<NameValuePair> nvps = new ArrayList<NameValuePair>(); String[] paramList = parameters.split("&"); for (String parm : paramList) { int index = -1; for (int i = 0; i < parm.length(); i++) { index = parm.indexOf("="); break; } String key = parm.substring(0, index); String value = parm.substring(++index, parm.length()); nvps.add(new BasicNameValuePair(key, value)); } System.out.println(nvps.toString()); return nvps; } /** * 手动增加cookie * @param name * @param value * @param domain * @param path */ public void addCookie(String name, String 
value, String domain, String path) { BasicClientCookie cookie = new BasicClientCookie(name, value); cookie.setDomain(domain); cookie.setPath(path); cookieStore.addCookie(cookie); } /** * 把结果console出来 * * @param httpResponse * @throws ParseException * @throws IOException */ public static void printResponse(HttpResponse httpResponse) throws ParseException, IOException { // 获取响应消息实体 HttpEntity entity = httpResponse.getEntity(); // 响应状态 System.out.println("status:" + httpResponse.getStatusLine()); System.out.println("headers:"); HeaderIterator iterator = httpResponse.headerIterator(); while (iterator.hasNext()) { System.out.println("\t" + iterator.next()); } } /** * 把当前cookie从控制台输出出来 * */ public static void printCookies() { cookieStore = context.getCookieStore(); List<Cookie> cookies = cookieStore.getCookies(); for (Cookie cookie : cookies) { System.out.println("key:" + cookie.getName() + " value:" + cookie.getValue()); } } /** * 检查cookie的键值是否包含传参 * * @param key * @return */ public static boolean checkCookie(String key) { cookieStore = context.getCookieStore(); List<Cookie> cookies = cookieStore.getCookies(); boolean res = false; for (Cookie cookie : cookies) { if (cookie.getName().equals(key)) { res = true; break; } } return res; } /** * 直接把Response内的Entity内容转换成String * * @param httpResponse * @return * @throws ParseException * @throws IOException */ public static String toString(CloseableHttpResponse httpResponse) throws ParseException, IOException { // 获取响应消息实体 HttpEntity entity = httpResponse.getEntity(); if (entity != null) return EntityUtils.toString(entity); else return null; } }
网站多线程处理类
package com.yellowcong.web.yite;import java.io.File;import java.util.concurrent.BlockingQueue;import com.yellowcong.http.common.Constants;import com.yellowcong.http.utils.FileUtils;import com.yellowcong.web.yite.WebUtils.Passage;/****作者:yellowcong*日期:2017/11/24*時間:10:09:13*描述:*/public class PageThread implements Runnable{ private BlockingQueue<String> queue ; public PageThread(BlockingQueue<String> queue) { super(); this.queue = queue; } public void run() { try { while(this.queue.size()>0) { String url = this.queue.take(); Passage passage = WebUtils.getPageData(url);// System.out.println("\r\n当前线程\t"+Thread.currentThread().getName()+"还剩下"+this.queue.size()+"个"); FileUtils.copyStr2File(passage.getContent(), new File(Constants.OUT_PATH+File.separator+passage.getTitle()+".txt")); System.out.printf("当前线程%s,文章名称%s,还剩下%s个\r\n", Thread.currentThread().getName(),passage.getTitle(),this.queue.size()); } } catch (InterruptedException e) { e.printStackTrace(); } }}
网站解析工具类
package com.yellowcong.web.yite;import java.util.concurrent.ArrayBlockingQueue;import java.util.concurrent.BlockingQueue;import java.util.concurrent.ConcurrentHashMap;import java.util.concurrent.ConcurrentMap;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import com.yellowcong.http.common.Constants;import com.yellowcong.http.utils.HttpClient;/****作者:yellowcong*日期:2017/11/24*時間:9:53:03*描述:*/public class WebUtils { /** * 获取https://www.yite.cc/ 这个网站 * 小说主页面的地址 * 比如: * https://www.yite.cc/book/chaojixiulianxitong/ * https://www.yite.cc/book/jiaojiaoshiniang/ * 等。。。 * @param url * @return * @throws Exception */ public static BlockingQueue<String> getPageInfo(String url) throws Exception{ //获取路径 String htmlStr = HttpClient.get(Constants.HOME_URL); //将解析的html转化为 Document homeDoc = Jsoup.parse(htmlStr); //获取到所有的链接 Element element = homeDoc.getElementById("list"); //获取所有的章节信息 Elements pages = element.getElementsByTag("a"); System.out.println(pages.size()); //创建一个队列 BlockingQueue<String> queue = new ArrayBlockingQueue<String>(pages.size()); for(int i=0;i<pages.size();i++) { Element node = pages.get(i); String hrefStr = Constants.HOME_URL+node.attr("href"); String hrefContent = node.html(); System.out.printf("%s-->%s\r\n",hrefStr,hrefContent); queue.put(hrefStr); } return queue; } /** * 获取到每一篇 * @param url * @return */ public static Passage getPageData(String url) { //获取路径 String htmlStr = HttpClient.get(url); //将解析的html转化为 Document homeDoc = Jsoup.parse(htmlStr); Element content = homeDoc.getElementById("content"); //内容 String pageContent = content.text(); //标题 String title = homeDoc.getElementById("txtbox").getElementsByTag("h1").get(0).text(); //写数据 return new Passage(title,pageContent); } public static class Passage { private String title; private String content; public Passage(String title, String content) { super(); this.title = title; this.content = content; } public String getTitle() { return 
title; } public void setTitle(String title) { this.title = title; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } }}
函数入口
package day111_24;import java.util.concurrent.BlockingQueue;import java.util.concurrent.ExecutorService;import java.util.concurrent.Executors;import com.yellowcong.http.common.Constants;import com.yellowcong.web.yite.PageThread;import com.yellowcong.web.yite.WebUtils;/****作者:yellowcong*日期:2017/11/24*時間:10:20:17*描述:*/public class Main { public static void main(String[] args) throws Exception { //获取队列信息 BlockingQueue<String> queue = WebUtils.getPageInfo(Constants.HOME_URL); //创建一个线程池 int poolSize =8; ExecutorService pool = Executors.newFixedThreadPool(poolSize); //提交任务 //获取线程 PageThread thread = new PageThread(queue); for(int i=0;i<poolSize;i++) { //提交任务 pool.submit(thread); } pool.shutdown(); }}
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>yellowcong</groupId>
    <artifactId>day111_24</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>day111_24</name>
    <url>http://maven.apache.org</url>
    <!-- Aliyun Maven mirror — faster than Maven Central from inside China -->
    <repositories>
        <repository>
            <id>aliyunmaven</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
        </repository>
    </repositories>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <!-- Logging -->
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.16</version>
        </dependency>
        <!-- HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.7.3</version>
        </dependency>
        <!-- HTTP client stack BEGIN -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpcore</artifactId>
            <version>4.4.2</version>
        </dependency>
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
            <version>1.9</version>
        </dependency>
        <!-- HTTP client stack END -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
实战结果
查看下载情况
执行结束后
执行后,输出的文件,老爽了
看到了有295个文件,貌似少了几个,肯定是请求的时候报错了
阅读全文
0 0
- 爬虫之窃取网络小说(2,多线程爬虫)-yellowcong
- 爬虫之窃取网络小说(1)-yellowcong
- Java之请求发送工具类(HttpClientUtils,爬虫)-yellowcong
- 爬虫之黑龙江科技大学 URP大战-yellowcong
- python爬虫爬取网络小说
- 基于Java的网络爬虫实现抓取网络小说(一)
- Python爬虫之<XPath与多线程爬虫>
- python 爬虫 网络小说下载(静态网站)
- 爬虫入门四(多线程爬虫)
- 多线程爬虫(提升爬虫的速度)
- 多线程爬虫之糗事百科
- 爬虫多线程
- 多线程爬虫
- 多线程爬虫
- Python爬虫入门实战系列(一)--爬取网络小说并存放至txt文件
- Python爬虫:初探多线程爬虫
- 爬虫爬虫爬虫(一)
- python多线程小爬虫之练练手
- (三)u-boot启动流程分析(C语言部分board_f.c)
- linux deamon函数使用方法说明
- 矩阵的物理意义
- Building for UN UVA
- Linux Jenkins 安装
- 爬虫之窃取网络小说(2,多线程爬虫)-yellowcong
- node更新与npm库更新
- IOS开发笔记--视频录制
- IBM系列软件“安装”阶段出错问题的解决
- swoole的安装
- Win炫酷实用快捷键及触控板手势
- 实时视频传输协议RTP
- 如何在ashx页面获取Session值
- iOS极光推送清除角标解决方案