爬虫之窃取网络小说(2,多线程爬虫)-yellowcong
来源:互联网 发布:夏普电视音频输出端口 编辑:程序博客网 时间:2024/06/05 19:41
在上一篇文章的基础上,爬虫之窃取网络小说(1),然后通过多线程的方式,实现多线程爬取网络小说,核心的代码是通过BlockingQueue来实现无锁的方式来解决这个多线程问题,如果使用notify和wait的方式,就相对慢了一些。
项目的结构
代码
Constants.java
package com.yellowcong.http.common;

/**
 * Author: yellowcong
 * Date: 2017/11/24 9:00:19
 * Description: shared configuration constants for the crawler.
 */
public final class Constants {

    /** Character encoding used when reading pages and writing output files. */
    public static final String WEB_ENCODE = "UTF-8";

    /** Entry page of the novel to crawl; the chapter list is scraped from here. */
    public static final String HOME_URL = "https://www.yite.cc/book/dxjdntbb/";

    /** Directory the downloaded chapters are written to. */
    public static final String OUT_PATH = "C:\\Users\\zhangrw\\Desktop\\Demo";

    /** Constants holder — not instantiable. */
    private Constants() {
    }
}
文件下载工具类
package com.yellowcong.http.utils;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

/**
 * Author: yellowcong
 * Date: 2017/11/24 10:26:14
 * Description: file-writing helper for the crawler.
 */
public class FileUtils {

    /**
     * Writes the given text to {@code outFile}, creating parent directories
     * as needed.
     *
     * <p>UTF-8 is specified explicitly: the original used {@link java.io.FileWriter},
     * which writes in the platform-default charset and corrupts the crawled
     * UTF-8 text on e.g. GBK-configured Windows machines.
     *
     * @param str     text to write; the file is overwritten if it exists
     * @param outFile destination file
     */
    public static void copyStr2File(String str, File outFile) {
        // Guard against files with no parent (original NPE'd on getParentFile()).
        File parent = outFile.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        // try-with-resources closes the writer even on failure
        // (original swallowed close() errors in an empty catch).
        try (Writer out = new OutputStreamWriter(new FileOutputStream(outFile), "UTF-8")) {
            out.write(str);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
请求工具类
package com.yellowcong.http.utils;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.UnsupportedEncodingException;import java.util.ArrayList;import java.util.List;import java.util.Map;import org.apache.http.HeaderIterator;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.HttpStatus;import org.apache.http.NameValuePair;import org.apache.http.ParseException;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.CookieStore;import org.apache.http.client.config.RequestConfig;import org.apache.http.client.entity.UrlEncodedFormEntity;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.methods.HttpPost;import org.apache.http.client.protocol.HttpClientContext;import org.apache.http.cookie.Cookie;import org.apache.http.impl.client.BasicCookieStore;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.DefaultConnectionKeepAliveStrategy;import org.apache.http.impl.client.DefaultRedirectStrategy;import org.apache.http.impl.client.HttpClientBuilder;import org.apache.http.impl.cookie.BasicClientCookie;import org.apache.http.message.BasicNameValuePair;import org.apache.http.util.EntityUtils;import org.apache.log4j.LogManager;import org.apache.log4j.Logger;public class HttpClient { private static final Logger LOG = LogManager.getLogger(HttpClient.class); /** 请求网站的编码,这个地方,我默认 写的是GB3212*/ private static final String DEFALUT_ENCODE = "UTF-8"; public static CloseableHttpClient httpClient = null; public static HttpClientContext context = null; public static CookieStore cookieStore = null; public static RequestConfig requestConfig = null; static { init(); } private static void init() { context = HttpClientContext.create(); cookieStore = new BasicCookieStore(); // 配置超时时间(连接服务端超时1秒,请求数据返回超时2秒) requestConfig = 
RequestConfig.custom().setConnectTimeout(120000).setSocketTimeout(60000) .setConnectionRequestTimeout(60000).build(); // 设置默认跳转以及存储cookie httpClient = HttpClientBuilder.create().setKeepAliveStrategy(new DefaultConnectionKeepAliveStrategy()) .setRedirectStrategy(new DefaultRedirectStrategy()).setDefaultRequestConfig(requestConfig) .setDefaultCookieStore(cookieStore).build(); } /** * 发送get请求 * * @param url * @return response * @throws ClientProtocolException * @throws IOException */ public static String get(String url) { HttpGet httpget = new HttpGet(url); CloseableHttpResponse response = null; try { //伪装为浏览器 httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0"); //设定请求的参数 response= httpClient.execute(httpget, context); return copyResponse2Str(response); } catch(Exception e){ LOG.debug("请求失败\t"+url); }finally { try { if(response != null){ response.close(); } } catch (IOException e) { e.printStackTrace(); } } return null; } /** * 将返回的Response转化成String对象 * @param response 返回的Response * @return */ private static String copyResponse2Str(CloseableHttpResponse response){ try { int code = response.getStatusLine().getStatusCode(); //当请求的code返回值不是400的情况 if((code == HttpStatus.SC_MOVED_TEMPORARILY ) || (code == HttpStatus.SC_MOVED_PERMANENTLY) || (code == HttpStatus.SC_SEE_OTHER) || (code == HttpStatus.SC_TEMPORARY_REDIRECT)) { return null; }else{ return copyInputStream2Str(response.getEntity().getContent()); } } catch (Exception e) { e.printStackTrace(); } return null; } /** * 将InputStream转化为String类型的数据 * @param in * @return */ private static String copyInputStream2Str(InputStream in){ try { BufferedReader reader = new BufferedReader(new InputStreamReader(in,DEFALUT_ENCODE)); String line = null; StringBuffer sb = new StringBuffer(); while((line = reader.readLine()) != null){ sb.append(line); } return sb.toString(); } catch (Exception e) { LOG.debug("获取字符串失败"); } return null; } /** * 发送post请求,不带参数 的post * @param url * 
@return */ public static String post(String url){ return post(url, null); } /** * 发从post 请求 * @param url * @param parameters * @return * @throws ClientProtocolException * @throws IOException */ public static String post(String url, Map<String,Object> parameters){ HttpPost httpPost = new HttpPost(url); CloseableHttpResponse response = null; try { //设定请求的参数 setRequestParamter(parameters, httpPost); //发送请求 response = httpClient.execute(httpPost, context); return copyResponse2Str(response); }catch(Exception e){ LOG.debug("请求失败\t"+url); }finally { try { if(response != null){ response.close(); } } catch (IOException e) { e.printStackTrace(); } } return null; } /** * 设定POST请求的参数 * @param parameters * @param httpPost * @throws UnsupportedEncodingException */ private static void setRequestParamter(Map<String, Object> parameters, HttpPost httpPost) throws UnsupportedEncodingException { List<NameValuePair> nvps; //添加参数 if(parameters != null && parameters.size()>0){ nvps = new ArrayList<NameValuePair>(); for(Map.Entry<String, Object> map:parameters.entrySet()){ NameValuePair param = new BasicNameValuePair(map.getKey(), map.getValue().toString()); nvps.add(param); } httpPost.setEntity(new UrlEncodedFormEntity(nvps, DEFALUT_ENCODE)); } } /** * 将 http://www.yellowcong.com?age=7&name=8 * 这种age=7&name=8 转化为map数据 * @param parameters * @return */ @SuppressWarnings("unused") private static List<NameValuePair> toNameValuePairList(String parameters) { List<NameValuePair> nvps = new ArrayList<NameValuePair>(); String[] paramList = parameters.split("&"); for (String parm : paramList) { int index = -1; for (int i = 0; i < parm.length(); i++) { index = parm.indexOf("="); break; } String key = parm.substring(0, index); String value = parm.substring(++index, parm.length()); nvps.add(new BasicNameValuePair(key, value)); } System.out.println(nvps.toString()); return nvps; } /** * 手动增加cookie * @param name * @param value * @param domain * @param path */ public void addCookie(String name, String 
value, String domain, String path) { BasicClientCookie cookie = new BasicClientCookie(name, value); cookie.setDomain(domain); cookie.setPath(path); cookieStore.addCookie(cookie); } /** * 把结果console出来 * * @param httpResponse * @throws ParseException * @throws IOException */ public static void printResponse(HttpResponse httpResponse) throws ParseException, IOException { // 获取响应消息实体 HttpEntity entity = httpResponse.getEntity(); // 响应状态 System.out.println("status:" + httpResponse.getStatusLine()); System.out.println("headers:"); HeaderIterator iterator = httpResponse.headerIterator(); while (iterator.hasNext()) { System.out.println("\t" + iterator.next()); } } /** * 把当前cookie从控制台输出出来 * */ public static void printCookies() { cookieStore = context.getCookieStore(); List<Cookie> cookies = cookieStore.getCookies(); for (Cookie cookie : cookies) { System.out.println("key:" + cookie.getName() + " value:" + cookie.getValue()); } } /** * 检查cookie的键值是否包含传参 * * @param key * @return */ public static boolean checkCookie(String key) { cookieStore = context.getCookieStore(); List<Cookie> cookies = cookieStore.getCookies(); boolean res = false; for (Cookie cookie : cookies) { if (cookie.getName().equals(key)) { res = true; break; } } return res; } /** * 直接把Response内的Entity内容转换成String * * @param httpResponse * @return * @throws ParseException * @throws IOException */ public static String toString(CloseableHttpResponse httpResponse) throws ParseException, IOException { // 获取响应消息实体 HttpEntity entity = httpResponse.getEntity(); if (entity != null) return EntityUtils.toString(entity); else return null; } }
网站多线程处理类
package com.yellowcong.web.yite;import java.io.File;import java.util.concurrent.BlockingQueue;import com.yellowcong.http.common.Constants;import com.yellowcong.http.utils.FileUtils;import com.yellowcong.web.yite.WebUtils.Passage;/****作者:yellowcong*日期:2017/11/24*時間:10:09:13*描述:*/public class PageThread implements Runnable{ private BlockingQueue<String> queue ; public PageThread(BlockingQueue<String> queue) { super(); this.queue = queue; } public void run() { try { while(this.queue.size()>0) { String url = this.queue.take(); Passage passage = WebUtils.getPageData(url);// System.out.println("\r\n当前线程\t"+Thread.currentThread().getName()+"还剩下"+this.queue.size()+"个"); FileUtils.copyStr2File(passage.getContent(), new File(Constants.OUT_PATH+File.separator+passage.getTitle()+".txt")); System.out.printf("当前线程%s,文章名称%s,还剩下%s个\r\n", Thread.currentThread().getName(),passage.getTitle(),this.queue.size()); } } catch (InterruptedException e) { e.printStackTrace(); } }}
网站解析工具类
package com.yellowcong.web.yite;import java.util.concurrent.ArrayBlockingQueue;import java.util.concurrent.BlockingQueue;import java.util.concurrent.ConcurrentHashMap;import java.util.concurrent.ConcurrentMap;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import com.yellowcong.http.common.Constants;import com.yellowcong.http.utils.HttpClient;/****作者:yellowcong*日期:2017/11/24*時間:9:53:03*描述:*/public class WebUtils { /** * 获取https://www.yite.cc/ 这个网站 * 小说主页面的地址 * 比如: * https://www.yite.cc/book/chaojixiulianxitong/ * https://www.yite.cc/book/jiaojiaoshiniang/ * 等。。。 * @param url * @return * @throws Exception */ public static BlockingQueue<String> getPageInfo(String url) throws Exception{ //获取路径 String htmlStr = HttpClient.get(Constants.HOME_URL); //将解析的html转化为 Document homeDoc = Jsoup.parse(htmlStr); //获取到所有的链接 Element element = homeDoc.getElementById("list"); //获取所有的章节信息 Elements pages = element.getElementsByTag("a"); System.out.println(pages.size()); //创建一个队列 BlockingQueue<String> queue = new ArrayBlockingQueue<String>(pages.size()); for(int i=0;i<pages.size();i++) { Element node = pages.get(i); String hrefStr = Constants.HOME_URL+node.attr("href"); String hrefContent = node.html(); System.out.printf("%s-->%s\r\n",hrefStr,hrefContent); queue.put(hrefStr); } return queue; } /** * 获取到每一篇 * @param url * @return */ public static Passage getPageData(String url) { //获取路径 String htmlStr = HttpClient.get(url); //将解析的html转化为 Document homeDoc = Jsoup.parse(htmlStr); Element content = homeDoc.getElementById("content"); //内容 String pageContent = content.text(); //标题 String title = homeDoc.getElementById("txtbox").getElementsByTag("h1").get(0).text(); //写数据 return new Passage(title,pageContent); } public static class Passage { private String title; private String content; public Passage(String title, String content) { super(); this.title = title; this.content = content; } public String getTitle() { return 
title; } public void setTitle(String title) { this.title = title; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } }}
函数入口
package day111_24;import java.util.concurrent.BlockingQueue;import java.util.concurrent.ExecutorService;import java.util.concurrent.Executors;import com.yellowcong.http.common.Constants;import com.yellowcong.web.yite.PageThread;import com.yellowcong.web.yite.WebUtils;/****作者:yellowcong*日期:2017/11/24*時間:10:20:17*描述:*/public class Main { public static void main(String[] args) throws Exception { //获取队列信息 BlockingQueue<String> queue = WebUtils.getPageInfo(Constants.HOME_URL); //创建一个线程池 int poolSize =8; ExecutorService pool = Executors.newFixedThreadPool(poolSize); //提交任务 //获取线程 PageThread thread = new PageThread(queue); for(int i=0;i<poolSize;i++) { //提交任务 pool.submit(thread); } pool.shutdown(); }}
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>yellowcong</groupId>
    <artifactId>day111_24</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>day111_24</name>
    <url>http://maven.apache.org</url>
    <!-- Aliyun Maven mirror — faster than Maven Central from inside China -->
    <repositories>
        <repository>
            <id>aliyunmaven</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
        </repository>
    </repositories>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <!-- Logging -->
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.16</version>
        </dependency>
        <!-- HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.7.3</version>
        </dependency>
        <!-- HTTP client stack BEGIN -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpcore</artifactId>
            <version>4.4.2</version>
        </dependency>
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
            <version>1.9</version>
        </dependency>
        <!-- HTTP client stack END -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
实战结果
查看下载情况
执行结束后
执行后,输出的文件,老爽了
看到了有295个文件,貌似少了几个,肯定是请求的时候报错了
阅读全文
0 0
- 爬虫之窃取网络小说(2,多线程爬虫)-yellowcong
- 爬虫之窃取网络小说(1)-yellowcong
- Java之请求发送工具类(HttpClientUtils,爬虫)-yellowcong
- 爬虫之黑龙江科技大学 URP大战-yellowcong
- python爬虫爬取网络小说
- 基于Java的网络爬虫实现抓取网络小说(一)
- Python爬虫之<XPath与多线程爬虫>
- python 爬虫 网络小说下载(静态网站)
- 爬虫入门四(多线程爬虫)
- 多线程爬虫(提升爬虫的速度)
- 多线程爬虫之糗事百科
- 爬虫多线程
- 多线程爬虫
- 多线程爬虫
- Python爬虫入门实战系列(一)--爬取网络小说并存放至txt文件
- Python爬虫:初探多线程爬虫
- 爬虫爬虫爬虫(一)
- python多线程小爬虫之练练手
- (三)u-boot启动流程分析(C语言部分board_f.c)
- linux deamon函数使用方法说明
- 矩阵的物理意义
- Building for UN UVA
- Linux Jenkins 安装
- 爬虫之窃取网络小说(2,多线程爬虫)-yellowcong
- node更新与npm库更新
- IOS开发笔记--视频录制
- IBM系列软件“安装”阶段出错问题的解决
- swoole的安装
- Win炫酷实用快捷键及触控板手势
- 实时视频传输协议RTP
- 如何在ashx页面获取Session值
- iOS极光推送清除角标解决方案