A Complete Web Crawler Example


A comprehensive crawler case study: a complete demo that crawls Huxiu (huxiu.com).

Create a Maven project

  • First, add the following dependencies:
    <dependencies>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <dependency>
            <!-- jsoup HTML parser library @ https://jsoup.org/ -->
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.springframework/spring-jdbc -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>4.2.6.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.41</version>
        </dependency>
        <dependency>
            <groupId>c3p0</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.1.2</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.31</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.1</version>
        </dependency>
    </dependencies>
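
For completeness, a minimal pom.xml skeleton that these dependencies can be dropped into might look like the following. The groupId, artifactId, and compiler level are placeholders added here for illustration; they are not part of the original post.

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>

        <!-- Placeholder coordinates; use your own. -->
        <groupId>com.example</groupId>
        <artifactId>huxiu-spider</artifactId>
        <version>1.0-SNAPSHOT</version>

        <properties>
            <!-- Assumed compiler level; the original post does not specify one. -->
            <maven.compiler.source>1.8</maven.compiler.source>
            <maven.compiler.target>1.8</maven.compiler.target>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        </properties>

        <!-- The <dependencies> block shown above goes here. -->
    </project>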

The demo uses a MySQL database.

  • Create a database named spider:

    create database spider;
  • The SQL for creating the table is as follows:

CREATE TABLE `huxiu_article` (
  `id` varchar(250) DEFAULT NULL,
  `title` varchar(250) DEFAULT NULL,
  `author` varchar(250) DEFAULT NULL,
  `createTime` varchar(250) DEFAULT NULL,
  `zan` varchar(250) DEFAULT NULL,
  `pl` varchar(250) DEFAULT NULL,
  `sc` varchar(250) DEFAULT NULL,
  `content` blob,
  `url` varchar(250) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8

The full code is as follows.

The entity class Article

public class Article {
    private String id;
    private String url;
    private String title;
    private String author;
    private String createTime;
    private String pl;       // comment count
    private String zan;      // like count
    private String sc;       // share/favorite count
    private String content;

    public String getId() {
        return id;
    }
    public void setId(String id) {
        this.id = id;
    }
    // ... getters and setters for the remaining fields omitted
}

ArticleDao, which handles database access

public class ArticleDao extends JdbcTemplate {

    public ArticleDao() {
        // Create the C3P0 data source (this could be done via configuration or, as here, in code)
        ComboPooledDataSource dataSource = new ComboPooledDataSource();
        // 1. url
        // 2. driver
        // 3. username & password
        dataSource.setUser("root");
        dataSource.setPassword("root");
        dataSource.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf-8");
        setDataSource(dataSource);
    }

    public void save(Article article) {
        String sql = "INSERT INTO `spider`.`huxiu_article` (`id`, `title`, `author`, `createTime`, `zan`, `pl`, `sc`, `content`, `url` ) VALUES( ?,?,?,?,?,?,?,?,?)";
        update(sql, article.getId(), article.getTitle(), article.getAuthor(), article.getCreateTime(),
                article.getZan(), article.getPl(), article.getSc(), article.getContent(), article.getUrl());
    }
}
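
ArticleDao is just a JdbcTemplate wired to a hard-coded C3P0 pool, so saving a record is a single call. A quick smoke-test sketch (the class name and field values below are made up, and it assumes the usual setters elided from the Article listing above):

    // Hypothetical smoke test for ArticleDao; all values are placeholders.
    public class ArticleDaoTest {
        public static void main(String[] args) {
            ArticleDao dao = new ArticleDao();

            Article article = new Article();
            article.setId("123456");
            article.setTitle("test title");
            article.setAuthor("test author");
            article.setCreateTime("2017-01-01");
            article.setZan("0");
            article.setPl("0");
            article.setSc("0");
            article.setContent("test content");
            article.setUrl("http://www.huxiu.com/article/123456.html");

            // Executes the parameterized INSERT defined in ArticleDao.save()
            dao.save(article);
        }
    }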

HuxiuPagingResponse, the entity class for the paging response

public class HuxiuPagingResponse {
    private String data;
    private String last_dateline;
    private String msg;
    private String result;
    private String total_page;

    public String getData() {
        return data;
    }
    public void setData(String data) {
        this.data = data;
    }
    // ... getters and setters for the remaining fields omitted
}
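
The field names deliberately match the keys in the JSON returned by the paging endpoint, so Gson can bind the response directly. A minimal sketch of that mapping (the JSON literal below is fabricated for illustration and is not an actual Huxiu response; it also assumes the usual getters elided above):

    import com.google.gson.Gson;

    public class PagingResponseDemo {
        public static void main(String[] args) {
            // Fabricated example payload using the same keys as the entity fields
            String jsonText = "{\"result\":\"1\",\"msg\":\"\",\"total_page\":\"1615\","
                    + "\"last_dateline\":\"1500000000\",\"data\":\"<div data-aid=\\\"123\\\"></div>\"}";

            HuxiuPagingResponse response = new Gson().fromJson(jsonText, HuxiuPagingResponse.class);
            System.out.println(response.getLast_dateline()); // used for the next paging request
            System.out.println(response.getData());          // HTML fragment containing data-aid divs
        }
    }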

HuXiuSpider, the program's main entry point

public class HuXiuSpider {
    // DAO for saving data
    public static ArticleDao articleDao = new ArticleDao();
    // dateLine is used for the paging requests
    private static String dateLine = null;
    // Fixed-size thread pool (download, parse, store)
    private static ExecutorService threadPool = Executors.newFixedThreadPool(30);
    // Queue --- article aids parsed from the index page and the paging responses
    public static ArrayBlockingQueue<String> urlQueue = new ArrayBlockingQueue<String>(1000);
    // Queue --- the HTML document downloaded for each article
    public static ArrayBlockingQueue<String> articleHtmlQueue = new ArrayBlockingQueue<String>(1000);
    // Queue --- the parsed content of each article, i.e. Article objects
    public static ArrayBlockingQueue<Article> articleContentQueue = new ArrayBlockingQueue<Article>(1000);

    public static void main(String[] args) throws Exception {
        // Submit the threads that perform the network request for each article URL
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ProcessSinglePageRunnable());
        }
        // Parse the pages
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ParseHtmlRunnable());
        }
        // Save the data
        threadPool.execute(new SaveDBRunnable());
        // Get the article URL list from the index page
        getIndexArticleUrlList();
        // Load the paginated article lists
        processPaging();
    }

    /**
     * Get the article list from the index page
     *
     * @throws IOException
     * @throws ClientProtocolException
     */
    private static void getIndexArticleUrlList() throws IOException, ClientProtocolException {
        // 1. The index URL: http://www.huxiu.com
        String indexUrl = "http://www.huxiu.com";
        // 2. Build an HttpGet request
        HttpGet indexHttpGet = new HttpGet(indexUrl);
        // Set the User-Agent
        indexHttpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
        String html = getHtmlByRequest(indexHttpGet);
        // 5. Parse with Jsoup to get the article list and each article's aid
        Document indexDocument = Jsoup.parse(html);
        // Get the date_line
        Elements dateLines = indexDocument.select("[data-last_dateline]");
        dateLine = dateLines.get(0).attr("data-last_dateline");
        // 5.1 Select the divs carrying the data-aid attribute
        Elements aidElements = indexDocument.select("div[data-aid]");
        // 5.2 Extract the aid of each article
        for (Element element : aidElements) {
            String aid = element.attr("data-aid");
            try {
                urlQueue.put(aid);
            } catch (InterruptedException e) {
                System.out.println("Failed to add aid to urlQueue: " + e);
            }
        }
    }

    private static void processPaging() {
        for (int page = 2; page <= 1615; page++) {
            try {
                // Build the paging request
                String pagingUrl = "https://www.huxiu.com/v2_action/article_list";
                HttpPost httpPost = new HttpPost(pagingUrl);
                // Set the form parameters
                ArrayList<NameValuePair> arrayList = new ArrayList<NameValuePair>();
                arrayList.add(new BasicNameValuePair("huxiu_hash_code", "fb7f7403c58c3e8cb45aa47afc204c10"));
                arrayList.add(new BasicNameValuePair("page", page + ""));
                arrayList.add(new BasicNameValuePair("last_dateline", dateLine));
                httpPost.setEntity(new UrlEncodedFormEntity(arrayList));
                // Execute the request
                String jsonText = getHtmlByRequest(httpPost);
                // Map the JSON string to an object
                Gson gson = new Gson();
                HuxiuPagingResponse huxiuPagingResponse = gson.fromJson(jsonText, HuxiuPagingResponse.class);
                // Each response carries the dateLine needed for the next request
                dateLine = huxiuPagingResponse.getLast_dateline();
                // Get the HTML fragment from the response
                String htmlData = huxiuPagingResponse.getData();
                Document doc = Jsoup.parse(htmlData);
                // Select the divs carrying the data-aid attribute
                Elements aidElements = doc.select("div[data-aid]");
                // Extract the aid of each article
                for (Element element : aidElements) {
                    String aid = element.attr("data-aid");
                    urlQueue.put(aid);
                }
            } catch (Exception e) {
                // log.error()
                System.out.println(page);
                System.out.println(e);
            }
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Download the HTML document for an article URL
     *
     * @throws IOException
     * @throws ClientProtocolException
     */
    public static String getHtml(String aidUrl) throws IOException, ClientProtocolException {
        // 2. Build an HttpGet request
        HttpGet indexHttpGet = new HttpGet(aidUrl);
        return getHtmlByRequest(indexHttpGet);
    }

    private static String getHtmlByRequest(HttpRequestBase request) throws IOException, ClientProtocolException {
        // Set the User-Agent request header
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
        // 3. Execute the request with HttpClient to get an entity
        CloseableHttpClient indexHttpClient = HttpClients.createDefault();
        CloseableHttpResponse indexResponse = indexHttpClient.execute(request);
        String html = null;
        if (200 == indexResponse.getStatusLine().getStatusCode()) {
            HttpEntity indexEntity = indexResponse.getEntity();
            // 4. Convert the entity to a string (HTML)
            html = EntityUtils.toString(indexEntity, Charset.forName("utf-8"));
        }
        return html;
    }
}
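
One thing worth noting: getHtmlByRequest creates a new CloseableHttpClient on every call and never closes the client or the response. A possible variant using try-with-resources (a sketch, not part of the original demo) releases both automatically:

    private static String getHtmlByRequest(HttpRequestBase request) throws IOException {
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
        // try-with-resources closes both the client and the response, even when an exception is thrown
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(request)) {
            if (200 == response.getStatusLine().getStatusCode()) {
                // Consume the entity and convert it to a UTF-8 string
                return EntityUtils.toString(response.getEntity(), Charset.forName("utf-8"));
            }
            return null;
        }
    }

Reusing a single shared client instance would be even better for throughput, but then the client has to be closed once the crawl finishes.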

ProcessSinglePageRunnable, which performs the network request for each article URL

public class ProcessSinglePageRunnable implements Runnable {

    public void run() {
        while (true) {
            try {
                processSingleUrl();
                Thread.sleep(3000);
            } catch (InterruptedException e) {
            }
        }
    }

    private void processSingleUrl() throws InterruptedException {
        String aid = HuXiuSpider.urlQueue.take();
        String aidUrl = "http://www.huxiu.com/article/" + aid + ".html";
        try {
            /*Article article = new Article();
            article.setId(aid);*/
            // Download the HTML of the single article page
            String aidHtml = HuXiuSpider.getHtml(aidUrl);
            HuXiuSpider.articleHtmlQueue.put(aidHtml);
        } catch (Exception e) {
            System.out.println(aidUrl);
            System.out.println(e);
        }
    }
}

ParseHtmlRunnable, which parses each article page

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class ParseHtmlRunnable implements Runnable {

    public void run() {
        while (true) {
            String html = null;
            try {
                html = HuXiuSpider.articleHtmlQueue.take();
            } catch (InterruptedException e1) {
                e1.printStackTrace();
            }
            Article article = new Article();
            Document detailDocument = Jsoup.parse(html);
            // Select the div carrying the data-aid attribute
            Elements aidElements = detailDocument.select("div[data-aid]");
            String aid = aidElements.get(0).attr("data-aid");
            article.setId(aid);
            System.out.println(aid + ".........");
            // Parse the article title
            Elements titles = detailDocument.select(".t-h1");
            String title = titles.get(0).text();
            article.setTitle(title);
            // Parse the article author (author-name)
            Elements names = detailDocument.select(".author-name");
            String name = names.get(0).text();
            article.setAuthor(name);
            // Parse the publication time
            Elements dates = detailDocument.select("[class^=article-time]");
            String date = dates.get(0).text();
            article.setCreateTime(date);
            // Parse the share/favorite count
            Elements shares = detailDocument.select("[class^=article-share]");
            String share = shares.get(0).text();
            article.setSc(share);
            // Parse the comment count
            Elements pls = detailDocument.select("[class^=article-pl]");
            String pl = pls.get(0).text();
            article.setPl(pl);
            // Parse the like count (num)
            Elements nums = detailDocument.select(".num");
            String num = nums.get(0).text();
            article.setZan(num);
            // Parse the article body (article-content-wrap)
            Elements content = detailDocument.select(".article-content-wrap p");
            String contentText = content.text();
            article.setContent(contentText);
            // article.setUrl(aidUrl);
            try {
                HuXiuSpider.articleContentQueue.put(article);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}
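
Every select(...).get(0) call above throws an IndexOutOfBoundsException when a page lacks the expected element, so a single unusual page can abort that iteration of the parsing loop. A small defensive helper (a sketch added here, not part of the original code) makes the extraction more robust:

    import org.jsoup.nodes.Document;
    import org.jsoup.select.Elements;

    public final class SelectorUtil {

        private SelectorUtil() {
        }

        // Returns the text of the first element matching cssQuery, or "" if nothing matches
        public static String firstText(Document doc, String cssQuery) {
            Elements elements = doc.select(cssQuery);
            return elements.isEmpty() ? "" : elements.first().text();
        }

        // Returns the named attribute of the first matching element, or "" if nothing matches
        public static String firstAttr(Document doc, String cssQuery, String attrName) {
            Elements elements = doc.select(cssQuery);
            return elements.isEmpty() ? "" : elements.first().attr(attrName);
        }
    }

With this helper, for example, article.setTitle(titles.get(0).text()) becomes article.setTitle(SelectorUtil.firstText(detailDocument, ".t-h1")), and a missing element produces an empty column instead of an exception.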

SaveDBRunnable, which saves the data

public class SaveDBRunnable implements Runnable {

    public void run() {
        while (true) {
            try {
                Article article = HuXiuSpider.articleContentQueue.take();
                HuXiuSpider.articleDao.save(article);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}
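
One caveat: ArticleDao.save can throw Spring's unchecked DataAccessException (for example when a value exceeds varchar(250)), which would terminate this worker thread. A slightly more defensive variant (a sketch, not the original code) keeps the writer loop alive:

    public class SaveDBRunnable implements Runnable {

        public void run() {
            while (true) {
                try {
                    Article article = HuXiuSpider.articleContentQueue.take();
                    HuXiuSpider.articleDao.save(article);
                } catch (InterruptedException e) {
                    // Restore the interrupt flag and let the worker exit
                    Thread.currentThread().interrupt();
                    return;
                } catch (Exception e) {
                    // Log and keep consuming; one bad row should not kill the writer thread
                    System.out.println("Failed to save article: " + e);
                }
            }
        }
    }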