Java多线程B站爬虫与45万条视频数据,mysql批量写入性能对比,附代码

来源:互联网 发布:真有外星人吗 知乎 编辑:程序博客网 时间:2024/06/04 18:34

恩,萌新刚来,听学长说写博客可以总结梳理自己的知识,所以来试试,自娱自乐,不喜莫喷。目前还是大二狗,学Java半年多,错误很多,望大神指正。微笑

本文涉及:Java多线程,单例模式,爬虫相关技术,MySQL,JDBC,SQL优化


刚学Java还没学sql时写过一个B站爬虫,但是由于当时临时学的sql写入效率太低,爬取45万条数据用了四五个小时。最近sql与Java有些进步,就优化了一下爬虫,这次没有再爬太多视频数据(爬太多B站会封IP),结果是6分钟8万条数据,还算满意。我把我的设计思路分享一下,供萌新们参考,大神轻点打脸(大哭)


因为限制爬虫速度的第一是网速,第二是数据库,但是他们对硬件占用不冲突,所以我就打算把他们分到不同的线程中:

sql插入优化在另一篇文章中:


核心流程:

step1:爬取数据:因为从网上获取数据有很大的延迟,线程很长时间处于等待状态,对计算机压力不大,所以尽量多开几个线程,我一般开5个

step2:解析并暂存:不管多少个线程,爬取后都把json中需要的数据解析成本地的数据类,全部存入一个队列中

step3:数据库写入:数据库只开一个线程,因为写入占用硬盘,限制条件是硬盘速度,多线程意义不大。扫描step2中的队列,如果不为空,从头部取出一个并从队列中移除,然后编入数据库批量写入命令中,达到预定的阈值后一次写入多条。


优点:爬取和写入分开,写入时爬取线程不会处于等待状态,系统利用率高

缺点:系统资源占用高


核心算法确定完毕,具体操作如下:

首先用Fiddler4找到接口:


在不断地访问B站后找到了加载视频列表的API

然后访问不同的分区,观察规律,最后总结出获取不同分区视频列表的API函数

    /**
     * Sub-category ("tid") ids the crawler walks through, one row per parent area.
     * The last row repeats the first eleven ids, which brings the array to 60 entries.
     * NOTE(review): the repetition looks like padding so the list splits evenly between
     * worker threads; callers address this array by index, so confirm before de-duplicating.
     */
    public static final int[] categories = {
            22, 26, 126, 127,                 // kichiku
            157, 158, 164, 159,               // fashion
            71, 137, 131,                     // entertainment
            24, 25, 47, 27,                   // animation
            33, 32, 153, 51, 152,             // bangumi
            28, 31, 30, 59, 29, 54, 130,      // music
            20, 154, 156,                     // dance
            17, 65, 136, 19, 121,             // games
            37, 124, 122, 39, 96, 95, 98,     // technology
            138, 21, 76, 75, 161, 162, 163,   // life
            22, 26, 126, 127, 157, 158, 164, 159, 71, 137, 131  // repeat of the first eleven ids
    };

    /**
     * Builds the archive-rank API URL for one page of one sub-category.
     *
     * @param categories sub-category id, substituted into the {@code tid} query parameter
     * @param pagenum    page number, substituted into the {@code pn} query parameter
     * @return the request URL as a string
     */
    public static String GetUrl(int categories, int pagenum) {
        String template =
                "http://api.bilibili.com/archive_rank/getarchiverankbypartion?callback=?&type=jsonp&tid=%d&pn=%d&_=?";
        return String.format(template, categories, pagenum);
    }

categories对应不同的小区(例如鬼畜大区下的鬼畜调教小区对应22,我没发现大区的API,发现的同学请在留言中提到,万分感谢)

懒得找的同学可以用我找到的信息:

// Maps a sub-category id to its display name, parent area name and parent area number.
// Every branch fills avType and returns it; ids not listed here fall through to the
// "unknown" default (area number 99).
// Three display names were misspelled and are corrected here:
// tid 25 "MD·3D" -> "MMD·3D", tid 65 "网友" -> "网游", tid 121 "GWM" -> "GMV".
switch (category) {
    // Area 1: animation
    case 24: avType.setTypeName("动画--MAD·AMV"); avType.setArea("动画"); avType.setAreaNum(1); return avType;
    case 25: avType.setTypeName("动画--MMD·3D"); avType.setArea("动画"); avType.setAreaNum(1); return avType;
    case 47: avType.setTypeName("动画--短片·手书·配音"); avType.setArea("动画"); avType.setAreaNum(1); return avType;
    case 27: avType.setTypeName("动画--综合"); avType.setArea("动画"); avType.setAreaNum(1); return avType;
    // Area 2: bangumi
    case 33: avType.setTypeName("番剧--连载动画"); avType.setArea("番剧"); avType.setAreaNum(2); return avType;
    case 32: avType.setTypeName("番剧--完结动画"); avType.setArea("番剧"); avType.setAreaNum(2); return avType;
    case 153: avType.setTypeName("番剧--国产动画"); avType.setArea("番剧"); avType.setAreaNum(2); return avType;
    case 51: avType.setTypeName("番剧--资讯"); avType.setArea("番剧"); avType.setAreaNum(2); return avType;
    case 152: avType.setTypeName("番剧--官方延伸"); avType.setArea("番剧"); avType.setAreaNum(2); return avType;
    // Area 3: music
    case 28: avType.setTypeName("音乐--原创音乐"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    case 31: avType.setTypeName("音乐--翻唱"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    case 30: avType.setTypeName("音乐--VOCALOID·UTAU"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    case 59: avType.setTypeName("音乐--演奏"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    case 29: avType.setTypeName("音乐--三次元音乐"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    case 54: avType.setTypeName("音乐--OP/ED/OST"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    case 130: avType.setTypeName("音乐--音乐选集"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    // Area 4: dance
    case 20: avType.setTypeName("舞蹈--宅舞"); avType.setArea("舞蹈"); avType.setAreaNum(4); return avType;
    case 154: avType.setTypeName("舞蹈--三次元舞蹈"); avType.setArea("舞蹈"); avType.setAreaNum(4); return avType;
    case 156: avType.setTypeName("舞蹈--宅舞教程"); avType.setArea("舞蹈"); avType.setAreaNum(4); return avType;
    // Area 5: games
    case 17: avType.setTypeName("游戏--单机游戏"); avType.setArea("游戏"); avType.setAreaNum(5); return avType;
    case 65: avType.setTypeName("游戏--网游-电竞"); avType.setArea("游戏"); avType.setAreaNum(5); return avType;
    case 136: avType.setTypeName("游戏--音游"); avType.setArea("游戏"); avType.setAreaNum(5); return avType;
    case 19: avType.setTypeName("游戏--Mugen"); avType.setArea("游戏"); avType.setAreaNum(5); return avType;
    case 121: avType.setTypeName("游戏--GMV"); avType.setArea("游戏"); avType.setAreaNum(5); return avType;
    // Area 6: technology
    case 37: avType.setTypeName("科技--纪录片"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    case 124: avType.setTypeName("科技--趣味科普-人文"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    case 122: avType.setTypeName("科技--野生技术协会"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    case 39: avType.setTypeName("科技--演讲-公开课"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    case 96: avType.setTypeName("科技--星海"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    case 95: avType.setTypeName("科技--数码"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    case 98: avType.setTypeName("科技--机械"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    // Area 7: life
    case 138: avType.setTypeName("生活--搞笑"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    case 21: avType.setTypeName("生活--日常"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    case 76: avType.setTypeName("生活--美食圈"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    case 75: avType.setTypeName("生活--动物圈"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    case 161: avType.setTypeName("生活--手工"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    case 162: avType.setTypeName("生活--绘画"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    case 163: avType.setTypeName("生活--运动"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    // Area 8: kichiku
    case 22: avType.setTypeName("鬼畜--鬼畜调教"); avType.setArea("鬼畜"); avType.setAreaNum(8); return avType;
    case 26: avType.setTypeName("鬼畜--音MAD"); avType.setArea("鬼畜"); avType.setAreaNum(8); return avType;
    case 126: avType.setTypeName("鬼畜--人力VOCALOID"); avType.setArea("鬼畜"); avType.setAreaNum(8); return avType;
    case 127: avType.setTypeName("鬼畜--教程演示"); avType.setArea("鬼畜"); avType.setAreaNum(8); return avType;
    // Area 9: fashion
    case 157: avType.setTypeName("时尚--美妆"); avType.setArea("时尚"); avType.setAreaNum(9); return avType;
    case 158: avType.setTypeName("时尚--服饰"); avType.setArea("时尚"); avType.setAreaNum(9); return avType;
    case 164: avType.setTypeName("时尚--健身"); avType.setArea("时尚"); avType.setAreaNum(9); return avType;
    case 159: avType.setTypeName("时尚--资讯"); avType.setArea("时尚"); avType.setAreaNum(9); return avType;
    // Area 10: entertainment
    case 71: avType.setTypeName("娱乐--综艺"); avType.setArea("娱乐"); avType.setAreaNum(10); return avType;
    case 137: avType.setTypeName("娱乐--明星"); avType.setArea("娱乐"); avType.setAreaNum(10); return avType;
    case 131: avType.setTypeName("娱乐--Korea相关"); avType.setArea("娱乐"); avType.setAreaNum(10); return avType;
    // Any id not listed above
    default: avType.setTypeName("未知"); avType.setArea("未知"); avType.setAreaNum(99); return avType;
}

pagenum对应视频列表的第几页,每页有多条数据

例如访问

http://api.bilibili.com/archive_rank/getarchiverankbypartion?callback=?&type=jsonp&tid=22&pn=1&_=?

就会返回一个包含鬼畜调教分区第一页视频信息的json
通过不同的参数访问这个API就会返回包含视频信息的json对象
解析这个json,并取出我们需要的信息保存在ArrayList中,这个就不多说了,有现成的json解析包
如下就是从网上获取json的run方法

package iss2015302580343.whu.homework_5;

import java.util.ArrayList;

/**
 * Created by Tao on 2017/1/6.
 *
 * Worker thread that downloads and parses video-list pages for a slice of
 * {@code BILIBILIValues.categories}: indices {@code startNum} (inclusive) to
 * {@code endNum} (exclusive), pages 1 through {@code page} for each of them.
 */
public class WriterThread extends Thread {
    MySQLAccess mySQLAccess = MySQLAccess.getAccess();
    HttpGetter httpGetter = new HttpGetter();
    JsonReader jsonReader = new JsonReader();
    /** Raw response text of the most recently requested page. */
    String json = null;
    /** Number of pages to request per category. */
    int page;
    /** First index into the category array handled by this thread (inclusive). */
    int startNum;
    /** End index into the category array handled by this thread (exclusive). */
    int endNum;

    /**
     * @param page  how many pages to read for every category
     * @param start first category index to process (inclusive)
     * @param end   category index at which to stop (exclusive)
     */
    public WriterThread(int page, int start, int end) {
        this.page = page;
        this.startNum = start;
        this.endNum = end;
    }

    /** Walks the assigned category indices and, for each one, pages 1..page. */
    @Override
    public void run() {
        for (int categoryIndex = startNum; categoryIndex < endNum; categoryIndex++) {
            for (int pageNumber = 1; pageNumber <= page; pageNumber++) {
                fetchPage(categoryIndex, pageNumber);
            }
        }
    }

    /** Requests one page of one category and parses the response. */
    private void fetchPage(int categoryIndex, int pageNumber) {
        String url = BILIBILIValues.GetUrl(BILIBILIValues.categories[categoryIndex], pageNumber);
        json = httpGetter.GetByString(url);
        ArrayList<AVMode> avModes = jsonReader.jsonRead(json);
        if (avModes == null) {
            // Nothing could be parsed for this page; move on to the next one.
            return;
        }
        // NOTE(review): in this listing the parsed list is not passed on anywhere and
        // mySQLAccess is never used; presumably the hand-off to the database writer
        // was trimmed from the snippet. Confirm against the full project.
    }
}

然后设计一个可以存储所有信息的数据库,也不多说


为了在不同的地方统一操作,我们把数据库操作的类设计为单例模式并实现Runnable接口;waitForWrite就是等待写入的ArrayList:
public void run() {    while (true){            System.out.print("");            if(waitForWrite.size()>0){                for (int i=0;i<waitForWrite.size();i++){                    try {                        writeIn(waitForWrite.get(i));                    }catch (MySQLIntegrityConstraintViolationException exception){                        //跳过重复的                    }                    catch (SQLException e) {                        e.printStackTrace();                    }                    waitForWrite.remove(i);                    System.out.println("++++++++++++++++++");                }            }    }}

writeIn的实现为:
/**
 * Inserts one batch of videos with a single multi-row INSERT statement.
 *
 * Values are bound through {@code ?} placeholders instead of being concatenated
 * into the SQL text, so titles and author names containing quotes or backslashes
 * are stored unchanged and cannot break or alter the statement. The statement is
 * closed once it has executed.
 *
 * Column order written per row: aid, name, play, author, category id, favorites,
 * coins, danmaku, area number.
 *
 * @param avModes the videos to insert; a null or empty list is a no-op
 * @throws SQLException if preparing or executing the statement fails
 */
private void writeIn(ArrayList<AVMode> avModes) throws SQLException {
    if (avModes == null || avModes.isEmpty()) {
        // "INSERT ... values" with no rows is not valid SQL.
        return;
    }

    StringBuilder sql = new StringBuilder("INSERT INTO " + MySQLValues.getTablename() + " values ");
    for (int row = 0; row < avModes.size(); row++) {
        sql.append(row == 0 ? "(?,?,?,?,?,?,?,?,?)" : ",(?,?,?,?,?,?,?,?,?)");
    }

    try (PreparedStatement preparedStatement = connection.prepareStatement(sql.toString())) {
        int index = 1;
        for (AVMode avMode : avModes) {
            preparedStatement.setObject(index++, avMode.getAid());
            preparedStatement.setObject(index++, avMode.getName());
            preparedStatement.setObject(index++, avMode.getPlay());
            preparedStatement.setObject(index++, avMode.getAuther());
            preparedStatement.setObject(index++, avMode.getCategories());
            preparedStatement.setObject(index++, avMode.getFavorites());
            preparedStatement.setObject(index++, avMode.getCoins());
            preparedStatement.setObject(index++, avMode.getDanmaku());
            preparedStatement.setObject(index++,
                    BILIBILIValues.CategoriesNames(avMode.getCategories()).getAreaNum());
        }
        preparedStatement.execute();
    }
}

每一页多条数据一次写入 ,速度加快了10倍


拿到数据后就可以干很多事了:

例如加权排序:

/**
 * Looks up the videos of one area and orders them by a weighted score:
 * 40% coins, 40% favorites, 20% danmaku count.
 *
 * The row limit is bound from {@code num}; it used to be hard-coded to 10,
 * which ignored the parameter.
 *
 * @param area the top-level area number to filter on
 * @param num  how many rows to return at most; negative values are treated as 0
 * @return the matching rows, highest score first, or {@code null} if the query failed.
 *         The statement is deliberately not closed here because closing it would
 *         also close the returned result set.
 */
public ResultSet FindArea(int area, int num) {
    ResultSet result = null;
    try {
        String sql = "SELECT * FROM " + MySQLValues.getTablename() +
                " WHERE area = ? ORDER by (favorites * 4 + coins * 4 + danmaku * 2 ) desc" +
                " Limit ? ; ";
        PreparedStatement preparedStatement = connection.prepareStatement(sql);
        preparedStatement.setString(1, String.valueOf(area));
        // LIMIT must be bound as an integer; a string binding would be quoted and rejected.
        preparedStatement.setInt(2, Math.max(num, 0));
        result = preparedStatement.executeQuery();
    } catch (SQLException e) {
        e.printStackTrace();
    }
    return result;
}
鬼畜区试验:

这是新爬的10万条数据的数据库结果:(算完时间忘关了,多爬了些)


这是以前的45万条数据的数据库:

不信的看数据库大小:



最后附上项目代码,lib都在里面https://git.oschina.net/iss2015302580343/bilibilipachong.git

                                             
0 0
原创粉丝点击