Python自动化拉取Mysql数据并装载到Hive(V2.0)
来源:互联网 发布:布卡直播软件 编辑:程序博客网 时间:2024/06/08 20:03
上一版本:http://blog.csdn.net/babyfish13/article/details/70792158
相较于之前的版本,主要作了两方面的优化:数据字符回车换行导致装载错误的问题修复及跑批调用参数等方面的优化。
此版本,主要考虑了串行调度、全量加载,并行、增量装载将在下一个版本中优化。
1、数据装载的表级参数配置文件
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive-Auto/batchPar.conf
2、数据装载的系统级参数配置脚本
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive-Auto/systemParGet.py
3、源表结构及元数据信息的获取
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive-Auto/getSrcMetadata.py
4、hive库表结构的创建
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive-Auto/hiveTabCreate.py
5、Mysql数据向Hive装载
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive-Auto/mysqlData2Hive.py
6、串行总控调度
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive-Auto/mysql2HiveSerialCtl.py
相较于之前的版本,主要作了两方面的优化:数据字符回车换行导致装载错误的问题修复及跑批调用参数等方面的优化。
此版本,主要考虑了串行调度、全量加载,并行、增量装载将在下一个版本中优化。
1、数据装载的表级参数配置文件
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive-Auto/batchPar.conf
# BatchParConfig file ----------------------------------# srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='game_zone', tabType='single'# srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='live_history_status', tabType='submeter-256'srcMysql_config=srcMysqlConfig_jellyfish_user, src_tabName='user_profile', tabType='submeter-256'# srcMysql_config=srcMysqlConfig_jellyfish_seed, src_tabName='room', tabType='single'# srcMysql_config=srcMysqlConfig_jellyfish_user, src_tabName='user_id_card', tabType='single'srcMysql_config=srcMysqlConfig_jellyfish_seed, src_tabName='game', tabType='single'srcMysql_config=srcMysqlConfig_jellyfish_event, src_tabName='match_apply', tabType='single'# srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='user_daily_sign_record', tabType='single'# srcMysql_config=srcMysqlConfig_jellyfish_event, src_tabName='event_online_count', tabType='single'# srcMysql_config=srcMysqlConfig_jellyfish_event, src_tabName='event_award_201611', tabType='single'srcMysql_config=srcMysqlConfig_jellyfish_hadoop_stat, src_tabName='room_group', tabType='single'
2、数据装载的系统级参数配置脚本
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive-Auto/systemParGet.py
# -*- coding=utf-8 -*-import warningsimport datetimewarnings.filterwarnings("ignore")# src Database configsrcMysqlConfig_Tv_server = { 'host': 'MysqlHostInnerIp', # 'host': 'MysqlHostOuterIp', 'user': 'MysqlUser', 'passwd': 'MysqlPass', 'port': 50506, 'db': 'Tv_server'}srcMysqlConfig_Tv_user = { 'host': 'MysqlHostInnerIp', # 'host': 'MysqlHostOuterIp', 'user': 'MysqlUser', 'passwd': 'MysqlPass', 'port': 50514, 'db': 'Tv_user'}srcMysqlConfig_Tv_seed = { 'host': 'MysqlHostInnerIp', # 'host': 'MysqlHostOuterIp', 'user': 'MysqlUser', 'passwd': 'MysqlPass', 'port': 50029, 'db': 'Tv_seed'}srcMysqlConfig_Tv_event = { 'host': 'MysqlHostInnerIp', # 'host': 'MysqlHostOuterIp', 'user': 'MysqlUser', 'passwd': 'MysqlPass', 'port': 50512, 'db': 'Tv_event'}srcMysqlConfig_Tv_hadoop_stat = { 'host': 'MysqlHostInnerIp', # 'host': 'MysqlHostOuterIp', 'user': 'MysqlUser', 'passwd': 'MysqlPass', 'port': 6605, 'db': 'Tv_hadoop_stat'}tmp_data_dir = '/home/hadoop/nisj/Mysql2Hive-Auto/tmp_data'def getNowDay(): DayNow = datetime.datetime.today().strftime('%Y-%m-%d') return DayNowdef dateRange(beginDate, endDate): dates = [] dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d") date = beginDate[:] while date <= endDate: dates.append(date) dt = dt + datetime.timedelta(1) date = dt.strftime("%Y-%m-%d") return datesdef getSrcMysqlConfig(srcMysql_config): srcMysql_config = srcMysql_config return srcMysql_config['host'], srcMysql_config['port'], srcMysql_config['user'], srcMysql_config['passwd'], srcMysql_config['db']# print getSrcMysqlConfig(srcMysql_config=srcMysqlConfig_Tv_server)
3、源表结构及元数据信息的获取
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive-Auto/getSrcMetadata.py
# -*- coding=utf-8 -*-import osimport refrom systemParGet import *warnings.filterwarnings("ignore")def mysqlTabCreateScript(srcMysql_config, src_tabName, tabType): # 参数初始化赋值 host = getSrcMysqlConfig(srcMysql_config)[0] port = getSrcMysqlConfig(srcMysql_config)[1] user = getSrcMysqlConfig(srcMysql_config)[2] passwd = getSrcMysqlConfig(srcMysql_config)[3] db = getSrcMysqlConfig(srcMysql_config)[4] if tabType == 'single': srcTabName = src_tabName elif 'submeter' in tabType: srcTabName = src_tabName + "_0" srcTabStructure = os.popen("""source /etc/profile; \ /usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} \ -N -e"set names utf8; \ select a2.column_name,case when a2.data_type like '%int' then 'bigint' else 'string' end data_type from information_schema.TABLES a1 left join information_schema.columns a2 on a1.TABLE_SCHEMA=a2.TABLE_SCHEMA and a1.TABLE_NAME=a2.TABLE_NAME where a1.TABLE_SCHEMA='{db}' and a1.table_name ='{srcTabName}' order by a2.ORDINAL_POSITION;" \ """ .format(host=host, port=port, user=user, passwd=passwd, db=db, srcTabName=srcTabName)).readlines(); srcTabCol_list = [] for stcList in srcTabStructure: stc = re.split('\t', stcList.replace('\n', '')) srcTabCol_list.append(stc) TabCreateScript = 'drop table if exists xxx_{src_tabName};\ncreate table xxx_{src_tabName}(\n'.format(src_tabName=src_tabName) colList = [] for srcColType in srcTabCol_list: TabCreateScript = TabCreateScript + '\`' + srcColType[0] + '\`' + ' ' + srcColType[1] + ',\n' colList.append(srcColType[0]) TabCreateScript = TabCreateScript[:-2]+")row format delimited fields terminated by '\t' lines terminated by '\n';;" return TabCreateScript, colList# Batch Test
4、hive库表结构的创建
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive-Auto/hiveTabCreate.py
# -*- coding=utf-8 -*-from getSrcMetadata import *warnings.filterwarnings("ignore")def HiveCreateTab(srcMysql_config, src_tabName, tabType): TabCreateScript = mysqlTabCreateScript(srcMysql_config, src_tabName, tabType)[0] os.system("""/usr/lib/hive-current/bin/hive -e "{TabCreateScript}" """.format(TabCreateScript=TabCreateScript))# Batch Test
5、Mysql数据向Hive装载
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive-Auto/mysqlData2Hive.py
# -*- coding=utf-8 -*-from getSrcMetadata import *warnings.filterwarnings("ignore")def mysqlDataDownload(srcMysql_config, src_tabName, tabType): # 参数初始化赋值 host = getSrcMysqlConfig(srcMysql_config)[0] port = getSrcMysqlConfig(srcMysql_config)[1] user = getSrcMysqlConfig(srcMysql_config)[2] passwd = getSrcMysqlConfig(srcMysql_config)[3] db = getSrcMysqlConfig(srcMysql_config)[4] if os.path.exists('{tmp_data_dir}/'.format(tmp_data_dir=tmp_data_dir)) == False: os.system('mkdir -p {tmp_data_dir}/'.format(tmp_data_dir=tmp_data_dir)) colList = mysqlTabCreateScript(srcMysql_config, src_tabName, tabType)[1] allColChars = '' for colName in colList: allColChars = allColChars + 'replace(replace(replace(replace(\`' + colName + '\`,\',\',\'[comma]\'),\'\\n\',\'[newline-n]\'),\'\\r\',\'[newline-r]\'),\'\\t\',\'[tab]\')' + ',' if tabType == 'single': submeter_cnt = 1 elif 'submeter' in tabType: submeter_cnt = int(str(tabType).replace('submeter-', '')) os.system("rm -rf {tmp_data_dir}/xxx_{src_tabName}.txt".format(src_tabName=src_tabName, tmp_data_dir=tmp_data_dir)) for submeterPlus in range(0, submeter_cnt, 1): if submeter_cnt == 1: submeterPlus = '' else: submeterPlus = "_" + str(submeterPlus) os.system("""source /etc/profile; \ /usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} \ -N -e"set names utf8; \ select {allColChars} from {db}.{src_tabName}{submeterPlus} ;" \ >>{tmp_data_dir}/xxx_{src_tabName}.txt \ """.format(host=host, port=port, user=user, passwd=passwd, db=db, src_tabName=src_tabName, tmp_data_dir=tmp_data_dir, allColChars=allColChars[:-1], submeterPlus=submeterPlus))def DataUploadHive(src_tabName): os.system("""/usr/lib/hive-current/bin/hive -e "load data local inpath '{tmp_data_dir}/xxx_{src_tabName}.txt' overwrite into table xxx_{src_tabName};" """.format(src_tabName=src_tabName, tmp_data_dir=tmp_data_dir)) os.system("rm -rf {tmp_data_dir}/xxx_{src_tabName}.txt ".format(src_tabName=src_tabName, tmp_data_dir=tmp_data_dir))def MysqlData2hive(srcMysql_config, src_tabName, tabType): mysqlDataDownload(srcMysql_config, src_tabName, tabType) DataUploadHive(src_tabName)# Batch Test
6、串行总控调度
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive-Auto/mysql2HiveSerialCtl.py
# -*- coding=utf-8 -*-from hiveTabCreate import *from mysqlData2Hive import *warnings.filterwarnings("ignore")def Mysql2Hive(srcMysql_config, src_tabName, tabType): HiveCreateTab(srcMysql_config, src_tabName, tabType) MysqlData2hive(srcMysql_config, src_tabName, tabType)def Mysql2HiveCtl(): with open("batchPar.conf") as ConfigFile: ConfigFileList = [] for ConfigLine in ConfigFile: if '#' not in ConfigLine and ConfigLine.replace('\n', '') != '': ConfigFileList.append(ConfigLine.replace('\n', '')) srcMysql_config = src_tabName = tabType = '' for ConfigLine in ConfigFileList[1:]: for par in str(ConfigLine).split(', '): exec(par) Mysql2Hive(srcMysql_config=srcMysql_config, src_tabName=src_tabName, tabType=tabType)# batch_testMysql2HiveCtl()
阅读全文
0 0
- Python自动化拉取Mysql数据并装载到Hive(V2.0)
- Python自动化拉取Mysql数据并装载到Hive
- Python自动化拉取Mysql数据并装载到Hive
- Python自动化拉取Mysql数据并装载到Hive(V3.0)
- Python自动化拉取Mysql数据并装载到Oracle
- Python将Mysql分表数据按小时增量装载到Hive示例
- 使用Python脚本从Hive中取数据计算后加载到Mysql示例
- 用Python将json数据装载到mysql
- Hive搭建并将元数据存储到MySQL
- Hive装载数据命令
- Python从阿里云Oss拉数据写入Hive表并进行相关处理
- Python爬取数据并写入MySQL数据库
- hive——拉取数据的一个实例
- shell拉取MYSQL数据库数据
- python 远程连接MySQL数据库 拉取数据存至本地文件
- hive 表装载数据详解
- hive数据去重,并取指定的一条数据
- #python学习笔记#使用python爬取网站数据并保存到数据库
- Spring Boot动态监听jsp文件
- 在线云评测系统日志(十二):杂项
- 数据结构-哈希表的一点小总结
- Git教程 —— 修改远程提交
- nginx php-fpm 输出php错误日志
- Python自动化拉取Mysql数据并装载到Hive(V2.0)
- shell 基础
- 使用开源的card.io 扫描识别银行卡
- CSS选择器之【组合选择器】
- cgi 小科普
- 002初入C/C++之程序调试
- srping-集成mybatis
- MySQL为表的所有字段添加数据
- C# Dictionary 终极使用方法