用Python将统计数据不存在的记录按维度对应指标补齐(续:日数据情形)

来源:互联网 发布:数据采集系统解决方案 编辑:程序博客网 时间:2024/05/01 16:00
接【Python多线程跑Hive批一例】-->http://blog.csdn.net/babyfish13/article/details/54906405
及【用Python将统计数据不存在的记录按维度对应指标补齐】-->http://blog.csdn.net/babyfish13/article/details/54632508

本文主要是将数据按日存储的结果表有些维度为空的记录给补起来,与按周的又有所不同。
1、用到的日期临时表
show create table bitmp_date_list;
CREATE TABLE `bitmp_date_list` (  `str_date` varchar(10) DEFAULT NULL) ENGINE=InnoDB DEFAULT CHARSET=utf8;

2、改进及更新后的日期处理脚本
/Users/nisj/PycharmProjects/EsDataProc/RemainProcByDay/DayProc.py
# -*- coding=utf-8 -*-import warningsimport datetimewarnings.filterwarnings("ignore")def getNowDay():    DayNow = datetime.datetime.today().strftime('%Y-%m-%d')    return DayNowdef getFristDay():    FristDay=datetime.datetime.strptime('2015-06-29', '%Y-%m-%d').strftime('%Y-%m-%d')    return FristDaydef getDayAllList(run_day):    DayAllList=[]    begin_date = datetime.datetime.strptime(getFristDay(), "%Y-%m-%d")    if run_day != '':        end_date = datetime.datetime.strptime(run_day, '%Y-%m-%d')    else:        end_date = datetime.datetime.strptime(getNowDay(), "%Y-%m-%d")    while begin_date <= end_date:        date_str = begin_date.strftime("%Y-%m-%d")        DayAllList.append(date_str)        begin_date += datetime.timedelta(days=1)    return DayAllListdef getDayRunList(run_day):    DayRunList=[]    if run_day != '':        end_date = datetime.datetime.strptime(run_day, '%Y-%m-%d')    else:        end_date = datetime.datetime.strptime(getNowDay(), "%Y-%m-%d")    if end_date - datetime.timedelta(days=60) <= datetime.datetime.strptime(getFristDay(), "%Y-%m-%d"):        begin_date = datetime.datetime.strptime(getFristDay(), "%Y-%m-%d")    else:        begin_date = end_date - datetime.timedelta(days=60)    while begin_date <= end_date:        date_str = begin_date.strftime("%Y-%m-%d")        DayRunList.append(date_str)        begin_date += datetime.timedelta(days=1)    return DayRunListdef getDayRunListReverse(run_day):    DayRunList=[]    begin_date = datetime.datetime.strptime(run_day, '%Y-%m-%d') + datetime.timedelta(days=1)    if begin_date + datetime.timedelta(days=60) >= datetime.datetime.strptime(getNowDay(), "%Y-%m-%d") - datetime.timedelta(days=1):        end_date = datetime.datetime.strptime(getNowDay(), "%Y-%m-%d") - datetime.timedelta(days=1)    else:        end_date = begin_date + datetime.timedelta(days=60)    while begin_date <= end_date:        date_str = begin_date.strftime("%Y-%m-%d")        DayRunList.append(date_str)        begin_date += datetime.timedelta(days=1)    return DayRunListdef getDayRemainRunIntervalList(str_begin_date, str_end_date):    DayList=[]    begin_date = datetime.datetime.strptime(str_begin_date, "%Y-%m-%d")    end_date = datetime.datetime.strptime(str_end_date, "%Y-%m-%d")    while begin_date <= end_date:        date_str = begin_date.strftime("%Y-%m-%d")        DayList.append(date_str)        begin_date += datetime.timedelta(days=1)    return DayList# Batch Test# run_day = '2016-07-10'# for remain_day in getDayAllList(run_day):#     for his_day in getDayRunList(run_day=remain_day):#         print remain_day, his_day# one day# for his_day in getDayRunList(run_day=run_day):#     print run_day, his_day# run_day = '2017-01-01'# for remain_day in getDayRunList(run_day):#     print remain_day,run_day    # for his_day in getDayRunList(run_day=remain_day):    #     print remain_day, his_day# 计算2017年的数据信息# str_begin_date = '2017-01-01'# str_end_date = '2017-02-05'# for remain_day in getDayRemainRunIntervalList(str_begin_date, str_end_date):#     for his_day in getDayRunList(run_day=remain_day):#         print remain_day,his_day

3、空记录查找及插入
/Users/nisj/PycharmProjects/EsDataProc/RemainProcByDay/Daily_result_null_proc.py
# -*- coding=utf-8 -*-import osimport refrom DayProc import *def Insert_TmpDate_Data(run_day):    os.system("""mysql -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass -N -e "use funnyai_data; truncate table bitmp_date_list; " """ )    for date_str in getDayRunListReverse(run_day):        os.system("""mysql -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass -N -e "use funnyai_data; \                            insert into bitmp_date_list(str_date) \                            select '%s'; \                             " """ % (date_str))def resultNullProc(run_day):    result_data = os.popen("""mysql -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass -N -e "use funnyai_data; \                select appkey,app_source,date \                from chushou_user_compare_daily_stat \                where date='%s' and app_source <>'' \                group by appkey,app_source,date; \                 " """ % (run_day)).readlines();    nrpd_list = []    for nrp_list in result_data:        nrp = re.split('\t', nrp_list.replace('\n', ''))        nrpd_list.append(nrp)    for nrpd in nrpd_list:        appkey = nrpd[0]        appsource = nrpd[1]        date = nrpd[2]        result_data1 = os.popen("""mysql -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass -N -e "use funnyai_data; \                        select '%s' date,a1.str_date compare_date,'%s' appkey,'%s' app_source \                        from bitmp_date_list a1 \                        left join ( \                        select appkey,app_source,date,compare_date \                        from chushou_user_compare_daily_stat \                        where date='%s' and appkey='%s' and app_source='%s' \                        ) a2 on a1.str_date=a2.compare_date \                        where a2.appkey is null \                        order by a2.date,a1.str_date;" """ % (date, appkey, appsource, date, appkey, appsource)).readlines();        nrpd_list = []        for nrp_list in result_data1:            nrp = re.split('\t', nrp_list.replace('\n', ''))            nrpd_list.append(nrp)        for nrpd in nrpd_list:            date = nrpd[0]            compare_date = nrpd[1]            appkey = nrpd[2]            appsource = nrpd[3]            print date,compare_date,appkey,appsource            os.system("""mysql -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass -N -e "use funnyai_data; \                    insert into chushou_user_compare_daily_stat(appkey,app_source,date,compare_date,compare_identify_count,compare_register_user_count,compare_user_count,compare_pay_amount,compare_pay_user_count,compare_pay_count) \                    select '%s','%s','%s','%s','%d','%d','%d','%d','%d','%d'; \                     " """ % (appkey, appsource, date, compare_date, 0, 0, 0, 0, 0, 0))# run_day = '2017-01-01'# Insert_TmpDate_Data(run_day)# resultNullProc(run_day)for run_day in getDayRemainRunIntervalList('2017-01-02', '2017-01-03'):    print run_day    Insert_TmpDate_Data(run_day)    resultNullProc(run_day)


0 0
原创粉丝点击