用Python在Mysql与Hive间数据计算一例

来源:互联网 发布:辐射4 画面优化设置 编辑:程序博客网 时间:2024/05/29 19:50
1、自带粉丝的计算
功能说明:
从mysql中取房间号roomId,然后在Hive中计算其首播日期,将首播日期返插回到Mysql中;然后取Mysql中的roomId和首播日期,在Hive中计算主播的自带粉丝数据;最后,将自带粉丝数更新到Mysql中。
/Users/nisj/PycharmProjects/BiDataProc/Demand/Cc0810/ccQuery.py
# -*- coding=utf-8 -*-import datetimeimport osimport warningsimport sysimport rereload(sys)sys.setdefaultencoding('utf8')warnings.filterwarnings("ignore")yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')def get10DayRangeAndYesterday(first_rec_date):    day10Start=(datetime.datetime.strptime(first_rec_date, '%Y-%m-%d') - datetime.timedelta(days=2)).strftime('%Y-%m-%d')    day10End=(datetime.datetime.strptime(first_rec_date, '%Y-%m-%d') + datetime.timedelta(days=7)).strftime('%Y-%m-%d')    yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')    return day10Start, day10End, yesterdaydef getRoomId():    roomIds = os.popen("""source /etc/profile; \                /usr/bin/mysql  -h199.199.199.199 -P6605 -umysqlUser -pmysqlPass -N -e "select room_id  \                from jellyfish_hadoop_stat.invite_anchor  \                where fans_count=0; \                " """).readlines();    roomId_list = []    for roomIdList in roomIds:        roomId = re.split('\t', roomIdList.replace('\n', ''))        roomId_list.append(roomId)    return roomId_listdef getFirstRecDatesAndUpdate2Mysql():    roomId_con=''    for roomId in getRoomId():        roomId_con = roomId_con + ',' + roomId[0]    roomId_con = roomId_con[1:]    first_rec_dates=os.popen("""source /etc/profile; \                /usr/lib/hive-current/bin/hive -e "select room_id,switch_date from \                (select room_id,switch_date,cast(sum(hour(time_minus)*60+minute(time_minus)+second(time_minus)/60) as bigint) live_duration,row_number()over(partition by room_id order by switch_date) rk \                from ( \                select room_id,substr(switch_time,1,10) switch_date,cast(updated_time as timestamp)-cast(switch_time as timestamp) time_minus \                from xxx_live_history_status \                where room_id in({roomId_con})) x \                group by room_id,switch_date \                having cast(sum(hour(time_minus)*60+minute(time_minus)+second(time_minus)/60) as bigint)>=120) xx \                where rk=1; \                " """.format(roomId_con=roomId_con)).readlines();    first_rec_date_list = []    for first_rec_dateList in first_rec_dates:        first_rec_date = re.split('\t', first_rec_dateList.replace('\n', ''))        first_rec_date_list.append(first_rec_date)    for first_rec_date in first_rec_date_list:        roomId=first_rec_date[0]        first_rec_date=first_rec_date[1]        os.system("""source /etc/profile; \                    /usr/bin/mysql  -h199.199.199.199 -P6605 -umysqlUser -pmysqlPass -e "update jellyfish_hadoop_stat.invite_anchor \                    set first_rec_date='{first_rec_date}' \                    where room_id={roomId}; \                    " """.format(roomId=roomId, first_rec_date=first_rec_date));def getRoomIdAndFirstRecDates():    roomIds = os.popen("""source /etc/profile; \                /usr/bin/mysql  -h199.199.199.199 -P6605 -umysqlUser -pmysqlPass -N -e "select room_id,substr(first_rec_date,1,10) first_rec_date  \                from jellyfish_hadoop_stat.invite_anchor  \                where fans_count=0 and first_rec_date<>'2099-12-30 23:59:59'; \                " """).readlines();    roomId_list = []    for roomIdList in roomIds:        roomId = re.split('\t', roomIdList.replace('\n', ''))        roomId_list.append(roomId)    return roomId_listdef getFansCntUpdate2Mysql():    # 自带粉丝数据的计算    for roomId, first_rec_date in getRoomIdAndFirstRecDates():        if first_rec_date <= (datetime.date.today() - datetime.timedelta(days=8)).strftime('%Y-%m-%d'):            day10Start = get10DayRangeAndYesterday(first_rec_date=first_rec_date)[0]            day10End = get10DayRangeAndYesterday(first_rec_date=first_rec_date)[1]            yesterday = get10DayRangeAndYesterday(first_rec_date=first_rec_date)[2]            fans_byself_cnts=os.popen("""source /etc/profile; \                        /usr/lib/hive-current/bin/hive -e "with tab_user_frist_subscriber as (select room_id,uid view_uid,state,created_time \                        from (select room_id,uid,state,created_time,row_number()over(partition by uid order by created_time) rk from oss_room_subscriber_roomid where pt_day='{yesterday}') x \                        where rk=1 and room_id={roomId}), \                        tab_newuser10days as(select uid,nickname,created_time,last_login_time  \                        from oss_chushou_user_profile  \                        where pt_day='{yesterday}' and substr(created_time,1,10) between '{day10Start}' and 'day10End') \                        select a1.room_id,'{first_rec_date}' first_rec_date,count(distinct a2.uid) fans_byself_cnt \                        from tab_user_frist_subscriber a1 \                        inner join tab_newuser10days a2 on a1.view_uid=a2.uid \                        group by a1.room_id; \                        " """.format(day10Start=day10Start, day10End=day10End, yesterday=yesterday, roomId=roomId, first_rec_date=first_rec_date)).readlines();            fans_byself_cnt_list = []            for fans_byself_cntList in fans_byself_cnts:                fans_byself_cnt = re.split('\t', fans_byself_cntList.replace('\n', ''))                fans_byself_cnt_list.append(fans_byself_cnt)            for fans_byself_cnt in fans_byself_cnt_list:                roomId=fans_byself_cnt[0]                fans_byself_cnt=fans_byself_cnt[2]                os.system("""source /etc/profile; \                            /usr/bin/mysql  -h199.199.199.199 -P6605 -umysqlUser -pmysqlPass -e "update jellyfish_hadoop_stat.invite_anchor \                            set fans_count={fans_byself_cnt} \                            where room_id={roomId}; \                            " """.format(roomId=roomId, fans_byself_cnt=fans_byself_cnt));# Batch TestgetFirstRecDatesAndUpdate2Mysql()getFansCntUpdate2Mysql()

2、<1>之前的一个版本
与1功能相同,但代码细节上因为功能逻辑的变更,稍有区别。<1>可能更规范合理一些,<2>可忽略。
/Users/nisj/PycharmProjects/BiDataProc/Demand/Cc0810/ccQuery0813.py
# -*- coding=utf-8 -*-import datetimeimport osimport warningsimport sysimport rereload(sys)sys.setdefaultencoding('utf8')warnings.filterwarnings("ignore")yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')def get10DayRangeAndYesterday(first_rec_date):    day10Start=(datetime.datetime.strptime(first_rec_date, '%Y-%m-%d') - datetime.timedelta(days=2)).strftime('%Y-%m-%d')    day10End=(datetime.datetime.strptime(first_rec_date, '%Y-%m-%d') + datetime.timedelta(days=7)).strftime('%Y-%m-%d')    yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')    return day10Start, day10End, yesterdaydef getRoomId():    roomIds = os.popen("""source /etc/profile; \                /usr/bin/mysql  -h199.199.199.199 -P6605 -umysqlUser -pmysqlPass -N -e "select room_id  \                from jellyfish_hadoop_stat.invite_anchor  \                where fans_count=0; \                " """).readlines();    roomId_list = []    for roomIdList in roomIds:        roomId = re.split('\t', roomIdList.replace('\n', ''))        roomId_list.append(roomId)    return roomId_listdef dataQueryUpdate2Mysql():    # 自带粉丝数据的计算    for roomId in getRoomId():        first_rec_dates=os.popen("""source /etc/profile; \                    /usr/lib/hive-current/bin/hive -e "select room_id roomid,switch_date from ( \                    select room_id,substr(switch_time,1,10) switch_date,cast(updated_time as timestamp)-cast(switch_time as timestamp) time_interval,row_number()over(partition by room_id order by switch_time) rk \                    from xxx_live_history_status \                    where room_id={roomId} and hour(cast(updated_time as timestamp)-cast(switch_time as timestamp))>=2 ) x \                    where rk=1; \                    " """.format(roomId=roomId[0])).readlines();        first_rec_date_list = []        for first_rec_dateList in first_rec_dates:            first_rec_date = re.split('\t', first_rec_dateList.replace('\n', ''))            first_rec_date_list.append(first_rec_date)        for first_rec_date in first_rec_date_list:            if first_rec_date[1] <= (datetime.date.today() - datetime.timedelta(days=8)).strftime('%Y-%m-%d'):                day10Start = get10DayRangeAndYesterday(first_rec_date=first_rec_date[1])[0]                day10End = get10DayRangeAndYesterday(first_rec_date=first_rec_date[1])[1]                yesterday = get10DayRangeAndYesterday(first_rec_date=first_rec_date[1])[2]                fans_byself_cnts=os.popen("""source /etc/profile; \                            /usr/lib/hive-current/bin/hive -e "with tab_user_frist_subscriber as (select room_id,uid view_uid,state,created_time \                            from (select room_id,uid,state,created_time,row_number()over(partition by uid order by created_time) rk from oss_room_subscriber_roomid where pt_day='{yesterday}') x \                            where rk=1 and room_id={roomId}), \                            tab_newuser10days as(select uid,nickname,created_time,last_login_time  \                            from oss_chushou_user_profile  \                            where pt_day='{yesterday}' and substr(created_time,1,10) between '{day10Start}' and 'day10End') \                            select a1.room_id,'{first_rec_date}' first_rec_date,count(distinct a2.uid) fans_byself_cnt \                            from tab_user_frist_subscriber a1 \                            inner join tab_newuser10days a2 on a1.view_uid=a2.uid \                            group by a1.room_id; \                            " """.format(day10Start=day10Start, day10End=day10End, yesterday=yesterday, roomId=roomId[0], first_rec_date=first_rec_date[1])).readlines();                fans_byself_cnt_list = []                for fans_byself_cntList in fans_byself_cnts:                    fans_byself_cnt = re.split('\t', fans_byself_cntList.replace('\n', ''))                    fans_byself_cnt_list.append(fans_byself_cnt)                for fans_byself_cnt in fans_byself_cnt_list:                    roomId=fans_byself_cnt[0]                    first_rec_date=fans_byself_cnt[1]                    fans_byself_cnt=fans_byself_cnt[2]                    os.system("""source /etc/profile; \                                /usr/bin/mysql  -h199.199.199.199 -P6605 -umysqlUser -pmysqlPass -e "update jellyfish_hadoop_stat.invite_anchor \                                set fans_count={fans_byself_cnt},first_rec_date='{first_rec_date}' \                                where room_id={roomId}; \                                " """.format(roomId=roomId, fans_byself_cnt=fans_byself_cnt, first_rec_date=first_rec_date));# Batch TestdataQueryUpdate2Mysql()

原创粉丝点击