Hive中distinct和Group by效率对比及处理方式

来源:互联网 发布:淘宝安能物流被禁用 编辑:程序博客网 时间:2024/06/06 09:11

注:该Hive语法仍可以做很多优化:建立临时表一次性从大表中取出数据;把where条件尽量放在一次判断中,这里仅针对distinct和group by讨论。

结论:

针对大量数据的去重,group by的效率要远高于distinct。

从distinct转化为group by

简单的转化这里不提,针对:一条语句里求总记录条数以及去重之后的记录条数
简单示例:
这里写图片描述

SELECTCOUNT(*) AS sum_cnt,COUNT(DISTINCT age ) AS user_cntFROMliu_t_02 tGROUP BY t.name;

结果
这里写图片描述

转化为group by 只需要在加一个group by,并采用sum和count的组合

SELECTSUM(age),COUNT(tt.age)FROM(SELECT t.name0 AS name0,COUNT(age) as ageFROMliu_t_02 tGROUP BY t.name0,t.age)ttGROUP BY tt.name0

测试时间对比

自己写的用于计算巨量数据的distinct的脚本,在shell中用时13分钟

select    rst.flag                  AS flag   ,rst.source                AS source   ,rst.template              AS template   ,count(*)      AS click_cnt   ,count(distinct rst.imei)           AS click_user   ,rst.prod_name             AS prod_namefrom(    SELECT        ttt.flag                   AS flag       ,ttt.source                 AS source       ,ttt.template               AS template       ,tttt.prod_name             AS prod_name       ,tttt.imei                  AS imei    FROM    (        SELECT            t2.flag                   AS flag           ,t2.source                 AS source           ,t2.template               AS template           ,imei        FROM        (            SELECT                t1.pt_d                ,app_ver               ,t1.cardId               ,t1.template               ,t1.source               ,1                         AS flag               ,imei                      AS imei            FROM            (                SELECT                     t.imei                   ,t.app_ver                                                                AS app_ver                   ,get_json_object(t.content,'$.cardId')                                    AS cardId                   ,get_json_object(t.content,'$.template')                                  AS template                   ,IF(t.content rlike 'content',get_json_object(t.content,'$.content'),'')  AS source                   ,t.pt_d                                                                   AS pt_d                FROM                (                    SELECT                         imei                       ,report_evt_content      AS content                       ,app_ver                       ,pt_d                    FROM bicoredata.dwd_evt_bdreporter_app_oper_info_report_dm                    WHERE pt_d='20170722'                        AND pt_service='hiboard'                         AND package_name='com.huawei.hiboard'                        AND app_ver rlike '5*'                          AND report_evt_id='65539'                )t            )t1            WHERE t1.app_ver>='5.0.1.312'            AND cardid='4'        )t2        UNION ALL        SELECT            tt3.flag           AS flag           ,tt3.recreason      AS source           ,tt3.template       AS template           ,imei        FROM        (            SELECT                tt1.flag                            AS flag               ,tt1.recreason                       AS recreason               ,tt1.template                        AS template               ,tt1.pt_d                            AS pt_d               ,tt1.imei                            AS imei            FROM             (                SELECT                    tt.re_time                   ,tt.times                   ,tt.imei                   ,tt.version                   ,tt.phonetype                   ,get_json_object(a.apps,'$.template')                                  AS template                   ,get_json_object(a.apps,'$.cardId')                                    AS cardId                   ,IF(a.apps rlike 'recReason',get_json_object(a.apps,'$.recReason'),'') AS recreason                   ,2                                                                     AS flag                   ,tt.pt_d                                                               AS pt_d                FROM                (                    SELECT                        split(message,'\\|')[0]                               AS re_time                       ,split(message,'\\|')[1]                               AS times                       ,bicoredata.HiboardAesDecrypt(split(message,'\\|')[2]) AS imei                       ,split(message,'\\|')[3]                               AS cardId                       ,split(message,'\\|')[4]                               AS version                       ,split(message,'\\|')[5]                               AS phonetype                       ,split(message,'\\|')[6]                               AS showlist                       ,pt_d                    FROM                    (                        SELECT                            get_json_object(meassage,'$.message')      AS message                           ,get_json_object(meassage,'$.\\\\@logType') AS logType                           ,get_json_object(meassage,'$.\\@hostAddr')  AS hostAddr                           ,get_json_object(meassage,'$.\\@hostName')  AS hostName                           ,pt_d                        FROM biads.ads_rcm_hiboard_server_shows_operlog_dm                          WHERE pt_d='20170722'                    )tt0                )tt                LATERAL VIEW EXPLODE(split(regexp_replace(SUBSTR(tt.showlist,2,LENGTH(tt.showlist)-2),'\\},\\{','\\}#\\{'),'#')) a  AS apps            )tt1            JOIN            (                SELECT                    imei                FROM bicoredata.dwd_evt_bdreporter_app_oper_info_report_dm                WHERE pt_d='20170722'                    AND pt_service='hiboard'                    AND package_name='com.huawei.hiboard'                    AND app_ver rlike '5*'                    AND report_evt_id='65537'                    AND rec_time rlike '2017-07-22'            )tt2            ON tt1.imei = tt2.imei            WHERE tt1.cardid='4'                 AND tt1.version>='5.0.1.312'         )tt3    )ttt    LEFT OUTER JOIN    (        SELECT             UPPER(prod_name) AS prod_name           ,imei        FROM dwd_eqp_device_ds_his    )tttt    ON ttt.imei = tttt.imei)rstgroup by rst.flag,        rst.source,        rst.template,        rst.prod_name limit 50;

修改为group by,运算结果相同,用时9分钟,效率提升30%。

select    res.flag                  AS flag   ,res.source                AS source   ,res.template              AS template   ,SUM(res.click_user)       AS click_cnt   ,count(res.click_user)     AS click_user   ,res.prod_name             AS prod_namefrom(select    rst.flag                  AS flag   ,rst.source                AS source   ,rst.template              AS template   ,count(rst.imei)           AS click_user   ,rst.prod_name             AS prod_namefrom(    SELECT        ttt.flag                   AS flag       ,ttt.source                 AS source       ,ttt.template               AS template       ,tttt.prod_name             AS prod_name       ,tttt.imei                  AS imei    FROM    (        SELECT            t2.flag                   AS flag           ,t2.source                 AS source           ,t2.template               AS template           ,imei        FROM        (            SELECT                t1.pt_d                ,app_ver               ,t1.cardId               ,t1.template               ,t1.source               ,1                         AS flag               ,imei                      AS imei            FROM            (                SELECT                     t.imei                   ,t.app_ver                                                                AS app_ver                   ,get_json_object(t.content,'$.cardId')                                    AS cardId                   ,get_json_object(t.content,'$.template')                                  AS template                   ,IF(t.content rlike 'content',get_json_object(t.content,'$.content'),'')  AS source                   ,t.pt_d                                                                   AS pt_d                FROM                (                    SELECT                         imei                       ,report_evt_content      AS content                       ,app_ver                       ,pt_d                    FROM bicoredata.dwd_evt_bdreporter_app_oper_info_report_dm                    WHERE pt_d='20170722'                        AND pt_service='hiboard'                         AND package_name='com.huawei.hiboard'                        AND app_ver rlike '5*'                          AND report_evt_id='65539'                )t            )t1            WHERE t1.app_ver>='5.0.1.312'            AND cardid='4'        )t2        UNION ALL        SELECT            tt3.flag           AS flag           ,tt3.recreason      AS source           ,tt3.template       AS template           ,imei        FROM        (            SELECT                tt1.flag                            AS flag               ,tt1.recreason                       AS recreason               ,tt1.template                        AS template               ,tt1.pt_d                            AS pt_d               ,tt1.imei                            AS imei            FROM             (                SELECT                    tt.re_time                   ,tt.times                   ,tt.imei                   ,tt.version                   ,tt.phonetype                   ,get_json_object(a.apps,'$.template')                                  AS template                   ,get_json_object(a.apps,'$.cardId')                                    AS cardId                   ,IF(a.apps rlike 'recReason',get_json_object(a.apps,'$.recReason'),'') AS recreason                   ,2                                                                     AS flag                   ,tt.pt_d                                                               AS pt_d                FROM                (                    SELECT                        split(message,'\\|')[0]                               AS re_time                       ,split(message,'\\|')[1]                               AS times                       ,bicoredata.HiboardAesDecrypt(split(message,'\\|')[2]) AS imei                       ,split(message,'\\|')[3]                               AS cardId                       ,split(message,'\\|')[4]                               AS version                       ,split(message,'\\|')[5]                               AS phonetype                       ,split(message,'\\|')[6]                               AS showlist                       ,pt_d                    FROM                    (                        SELECT                            get_json_object(meassage,'$.message')      AS message                           ,get_json_object(meassage,'$.\\\\@logType') AS logType                           ,get_json_object(meassage,'$.\\@hostAddr')  AS hostAddr                           ,get_json_object(meassage,'$.\\@hostName')  AS hostName                           ,pt_d                        FROM biads.ads_rcm_hiboard_server_shows_operlog_dm                          WHERE pt_d='20170722'                    )tt0                )tt                LATERAL VIEW EXPLODE(split(regexp_replace(SUBSTR(tt.showlist,2,LENGTH(tt.showlist)-2),'\\},\\{','\\}#\\{'),'#')) a  AS apps            )tt1            JOIN            (                SELECT                    imei                FROM bicoredata.dwd_evt_bdreporter_app_oper_info_report_dm                WHERE pt_d='20170722'                    AND pt_service='hiboard'                    AND package_name='com.huawei.hiboard'                    AND app_ver rlike '5*'                    AND report_evt_id='65537'                    AND rec_time rlike '2017-07-22'            )tt2            ON tt1.imei = tt2.imei            WHERE tt1.cardid='4'                 AND tt1.version>='5.0.1.312'         )tt3    )ttt    LEFT OUTER JOIN    (        SELECT             UPPER(prod_name) AS prod_name           ,imei        FROM dwd_eqp_device_ds_his    )tttt    ON ttt.imei = tttt.imei)rstgroup by rst.flag,        rst.source,        rst.template,        rst.prod_name,        rst.imei)resgroup by res.flag,        res.source,        res.template,        res.prod_name   limit 50;
原创粉丝点击