Hive的HQL操作

来源:互联网 发布:中华民族伟大复兴 知乎 编辑:程序博客网 时间:2024/04/29 00:13

Hive的简单使用

  1. 创建表语句
    create table city(province_code int,
    province_name string,
    city_code int,
    city_name string
    )
    row FORMAT delimited
    fields terminated by ','
    lines terminated by '\n';

    如果需要将本地文件加载到Hive:
    load data local inpath '/home/bigdata/hive/city.txt' into table city;

  2. 基本的查询语句
    select * from city order by city_code desc limit 10;
    select province_name,count(city_name) as cnt from city group by province_name order by cnt desc limit 10; //统计城市最多的省份
    select city_code,count(2) as cnt from user where access='WIFI' group by city_code order by cnt desc limit 5; //统计wifi环境下用户最多的5个城市
    select access,city_code,count(uid) as cnti from user group by access,city_code having cnti>2 order by cnti desc; //统计用户数>=3的access和city_code组合
    select access,count(1) as cnts from user where lower(model) like '%ne%' group by access order by cnts desc; //机型中带有ne(不区分大小写),不同access及对应用户数
    select count(distinct province_name) from city; //统计省份总数
    select count(1)
    from
    (
    select province_name from city group by province_name
    ) a; //统计省份总数

    Hive的复杂使用

    INSERT OVERWRITE TABLE user_daily PARTITION (p_data='2017- 01-01')
    SELECT * FROM user; //插入分区
    show partitions user_daily; //显示分区
    set hive.exec.dynamic.partition.mode=nonstrict;
    INSERT OVERWRITE TABLE user_daily PARTITION (p_data)
    SELECT *,'2017-09-02' FROM user
    UNION ALL
    SELECT *,'2017-09-03' FROM user
    ; //动态插入分区
    ALTER TABLE user_daily PARTITION
    (p_data='2017-01-01') RENAME TO PARTITION
    (p_data='20170101'); //更改分区名字
    ALTER TABLE user_daily DROP PARTITION(p_data='20170101'); //删除
    select sum(if(access='WIFI',1,0))/count(1) from user; //使用WIFI的占比
    //统计尾号在0-3 4-7 8-9之间的id
    select
    case
    when uid % 10 in (0,1,2,3) then '0-3'
    when uid % 10 in (4,5,6,7) then '4-7'
    else '8-9'
    end as interval,
    count(*) as cnt
    from user
    group by
    case
    when uid % 10 in (0, 1, 2, 3) then '0-3'
    when uid % 10 in (4, 5, 6, 7) then '4-7'
    else '8-9'
    end;
    //去重
    select collect_set(access) from user;
    //不去重
    select collect_list(access) from user;
    //右边的表和左边表相同的字段(以左边的表为准)
    select user.uid, user.city_code, city.city_name
    from
    (select * from user where uid <= 100) user
    left join
    (select * from city where province_code <= 30) city
    on (user.city_code = city.city_code)
    limit 20;
    //(以右边的表为准)
    select user.uid, user.city_code, city.city_name
    from
    (select * from user where uid <= 100) user
    right join
    (select * from city where province_code <= 30) city
    on (user.city_code = city.city_code)
    limit 20;
    //内链接
    select user.uid, user.city_code, city.city_name
    from
    (select * from user where uid <= 100) user
    inner join
    (select * from city where province_code <= 30) city
    on (user.city_code = city.city_code)
    limit 20;
    //全连接
    select user.uid, user.city_code, city.city_name
    from
    (select * from user where uid <= 100) user
    full join
    (select * from city where province_code <= 30) city
    on (user.city_code = city.city_code)
    limit 20;
    //分组topN
    select access,city_code, uid
    from
    (
    select uid,access,city_code,
    row_number() over (partition by access order by city_code desc) as row_num
    from user
    )a
    where row_num=1;
    //累计
    select p_date,
    sum(cnt) over(order by p_date asc rows between unbounded preceding and current row)
    from
    (
    select p_date, count(*) as cnt
    from user_daily
    where p_date between '2017-09-01' and '2017-09-30'
    group by p_date
    )a
    ;
    //累计
    select p_data,
    sum(cnt) over(order by p_data asc rows between unbounded preceding and current row)
    from
    (
    select p_data,count(*) as cnt
    from user_daily
    where p_data between '2017-09-01' and '2017-09-30'
    group by p_data
    )a
    ;

原创粉丝点击