9.16 hive基本命令与窗口函数

来源:互联网 发布:淘宝小二在哪里找 编辑:程序博客网 时间:2024/06/05 08:08

CREATE TABLE CITY(
province_code INT,
province_name string,
city_code INT,
city_name string
)

row FORMAT delimited
fields terminated by ‘,’
lines terminated by ‘\n’;

show create table city;
desc city;

load data local inpath ‘/home/bigdata/hiv/city.txt’ into table city;

select * from city limit 10
set hive.cli.print.header=true;

CREATE TABLE phone_daily(
model string,
os string,
brand string,
resolution string,
launch_date string
)
partitioned by (p_date string);

INSERT OVERWRITE TABLE phone_daily PARTITION (p_date ‘2017-09-13’);

select province_name,cnt from(
select province_name,count(city_name) as cnt
from city
group by province_name
)a
where cnt=1;
select province_name,count(city_name) as cnt from city group by province_name having cnt<2;

select province_name,count(city_name) as cnt from city group by province_name order by cnt desc limit 10

CREATE EXTERNAL TABLE city_ex(
province_code int,
province_name string,
city_code int,
city_name string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ‘,’
LINES TERMINATED BY ‘\n’
LOCATION ‘/user/tanqi/city’;

CREATE TABLE city_in(
province_code int,
province_name string,
city_code int,
city_name string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ‘,’
LINES TERMINATED BY ‘\n’
LOCATION ‘/user/tanqi/city’;

CREATE TABLE user(
uid INT,
city_code INT,
model string,
access string
)
row FORMAT delimited
fields terminated by ‘,’
lines terminated by ‘\n’;

load data local inpath ‘/home/bigdata/tanqi/hive/user.txt’ into table user;

create table user_daily(
uid int,
city_code int,
model string,
access string
)
partitioned by (p_date string);

INSERT OVERWRITE TABLE user_daily PARTITION (p_date=’2017-09-01’)
SELECT * FROM user; //插入的数据源从user里来 从user里先查出来后再插入

show partitioned user_daily;

set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE user_daily PARTITION (p_date)
SELECT *,’2017-09-02’ FROM user
UNION ALL
SELECT *,’2017-09-03’ FROM user
;

ALTER TABLE user_daily PARTITION (p_date=’2017-01-01’) RENAME TO PARTITION (p_date=’20170101’);

alter table user_daily drop partition(p_date=’20170901’);

select city_code,count(distinct uid) as sum from user where access=’WIFI’ group by city_code order by sum desc limit 5;

select city_code,access,count(uid)as cnt from user group by access,city_code having cnt>=3 order by cnt desc;

select access,count(uid) as cnt from user where lower(model) like”%ne%” group by access order by cnt desc;

select sum(if(access=’WIFI’,1,0))/count(1) from user;
如果access=wifi 则+1 否则0

case 表达式
when 值1 then 结果1
when 值2 then 结果2
else 结果3

select
case
when uid % 10 in (0, 1, 2, 3) then ‘0-3’
when uid % 10 in (4, 5, 6, 7) then ‘4-7’
else ‘8-9’
end as interval,
count(*) as cnt
from user
group by
case
when uid % 10 in (0, 1, 2, 3) then ‘0-3’
when uid % 10 in (4, 5, 6, 7) then ‘4-7’
else ‘8-9’
end;

select collect_set(access) from user;

[“WIFI”,”2G”,”3G”,”4G”] 所有取值的集合 自动去重

select collect_list(access) from user; 不去重的集合

select user.uid, user.city_code, city.city_name
from
(select * from user where uid <= 100) user
FULL join
(select * from city where province_code <= 30) city
on (user.city_code = city.city_code)
limit 200;

(right left inner full)

  A   JOIN   B

左 △ null
右 null △
内 A∩B
全 A∪B

set hive.cli.print.header=true;
set hive.cli.print.row.to.vertical=true;
set hive.cli.print.row.to.vertical.num=1; 不显示列名

分组top N 窗口函数
① 分组 例如 access
②排序 序号 city_code
③top 1

select access,city_code, uid
from
(
select uid, access, city_code,
row_number() over (partition by access order by city_code desc) as row_num
from user
) a
where row_num = 1;

row number() 返回结果集分区内行的序列号,每个分区的第一行从 1 开始。
over
over后的写法:
over(order by salary) 按照salary排序进行累计,order by是个默认的开窗函数
over(partition by deptno)按照部门分区

select p_date,
sum(cnt) over(order by p_date asc rows between unbounded preceding and current row)
from
(
select p_date, count(*) as cnt
from user_daily
where p_date between ‘2017-09-01’ and ‘2017-09-30’
group by p_date
)a
;

unbounded preceding无边界的第一行