9.16号HIVE教学

来源:互联网 发布:二次元轻小说软件 编辑:程序博客网 时间:2024/04/29 19:17

上午
谭奇hive教学:

进入:
ssh bigdata@47.94.18.202
密码:bigdata2017
进入003:ssh bigdata003
alias
进入MySQL:mysql -u hive -phive2017 -D hive

查询表:show tables;
select * from TBLS;
select * from TBLS\G

返回bigdata001:
先退出 MySQL:输入 exit(或按 Ctrl+D;注意 Ctrl+Z 只是把进程挂起,并没有真正退出)
ssh bigdata001
进入hive:hive
创建一个自己的库:create database nwj; use nwj; show tables;

在notepad++上建表:
-- City dimension table: one row per (province, city) pair, loaded from a
-- comma-delimited text file.
-- Fix: the curly quotes (‘…’) in the original notes are a word-processor
-- paste artifact and are rejected by the Hive parser; use ASCII quotes.
CREATE TABLE city(
  province_code INT,
  province_name STRING,
  city_code INT,
  city_name STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';

删除表:
drop table city;

导入表:load data local inpath '/home/bigdata/hive/city.txt' into table city;
导入后可以查找:select * from city;
限制查询行数:select * from city limit 10;

which hive;
进入hive路径
cd /usr/../hive
cd conf
进入hive:hive
建库:create database nwj;
查看库:show databases;
进入库:use nwj;
建表:
-- City dimension table (same schema as above), plus the load step.
-- Fix: replaced the curly quotes from the original notes with ASCII quotes;
-- Hive rejects ‘…’ literals with a parse error.
CREATE TABLE city(
  province_code INT,
  province_name STRING,
  city_code INT,
  city_name STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';
-- LOCAL: the file is read from the client machine's filesystem and copied
-- into the table's warehouse directory.
LOAD DATA LOCAL INPATH '/home/bigdata/hive/city.txt' INTO TABLE city;

-- Daily user snapshot, partitioned by date string p_date ('YYYY-MM-DD').
-- Fix: curly quotes replaced with ASCII quotes so Hive can parse the
-- partition literal.
CREATE TABLE user_daily(
  uid INT,
  city_code INT,
  model STRING,
  access STRING
)
PARTITIONED BY (p_date STRING);

-- Static-partition insert: the SELECT must yield exactly the non-partition
-- columns of user_daily; user's columns (uid, city_code, model, access)
-- line up by position.
INSERT OVERWRITE TABLE user_daily PARTITION (p_date='2017-09-01')
SELECT * FROM user;

-- EXTERNAL table: Hive only records metadata; DROP TABLE city_ex leaves the
-- files under LOCATION untouched.
-- Fix: ASCII quotes instead of the curly quotes in the original notes.
CREATE EXTERNAL TABLE city_ex(
  province_code INT,
  province_name STRING,
  city_code INT,
  city_name STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
LOCATION '/user/hdfs/tanqi/city';

-- Managed (internal) table pointed at the SAME directory, for contrast with
-- city_ex. NOTE(review): dropping city_in deletes the shared data files,
-- which also empties city_ex — deliberate here as a teaching example.
CREATE TABLE city_in(
  province_code INT,
  province_name STRING,
  city_code INT,
  city_name STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
LOCATION '/user/hdfs/tanqi/city';

-- Raw user fact table loaded from a comma-delimited text file.
-- NOTE(review): `user` is a reserved word in many SQL dialects; Hive accepts
-- it as a table name, but backticks would be safer in portable code.
-- Fix: ASCII quotes instead of curly quotes.
CREATE TABLE user(
  uid INT,
  city_code INT,
  model STRING,
  access STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';
load数据到hive表:load data local inpath '/home/bigdata/tanqi/hive/user.txt' into table user;

-- Static-partition overwrite of a single date.
-- Fix: curly quotes replaced with ASCII quotes throughout this section.
INSERT OVERWRITE TABLE user_daily PARTITION (p_date='2017-09-01')
SELECT * FROM user;

-- Dynamic partitioning: the LAST column of each SELECT supplies p_date.
-- nonstrict mode is required because no partition column is given statically.
-- UNION ALL (not UNION) is correct here: the two branches are distinct dates,
-- so deduplication would be wasted work.
SET hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE user_daily PARTITION (p_date)
SELECT *, '2017-09-02' FROM user
UNION ALL
SELECT *, '2017-09-03' FROM user;

-- Rename an existing partition value in place (metadata-only operation).
ALTER TABLE user_daily PARTITION (p_date='2017-01-01') RENAME TO PARTITION (p_date='20170101');

查看表语句:show create table city;
查表内容:select * from city limit 10;
显示列名:set hive.cli.print.header=true;

修改表名:alter table city rename to city_new;
复制表:create table city like city_new;

删除库:drop database tq;

删除表:drop table city;

以下例子:

作业统计:
1.创建user表,并将文件数据导入user表(上面)
-- Cities ranked by number of WIFI users, top 10.
-- Fixes: ASCII quotes for the 'WIFI' literal, and LIMIT requires a row
-- count — the bare `limit;` in the original notes is a syntax error.
SELECT city_code, COUNT(1) AS cnt
FROM user
WHERE access = 'WIFI'
GROUP BY city_code
ORDER BY cnt DESC
LIMIT 10;

-- (access, city) combinations observed at least 3 times; HAVING filters the
-- aggregated groups (WHERE would filter rows before grouping).
SELECT access, city_code, COUNT(1) AS cnt
FROM user
GROUP BY access, city_code
HAVING cnt >= 3
ORDER BY cnt DESC;

-- Access-type distribution among models containing "ne", case-insensitive
-- (LOWER on the column; curly double quotes replaced with ASCII quotes).
SELECT access, COUNT(1) AS cnt
FROM user
WHERE LOWER(model) LIKE '%ne%'
GROUP BY access
ORDER BY cnt DESC;

晚上:
在hive上运行:show tables;
-- Quick sanity checks on the user table.
-- Fix: `//` is not a comment marker in HiveQL (use --), and the curly
-- quotes around 'WIFI'/'2G' would fail to parse.
SELECT * FROM user LIMIT 5;        -- first five rows
SELECT COUNT(1) FROM user;         -- total row count
-- Share of rows whose access is WIFI (0.0 .. 1.0).
SELECT SUM(IF(access='WIFI',1,0))/COUNT(1) FROM user;
-- Share of rows whose access is 2G.
SELECT SUM(IF(access='2G',1,0))/COUNT(1) FROM user;

分支判断分类语句:
-- Bucket users by the last digit of uid and count each bucket.
-- Fixes: ASCII quotes instead of curly quotes, and the alias `interval` is
-- backticked — INTERVAL is a reserved keyword in Hive 1.2+ and fails to
-- parse as a bare identifier.
SELECT
  CASE
    WHEN uid % 10 IN (0, 1, 2, 3) THEN '0-3'
    WHEN uid % 10 IN (4, 5, 6, 7) THEN '4-7'
    ELSE '8-9'
  END AS `interval`,
  COUNT(*) AS cnt
FROM user
GROUP BY
  -- Hive requires repeating the full expression here; it cannot GROUP BY a
  -- SELECT-list alias.
  CASE
    WHEN uid % 10 IN (0, 1, 2, 3) THEN '0-3'
    WHEN uid % 10 IN (4, 5, 6, 7) THEN '4-7'
    ELSE '8-9'
  END;

统计access的去重集合:
select collect_set(access) from user;
统计access的不去重集合(数组):
select collect_list(access) from user;

关联俩个表:
进行左连接集合前20行:
-- First 20 rows of users with uid <= 100, left-joined to city names
-- restricted to provinces with code <= 30; users whose city is filtered out
-- keep a NULL city_name.
SELECT
  u.uid,
  u.city_code,
  c.city_name
FROM (SELECT * FROM user WHERE uid <= 100) u
LEFT JOIN (SELECT * FROM city WHERE province_code <= 30) c
  ON u.city_code = c.city_code
LIMIT 20;
//右连接right,内连接inner(交集) 全连接full(并集),

窗口函数:
分组排序
-- For each access type, return the single row holding the largest city_code:
-- row_number() numbers the rows inside each access partition in descending
-- city_code order, and the outer filter keeps only rank 1.
select access,city_code,uid
from
(
select uid,access,city_code,
-- rank within each access group, highest city_code first; ties are broken
-- arbitrarily because the ORDER BY key is not unique
row_number() over (partition by access order by city_code desc) as row_num
from user
) a
where row_num = 1;

累计和
-- Cumulative (running) total of daily row counts across September 2017.
-- ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW makes the sum cumulative
-- per row (the default RANGE frame would merge ties).
-- Fixes: ASCII quotes for the date literals, and an explicit alias on the
-- window column instead of an auto-generated name.
SELECT
  p_date,
  SUM(cnt) OVER (ORDER BY p_date ASC
                 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_cnt
FROM
(
  -- one row per day with that day's record count
  SELECT p_date, COUNT(*) AS cnt
  FROM user_daily
  WHERE p_date BETWEEN '2017-09-01' AND '2017-09-30'
  GROUP BY p_date
) a;