利用hive完成阿里天池大数据音乐预测比赛数据处理工作

来源:互联网 发布:02795522网络诈骗电话 编辑:程序博客网 时间:2024/05/16 09:38

hive shell

创建外部表,并指定数据存放目录
-- Staging table for the songs CSV.
-- Rows are read as comma-separated text from /bs/music/songs/ in the column
-- order declared below. The table is EXTERNAL, so Hive only records metadata
-- for that directory.
-- The table COMMENT now describes this table (it was a leftover from the
-- Hive manual's page-view example).
CREATE EXTERNAL TABLE IF NOT EXISTS songs2 (
    sid string,
    aid string,
    ptime string,
    sinit int,
    language int,
    gender int)
COMMENT 'Staging table for the songs CSV (mars_tianchi_songs.csv)'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/bs/music/songs/';
导入HDFS文件(LOAD DATA INPATH 会把源文件移动到表目录下,原路径下的文件不再存在)
-- Load the songs CSV from the HDFS input directory into songs2, replacing
-- whatever data the table currently holds.
LOAD DATA
    INPATH '/bs/music/input/mars_tianchi_songs.csv'
    OVERWRITE INTO TABLE songs2;
查看前10条数据
-- Preview 10 rows of songs2 (no ORDER BY, so which rows come back is not fixed).
SELECT *
FROM songs2
LIMIT 10;
创建外表,指定目录
-- Staging table for the user actions CSV.
-- Rows are read as comma-separated text from /bs/music/useraction/ in the
-- column order declared below. The table is EXTERNAL, so Hive only records
-- metadata for that directory.
-- The table COMMENT now describes this table (it was a leftover from the
-- Hive manual's page-view example).
CREATE EXTERNAL TABLE IF NOT EXISTS useraction (
    uid string,
    sid string,
    btime string,
    atype int,
    ds string)
COMMENT 'Staging table for the user actions CSV (mars_tianchi_user_actions.csv)'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/bs/music/useraction/';
导入HDFS文件(LOAD DATA INPATH 会把源文件移动到表目录下,原路径下的文件不再存在)
-- Load the user actions CSV from the HDFS input directory into useraction,
-- replacing whatever data the table currently holds.
LOAD DATA
    INPATH '/bs/music/input/mars_tianchi_user_actions.csv'
    OVERWRITE INTO TABLE useraction;

-- Preview 10 rows of useraction (no ORDER BY, so which rows come back is not fixed).
SELECT *
FROM useraction
LIMIT 10;
表连接:小表写在前面(Hive 会缓存 JOIN 中靠前的表,并流式读取最后一张表)
-- Join songs (t1) with user actions (t2) on the song id and return every
-- column of both tables.
SELECT
    t1.*,
    t2.*
FROM songs2 t1
JOIN useraction t2
    ON t1.sid = t2.sid;
Map join 连接(把小表加载到内存,在 map 端完成连接)
-- Same join as a hinted map join, with songs2 as the in-memory side.
-- The hint names the alias t1 because songs2 is aliased in the FROM clause
-- and the original hint used the table name, which matches no alias here.
-- NOTE(review): Hive ignores MAPJOIN hints when hive.ignore.mapjoin.hint is
-- true - confirm the setting on the target cluster.
SELECT /*+ MAPJOIN(t1) */
    t1.*,
    t2.*
FROM songs2 t1
JOIN useraction t2
    ON t1.sid = t2.sid;
导出查询数据到hdfs 
-- Write the joined rows to the HDFS directory /bs/music/data, replacing its
-- current contents.
-- The hint names the alias t1 because songs2 is aliased in the FROM clause
-- and the original hint used the table name, which matches no alias here.
-- NOTE(review): no ROW FORMAT is given, so the files use Hive's default text
-- serialization (Ctrl-A field separator, per the Hive DML manual), not commas.
INSERT OVERWRITE DIRECTORY '/bs/music/data'
SELECT /*+ MAPJOIN(t1) */
    t1.*,
    t2.*
FROM songs2 t1
JOIN useraction t2
    ON t1.sid = t2.sid;
查询结果保存到表
-- External table over /bs/music/data/ that holds the songs/useraction join.
-- The first six columns mirror songs2 and the last five mirror useraction.
-- The second song id is named sid2 because sid is already taken.
CREATE EXTERNAL TABLE IF NOT EXISTS usersongs (
    sid     STRING,
    aid     STRING,
    ptime   STRING,
    sinit   INT,
    language INT,
    gender  INT,
    uid     STRING,
    sid2    STRING,
    btime   STRING,
    atype   INT,
    ds      STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION '/bs/music/data/';
-- Replace the contents of usersongs with the songs/useraction join.
-- Columns are listed explicitly, in the order of the usersongs definition,
-- so the insert does not depend on how t1.* and t2.* happen to expand.
-- t2.sid fills the sid2 column.
-- The hint names the alias t1 because songs2 is aliased in the FROM clause
-- and the original hint used the table name, which matches no alias here.
INSERT OVERWRITE TABLE usersongs
SELECT /*+ MAPJOIN(t1) */
    t1.sid,
    t1.aid,
    t1.ptime,
    t1.sinit,
    t1.language,
    t1.gender,
    t2.uid,
    t2.sid,
    t2.btime,
    t2.atype,
    t2.ds
FROM songs2 t1
JOIN useraction t2
    ON t1.sid = t2.sid;
查询结果保存到本地
# Append a 10-row sample of usersongs to a local CSV file.
# The Hive CLI separates result columns with tabs, so they are translated to
# commas to match the .csv extension. ">>" appends, so each run adds rows.
hive -e "select * from usersongs limit 10" | tr '\t' ',' >> /opt/tools/test.csv
查看HDFS上文件的前5行
# Print the first 5 lines of the HDFS file /bs/music/data/000000_0.
hadoop fs -text /bs/music/data/000000_0 | head -n 5
分组去重统计:按 aid 分组,统计每个 aid 下去重后的 uid 数量和 sid 数量
-- For each aid, count the distinct uid values and the distinct sid values
-- among the rows whose atype is 1.
SELECT
    COUNT(DISTINCT uid),
    COUNT(DISTINCT sid),
    aid
FROM usersongs
WHERE atype = 1
GROUP BY aid;

1 0
原创粉丝点击