hive按照文件夹分区

来源:互联网 发布:能耗 分析软件 免费 编辑:程序博客网 时间:2024/06/11 23:42

先创建了一张表按照tradeDate分区的外部表

DROP TABLE IF EXISTS t_hft;CREATE EXTERNAL TABLE t_hft(SecurityID STRING,tradeTime STRING,PreClosePx DOUBLE) PARTITIONED BY (tradeDate INT)ROW FORMAT DELIMITED FIELDS TERMINATED BY ','LOCATION'/user/hive/warehouse/test.db/endTest/';


然后加入2个分区并引入location


ALTER TABLE t_hft ADD PARTITION (tradeDate = 1) LOCATION '/user/hive/warehouse/test.db/endTest/ttest1';ALTER TABLE t_hft ADD PARTITION (tradeDate = 2) LOCATION '/user/hive/warehouse/test.db/endTest/ttest2';


备注:hive不支持下划线类似_test.csv文件导入,表中字段如果有中文需要加入``反引号,例如`tradeTime` STRING


hive中ods是操作数据层,是最底层】

hive查看表信息:

desc extend test_table;

删除所有行:

truncate table test_table;


分桶作用:

1.数据sampling(采样)
2.提升某些查询操作效率,列如Map Side join

create table tb_user (
id int ,name string,age int
) partitioned by (timeflag bigint)
 clustered by (id) 
sorted by (age asc)
 into 4 buckets  
row format delimited fields 
terminated by ',';

show databases;
show tables;
desc test;


-------------
分桶表示例:


#创建分桶表
drop table stu_buck;
create table stu_buck(Sno int,Sname string,Sex string,Sage int,Sdept string)
clustered by(Sno) 
sorted by(Sno DESC)
into 4 buckets
row format delimited
fields terminated by ',';




#设置变量,设置分桶为true, 设置reduce数量是分桶的数量个数
set hive.enforce.bucketing = true;
set mapreduce.job.reduces=4;




insert overwrite table student_buck
select * from student cluster by(Sno) sort by(Sage);  报错,cluster 和 sort 不能共存




#开会往创建的分通表插入数据(插入数据需要是已分桶, 且排序的)
#可以使用distribute by(sno) sort by(sno asc)   或是排序和分桶的字段相同的时候使用Cluster by(字段)
#注意使用cluster by  就等同于分桶+排序(sort)
insert into table stu_buck
select Sno,Sname,Sex,Sage,Sdept from student distribute by(Sno) sort by(Sno asc);


insert overwrite table stu_buck
select * from student distribute by(Sno) sort by(Sno asc);


insert overwrite table stu_buck
select * from student cluster by(Sno);


------------------------
保存select查询结果的几种方式:
1、将查询结果保存到一张新的hive表中
create table t_tmp
as
select * from t_p;


2、将查询结果保存到一张已经存在的hive表中
insert into  table t_tmp
select * from t_p;


3、将查询结果保存到指定的文件目录(可以是本地,也可以是hdfs)
insert overwrite local directory '/home/hadoop/test'
select * from t_p;




insert overwrite directory '/aaa/test'
select * from t_p;


-----------------------------------
关于hive中的各种join


准备数据
1,a
2,b
3,c
4,d
7,y
8,u


2,bb
3,cc
7,yy
9,pp






建表:
create table a(id int,name string)
row format delimited fields terminated by ',';


create table b(id int,name string)
row format delimited fields terminated by ',';


导入数据:
load data local inpath '/home/hadoop/a.txt' into table a;
load data local inpath '/home/hadoop/b.txt' into table b;




实验:
** inner join
select * from a inner join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 7     | y       | 7     | yy      |
+-------+---------+-------+---------+--+










**left join
select * from a left join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | a       | NULL  | NULL    |
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 4     | d       | NULL  | NULL    |
| 7     | y       | 7     | yy      |
| 8     | u       | NULL  | NULL    |
+-------+---------+-------+---------+--+










**right join
select * from a right join b on a.id=b.id;










**
select * from a full outer join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | a       | NULL  | NULL    |
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 4     | d       | NULL  | NULL    |
| 7     | y       | 7     | yy      |
| 8     | u       | NULL  | NULL    |
| NULL  | NULL    | 9     | pp      |
+-------+---------+-------+---------+--+




**
select * from a left semi join b on a.id = b.id;
+-------+---------+--+
| a.id  | a.name  |
+-------+---------+--+
| 2     | b       |
| 3     | c       |
| 7     | y       |
+-------+---------+--+




-------------
多重插入:


from student
insert into table student_p partition(part='a')
select * where Sno<95011;
insert into table student_p partition(part='a')
select * where Sno<95011;




--------------
导出数据到本地:


insert overwrite local directory '/home/hadoop/student.txt'
select * from student;


-------------
UDF案例:
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;


drop table if exists t_rating;
create table t_rating(movieid string,rate int,timestring string,uid string)
row format delimited fields terminated by '\t';


insert overwrite table t_rating
select split(parsejson(line),'\t')[0]as movieid,split(parsejson(line),'\t')[1] as rate,split(parsejson(line),'\t')[2] as timestring,split(parsejson(line),'\t')[3] as uid from rat_json limit 10;




-------
内置jason函数
select get_json_object(line,'$.movie') as moive,get_json_object(line,'$.rate') as rate  from rat_json limit 10;




-----------
transform案例:


1、先加载rating.json文件到hive的一个原始表 rat_json
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;


2、需要解析json数据成四个字段,插入一张新的表 t_rating
insert overwrite table t_rating
select get_json_object(line,'$.movie') as moive,get_json_object(line,'$.rate') as rate  from rat_json;


3、使用transform+python的方式去转换unixtime为weekday
先编辑一个python脚本文件
########python######代码
vi weekday_mapper.py
#!/bin/python
import sys
import datetime


for line in sys.stdin:
  line = line.strip()
  movieid, rating, unixtime,userid = line.split('\t')
  weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
  print '\t'.join([movieid, rating, str(weekday),userid])


保存文件
然后,将文件加入hive的classpath:
hive>add FILE /home/hadoop/weekday_mapper.py;
hive>create TABLE u_data_new as
SELECT
  TRANSFORM (movieid, rate, timestring,uid)
  USING 'python weekday_mapper.py'
  AS (movieid, rate, weekday,uid)
FROM t_rating;


select distinct(weekday) from u_data_new limit 10;


0 0