Advanced Hive


Two cases in which Hive skips MapReduce entirely (see the sketch below):

1. where ds > ' '   // ds is a partition column, so the filter is resolved by partition pruning alone

2. select * from table   // no filter or projection at all; Hive simply reads the files back
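
A minimal sketch of both cases, assuming the partitioned sal table created in section 1 below (on newer Hive versions this fetch-only behavior is also governed by hive.fetch.task.conversion):

-- partition-column-only filter: answered by partition pruning, no MR job
select * from sal where city > ' ';

-- bare select * with no predicates: a plain file read, no MR job
select * from sal;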


1. Creating a table

CREATE TABLE sal (
  id INT,
  name STRING,
  salary INT
)
PARTITIONED BY (city STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;




1.1 Modifying a table and its columns

# Drop every column except id and name (REPLACE rewrites the whole column list)
alter table sal replace columns (id int, name string);

# Add a column
alter table sal add columns (remark string);

# Rename or retype a column
ALTER TABLE table_name CHANGE col_old_name col_new_name column_type;
ALTER TABLE sal CHANGE remark city string;
(Note: on the sal table above this last rename would collide with the partition column city; in practice the new name must not clash with an existing partition column.)
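
After each ALTER it is worth confirming the result; describe shows the current schema, and describe formatted (on reasonably recent Hive versions) adds storage and partition details:

describe sal;
describe formatted sal;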


2. Loading data

# sal is partitioned, so LOAD DATA needs an explicit partition spec
load data local inpath '/home/hadoop/in/mytable' overwrite into table sal partition (city='none');

Contents of /home/hadoop/in/mytable (tab-separated; the trailing column is ignored because sal has only three non-partition columns):

1	zuansun	3000	none
2	zuansu2	4000	none
3	zuansu3	3000	none
4	zuansu4	4000	none
5	zuansu5	3000	none
6	zuansu6	4000	none
7	zuansu7	3000	none
8	zuansu8	4000	none
9	zuansu9	10000	none
10	zuansu10	20000	none
11	zuansu11	15000	none
12	zuansu12	25000	none


3. Nested queries

from (select * from sal) e
select e.id, e.name, e.salary
where e.salary > 3000;

# case when
select id, name,
  case
    when salary < 10000 then 'low'
    when salary >= 10000 and salary < 20000 then 'lower-middle'
    when salary >= 20000 and salary < 50000 then 'well-off'
    else 'alien'
  end as salarylevel
from sal;
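
The FROM-first form above is just Hive's alternate syntax; the same filter can be written as an ordinary subquery in the FROM clause (the alias e is mandatory):

select e.id, e.name, e.salary
from (select * from sal) e
where e.salary > 3000;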


4. group by

select city, sum(salary) from sal group by city;
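
To filter on the aggregate itself, add a HAVING clause; a small sketch (the threshold is illustrative):

select city, sum(salary) as total_salary
from sal
group by city
having sum(salary) > 10000;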


5. Dynamic partitions

5.1 Create a staging table

CREATE TABLE sal_tmp (
  id INT,
  name STRING,
  salary INT,
  city STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;


5.2 Load the data into the staging table

load data local inpath '/home/hadoop/in/mytable' overwrite into table sal_tmp;


5.3 Configuration for the operation

set hive.exec.dynamic.partition=true;                -- allow dynamic partitions
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=50000;
set hive.exec.max.dynamic.partitions=50000;
set hive.exec.max.created.files=500000;
set mapred.reduce.tasks=20000;                       -- default number of reduce tasks per job
set hive.merge.mapfiles=true;                        -- merge small files when a map-only job finishes




5.4 Appendix: partition-related parameters

hive.exec.dynamic.partition (default false): set to true to enable dynamic partitioning
hive.exec.dynamic.partition.mode (default strict): the dynamic-partition mode; nonstrict lets every partition column be dynamic, while strict requires at least one static partition
hive.exec.max.dynamic.partitions.pernode (default 100): maximum number of dynamic partitions a single mapper or reducer node may create; exceeding it raises an error
hive.exec.max.dynamic.partitions (default 1000): maximum total number of dynamic partitions one DML statement may create
hive.exec.max.created.files (default 100000): maximum number of files all the mappers and reducers of a MapReduce job may create




5.5 Insert with dynamic partitioning
insert into table sal partition (city) select * from sal_tmp;
(The dynamic partition column must come last in the select list; sal_tmp was declared with city as its final column, so select * lines up correctly.)
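
For contrast, a sketch of the static form that strict mode would accept; the partition column is named in the PARTITION clause and dropped from the select list (the value 'none' is just the one loaded earlier):

insert into table sal partition (city='none')
select id, name, salary from sal_tmp where city = 'none';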


6. Joins

# Create the table
create table a (id int, gender string)
row format delimited fields terminated by '\t'
stored as textfile;

# Load the data
load data local inpath '/home/hadoop/in/a' overwrite into table a;

# Inner join
select sal.id, sal.name, sal.salary, sal.city, a.gender
from sal join a on (sal.id = a.id);

# Left outer join
select sal.id, sal.name, sal.salary, sal.city, a.gender
from sal left outer join a on (sal.id = a.id);
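
When one side of the join is small, as table a is here, a map-side join avoids the shuffle entirely. The classic MAPJOIN hint requests it explicitly (newer versions can also do this automatically when hive.auto.convert.join=true):

select /*+ MAPJOIN(a) */ sal.id, sal.name, sal.salary, sal.city, a.gender
from sal join a on (sal.id = a.id);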


7. Creating an index

create index a_index on table a (id)
as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
with deferred rebuild;
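
WITH DEFERRED REBUILD means the index is created empty and has no effect until it is rebuilt explicitly:

alter index a_index on a rebuild;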


8. Buckets

# Staging table
create table tb_tmp (id int, age int, name string, timeflag bigint)
row format delimited fields terminated by ',';

# Bucketed table with 4 buckets
create table tb_stu (id int, age int, name string, timeflag bigint)
clustered by (id) sorted by (age) into 4 buckets
row format delimited fields terminated by ',';

# Load the data into the staging table
load data local inpath '/home/hadoop/in/tb_tmp' overwrite into table tb_tmp;

Contents of /home/hadoop/in/tb_tmp:

1,20,zxm,20140330
2,21,ljz,20140330
3,19,cds,20140330
4,18,mac,20140330
5,22,android,20140330
6,23,symbian,20140330
7,25,wp,20140330
8,20,cxd,20140330
9,21,fvd,20140330
10,19,cvb,20140330
11,18,erd,20140330
12,22,nid,20140330
13,23,fvd,20140330
14,19,cvb,20140330
15,18,e33,20140330
16,22,nid,20140330

# Enforce bucketing on insert
set hive.enforce.bucketing = true;

# Insert into the bucketed table tb_stu
insert into table tb_stu select * from tb_tmp;

# Sampling
select * from tb_stu tablesample(bucket 1 out of 4 on id);

Note: tablesample is Hive's sampling clause. Its syntax is TABLESAMPLE(BUCKET x OUT OF y), which is roughly equivalent to:

SELECT * FROM numbersflat WHERE number % y = x - 1;
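
Because tb_stu has 4 buckets on id, the y in BUCKET x OUT OF y sets the sampled fraction: y = 4 reads one bucket (about a quarter of the rows), and y = 2 reads two of the four. A quick sketch:

-- roughly half the table: buckets 1 and 3 out of the 4
select * from tb_stu tablesample(bucket 1 out of 2 on id);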


9. RCFile

# RCFile-format table
create table tb_rc (id int, age int, name string, timeflag bigint)
row format delimited fields terminated by ','
stored as rcfile;

# tb_tmp already exists from the previous section, so the data can be inserted directly
insert into table tb_rc select * from tb_tmp;
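
Note that LOAD DATA only moves files, so delimited text cannot be loaded straight into an RCFile table; it has to pass through a query as above. On reasonably recent Hive versions, describe formatted confirms the storage format by showing the input and output format classes:

describe formatted tb_rc;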




10. Flexible delimiters (via a regular-expression SerDe)

# cat /tmp/liuxiaowen/1.txt
000377201207221125^^APPLE IPHONE 4S^^2
132288201210331629^^THINKING IN JAVA^^1
132288201210331629^^THIN ssss^^1111
132288201210331629^^THdd dd ddJAVA^^10

add jar /opt/app/hive-0.7.0-rc1/lib/hive-contrib-0.7.0.jar;

create external table tt (
  times string,
  product_name string,
  sale_num string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  'input.regex' = '([^^]*)\\^\\^([^^]*)\\^\\^([^^]*)',
  'output.format.string' = '%1$s %2$s %3$s'
)
STORED AS TEXTFILE;

load data local inpath '/home/hadoop/in/tt' overwrite into table tt;

hive> select product_name from tt;
APPLE IPHONE 4S
THINKING IN JAVA
THIN ssss
THdd dd ddJAVA
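
Each capture group in input.regex maps to one table column in declaration order, so the pattern needs exactly as many groups as the table has columns. A hedged variant for a hypothetical '::'-delimited file (the table name tt2 is made up for illustration):

create external table tt2 (
  times string,
  product_name string,
  sale_num string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  'input.regex' = '([^:]*)::([^:]*)::([^:]*)'
)
STORED AS TEXTFILE;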




11. More complex data types

11.1 Using array

cat login_array.txt
192.168.1.1,3105007010|3105007011|3105007012
192.168.1.2,3105007020|3105007021|3105007022

CREATE TABLE login_array (
  ip STRING,
  uid array<BIGINT>
)
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY '|'
STORED AS TEXTFILE;

# Load the data into the Hive table
LOAD DATA LOCAL INPATH '/home/hadoop/in/login_array' OVERWRITE INTO TABLE login_array PARTITION (dt='20130101');

# View the data
select * from login_array;
192.168.1.1	[3105007010,3105007011,3105007012]	20130101
192.168.1.2	[3105007020,3105007021,3105007022]	20130101

# Access an element by index
select ip, uid[0] from login_array where dt='20130101';
192.168.1.1	3105007010
192.168.1.2	3105007020

# Array length
select ip, size(uid) from login_array where dt='20130101';
192.168.1.1	3
192.168.1.2	3

# Array lookup
select * from login_array where array_contains(uid, 3105007010);
192.168.1.1	[3105007010,3105007011,3105007012]	20130101
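
To flatten each array element onto its own row, the usual pattern is lateral view with explode; a sketch against the same table (single_uid is an arbitrary alias):

select ip, single_uid
from login_array lateral view explode(uid) t as single_uid
where dt = '20130101';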


11.2 Using Map

cat map_test_raw:
2014-03-03 12:22:34#127.0.0.1#get#amap#src=123&code=456&cookie=789#status=success&time=2s
2014-03-03 11:22:34#127.0.0.1#get#autonavi#src=123&code=456#status=success&time=2s&cookie=789

# Create the table
create external table map_test_raw (
  ts String,
  ip String,
  type String,
  logtype String,
  request Map<String,String>,
  response Map<String,String>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '#'
COLLECTION ITEMS TERMINATED BY '&'
MAP KEYS TERMINATED BY '='
stored as textfile;

LOAD DATA LOCAL INPATH '/home/hadoop/in/map_test_raw' OVERWRITE INTO TABLE map_test_raw;

# View the data
select * from map_test_raw;
2014-03-03 12:22:34	127.0.0.1	get	amap	{"src":"123","code":"456","cookie":"789"}	{"status":"success","time":"2s"}
2014-03-03 11:22:34	127.0.0.1	get	autonavi	{"src":"123","code":"456"}	{"status":"success","time":"2s","cookie":"789"}
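
Individual map values are read with key subscripts, and a missing key simply yields NULL; against the data above, request['cookie'] is '789' for the first row and NULL for the second, where cookie sits in response instead:

select ts, request['src'], request['cookie'], response['status']
from map_test_raw;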


11.3 Using struct

# cat login_struct.txt
192.168.1.1,zhangsan:40
192.168.1.1,lisi:41
192.168.1.1,gavin:42
192.168.1.1,wangwu:43
192.168.1.1,xiaoming:44
192.168.1.1,xiaojun:45

# Create the table
CREATE TABLE login_struct (
  ip STRING,
  user struct<name:string,age:int>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY ':'
STORED AS TEXTFILE;

# Load the data
LOAD DATA LOCAL INPATH '/home/hadoop/in/login_struct' OVERWRITE INTO TABLE login_struct;

# View the data
select ip, user from login_struct;
192.168.1.1	{"name":"zhangsan","age":40}
192.168.1.1	{"name":"lisi","age":41}
192.168.1.1	{"name":"gavin","age":42}
192.168.1.1	{"name":"wangwu","age":43}
192.168.1.1	{"name":"xiaoming","age":44}
192.168.1.1	{"name":"xiaojun","age":45}
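
Struct fields are addressed with dot notation; a sketch against the same table:

select ip, user.name, user.age
from login_struct
where user.age > 42;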

