hive常用操作

来源:互联网 发布:ubuntu输入不了中文 编辑:程序博客网 时间:2024/06/06 16:27

-2、外连接:right outer join

-1、加路径

hive <<EEOOFF
use lilu;
add file 60_sms_dun_telandphone/dun_org_name_map.txt;
add file 60_sms_dun_telandphone/rule_zhengye_orgcode.py;

0、byobu程序在服务器运行;本地hive能运行的程序,放到linux环境中用sh脚本运行,不一定能成功!

      cd /home/lilu/tmp_0505
      bash run_prepare.sh 20160531 340

      bash round2_rule_1.sh 20170506

1、hive判断是否为NULL

  判断为NULL:用 where 字段a is null

  判断非NULL且非空字符串:用 where length(a)>0

2、hive建表、导入txt文件:
USE databasename;
CREATE TABLE if not exists temp_test(
masterhotel int COMMENT '酒店ID',
ciiquantity  double COMMENT '间夜量'

)

row format delimited

fields terminated by '\t'

stored as textfile;

hive> load data local inpath 'city1.txt' into table temp_test;

分区表:create table partition_test
(member_id string,
name string
)
partitioned by (
stat_date string,
province string)
row format delimited fields terminated by ',';

alter table partition_test add partition (stat_date='20110728',province='zhejiang');#创建好了一个分区

insert overwrite table partition_test partition(stat_date='20110728',province='henan') select member_id,name from partition_test_input where stat_date='20110728' and province='henan';#向partition_test的分区中插入数据

set hive.exec.dynamic.partition=true;#动态分区

insert overwrite table partition_test partition(stat_date='20110728',province)#主分区采用静态列,副分区采用动态分区

set hive.exec.dynamic.partition.mode=nonstrict;#允许分区列全部是动态的

LOAD DATA local INPATH '/user/hua/*' INTO TABLE day_hour partition(dt='2010-07-07');

3、hive求差集:

 hive> select * from zz0; 

111111 
222222 
888888 
hive> select * from zz1; 
111111 
333333 
444444 
888888
 select * from zz0 full outer join zz1 on zz0.uid = zz1.uid; 
NULL 
111111  111111 
222222  NULL 
NULL    333333 
NULL    444444 
888888  888888 

求结果的并集:

create table lilu.dun_all_0428_2 as
select distinct collection_number from tmp1
union all select distinct collection_number from tmp2
union all select distinct collection_number from tmp3;

4、join:

hive> select * from zz0 join zz1 on zz0.uid = zz1.uid; 
111111  111111 
888888  888888

5、数据表导出为文件

insert overwrite local directory '/home/lilu/data_for_lilu_0316_1' row format delimited fields terminated by '\t'

select * from databasename.temp_test;

6、两个表join得到一张新表

set mapreduce.map.memory.mb=4096;
set mapreduce.reduce.memory.mb=4096; 

create table temp0 as
select a.*,b.* from dun_0310 as a join tb_v4 as b on a.tel_1 = b.other_cell_phone;  -- dun_0310 放小表(小表放在join左边)

7、合并文件

cd /home/lilu/data2
cat 0* > temp.txt

8、随机取数据

select * from temp_test order by rand() limit 1000;

9、日期格式转换:将20160101转化成2016-01-01

yyyymmdd格式:计算日期最大值、最小值:

create table temp_ll1 as select tel,max(pt_dt) as max_dt,min(pt_dt) as min_dt from temp_details_123m_3 group by tel;

yyyyMMdd格式转化为yyyy-MM-dd格式(注意:月份是大写MM,小写mm表示分钟),计算年月日:

create table tb1 as

select from_unixtime(unix_timestamp(time1,'yyyyMMdd'),'yyyy-MM-dd') as t1 from temp_tb0;

select year(t1) as year,month(t1) as month,day(t1) as day from tb1;

yyyy-mm-dd格式:计算两个日期相差几天

create table temp_total_span as select tel as tel1,datediff(max_dt,min_dt) as total_span from temp_ll2; 

日期时间转日期:

select to_date('2011-12-08 10:03:01') from temp_tb1;

10、将一张表中的数据插入到另一张表

insert into table temp_details_5m2 select * from temp_details_7m1;

11、查看、删除job

hadoop job -list

hadoop job -kill job_id

12、建数据库

create database lilu;

13、

select tel,year,month,if(imsis>10,1,0) as imsis10 from tb_323210_imsis;

select case when 1=1 then 'tom' when 2=2 then 'mary' else 'tim' end from lxw_dual;

14、根据key_number列去重

create table tb_all_tel as

select *,row_number() over(distribute by key_number) rn from zhangwj.lilu_all_tel;

create table tb_tel1 as
select * from tb_all_tel where rn=1;

15、streaming理解为将hive表里的数据按行交到python程序中执行,最终将结果还按行写入到一张hive表里面。

16、修改表名、列名:

修改表名:ALTER TABLE old_name RENAME TO new_name;

修改列名:ALTER TABLE name CHANGE column_name new_name new_type;

17、需要中间结果命名一下

select distinct collection_number

from (select distinct collection_number from tmp1
union all select distinct collection_number from tmp2_1
union all select distinct collection_number from tmp3

union all select distinct collection_number from tmp5) tt;

18、select *,
case when sms_splitter_str like "%|催收企业名%" then 1
else 0 end as have_org
from tb_sms_sample_dun_telandphone_rule_4;

19、no  score

1  2

1  3

1  3

2  2

2  4

select no,collect_set(score) as score_set from tablss group by no;

结果:no  score_set

1  [2,3]

2  [2,4]

select no,collect_set(score)[0] from tablss group by no;

no  score

1  2

2  2

20、group by放在where后面,向hive表中插入数据。insert overwrite table tb_result 若表中本来存在数据,覆盖表中的数据。

insert overwrite table tb_result_rule_6_all 
select collection_number,min(pt_dt) as min_date,max(pt_dt) as max_date 
from tb_sms_sample_dun_telandphone
where (rule_tag=3 or come_from=1) and length(collection_number)>3
group by collection_number;

21、hive表直接导入到mysql表

sqoop export \
--connect 'jdbc:mysql://192.168.109.200:3306/db_tel_manage?useUnicode=true&characterEncoding=utf8' \
--username root --password root123 \
--table tb_result_rule_union_all \
--export-dir /user/hive/warehouse/qihu_sms_4.db/tb_result_rule_union_all \
--fields-terminated-by '\001' \
--lines-terminated-by '\n'

22、字符串截取函数:substr

语法: substr(string A, int start, int len)

hive> select substr('abcde',3,2) from lxw_dual;

cd

23、正则表达式替换函数:regexp_replace

语法: regexp_replace(string A, string B, string C)  说明:将字符串A中的符合java正则表达式B的部分替换为C。

hive> select regexp_replace('foobar', 'oo|ar', '') from lxw_dual;

fb

24、正则表达式解析函数:regexp_extract

hive> select regexp_extract('foothebar', 'foo(.*?)(bar)', 0) from lxw_dual;

foothebar

25、hive、java匹配手机号码

select regexp_extract('8618750930899', '1[34578]\\d{9}', 0);  -- hive字符串中\d需写成\\d

       移动:134、135、136、137、138、139、150、151、157(TD)、158、159、187、188

  联通:130、131、132、152、155、156、185、186

  电信:133、153、180、189

"^((13[0-9])|(15[^4,\\D])|(18[0,5-9]))\\d{8}$"
((17[0-9])|(14[0-9])|(13[0-9])|(15[^4,\\D])|(18[0,5-9]))\\d{8}
regexp_extract(sms_sender,'(1[34578]\\d{9}|0(10|2[0-5789]|\\d{3})\\d{7,8})',0) as sms_sender

0 0
原创粉丝点击