hive常用操作

来源:互联网 发布:ubuntu输入不了中文 编辑:程序博客网 时间:2024/06/06 16:27

-2、外连接:right outer join

-1、加路径

hive <<EEOOFF
use lilu;
add file 60_sms_dun_telandphone/dun_org_name_map.txt;
add file 60_sms_dun_telandphone/rule_zhengye_orgcode.py;

0、byobu程序在服务器运行;本地hive能运行的程序,放到linux环境中用sh脚本运行,不一定能成功!

      cd /home/lilu/tmp_0505
      bash run_prepare.sh 20160531 340

      bash round2_rule_1.sh 20170506

1、hive判断是否为NULL

  判断为NULL:用 where 字段a is null

  判断非NULL且非空字符串:用 where length(a)>0

2、hive建表、导入txt文件:
USE databasename;
CREATE TABLE if not exists temp_test(
masterhotel int COMMENT '酒店ID',
ciiquantity  double COMMENT '间夜量'

)

row format delimited

fields terminated by '\t'

stored as textfile;

hive> load data local inpath 'city1.txt' into table temp_test;

分区表:create table partition_test
(member_id string,
name string
)
partitioned by (
stat_date string,
province string)
row format delimited fields terminated by ',';

alter table partition_test add partition (stat_date='20110728',province='zhejiang');#创建好了一个分区

insert overwrite table partition_test partition(stat_date='20110728',province='henan') select member_id,name from partition_test_input where stat_date='20110728' and province='henan';#向partition_test的分区中插入数据

set hive.exec.dynamic.partition=true;#动态分区

insert overwrite table partition_test partition(stat_date='20110728',province)#主分区采用静态列,副分区采用动态分区

set hive.exec.dynamic.partition.mode=nonstrict;#允许分区列全部是动态的

LOAD DATA local INPATH '/user/hua/*' INTO TABLE day_hour partition(dt='2010-07-07');

3、hive求差集:

 hive> select * from zz0; 

111111 
222222 
888888 
hive> select * from zz1; 
111111 
333333 
444444 
888888
 select * from zz0 full outer join zz1 on zz0.uid = zz1.uid; 
NULL 
111111  111111 
222222  NULL 
NULL    333333 
NULL    444444 
888888  888888 

求结果的并集:

create table lilu.dun_all_0428_2 as
select distinct collection_number from tmp1
union all select distinct collection_number from tmp2
union all select distinct collection_number from tmp3;

4、join:

hive> select * from zz0 join zz1 on zz0.uid = zz1.uid; 
111111  111111 
888888  888888

5、数据表导出为文件

insert overwrite local directory '/home/lilu/data_for_lilu_0316_1' row format delimited fields terminated by '\t'

select * from databasename.temp_test;

6、两个表join得到一张新表

set mapreduce.map.memory.mb=4096;
set mapreduce.reduce.memory.mb=4096; 

create table temp0 as
select a.*,b.* from dun_0310 as a join tb_v4 as b on a.tel_1 = b.other_cell_phone;  -- dun_0310 放小表(小表放在join左边)

7、合并文件

cd /home/lilu/data2
cat 0* > temp.txt

8、随机取数据

select * from temp_test order by rand() limit 1000;

9、日期格式转换:将20160101转化成2016-01-01

yyyymmdd格式:计算日期最大值、最小值:

create table temp_ll1 as select tel,max(pt_dt) as max_dt,min(pt_dt) as min_dt from temp_details_123m_3 group by tel;

yyyyMMdd格式转化为yyyy-MM-dd格式(注意:月份是大写MM,小写mm表示分钟),计算年月日:

create table tb1 as

select from_unixtime(unix_timestamp(time1,'yyyyMMdd'),'yyyy-MM-dd') as t1 from temp_tb0;

select year(t1) as year,month(t1) as month,day(t1) as day from tb1;

yyyy-mm-dd格式:计算两个日期相差几天

create table temp_total_span as select tel as tel1,datediff(max_dt,min_dt) as total_span from temp_ll2; 

日期时间转日期:

select to_date('2011-12-08 10:03:01') from temp_tb1;

10、将一张表中的数据插入到另一张表

insert into table temp_details_5m2 select * from temp_details_7m1;

11、查看、删除job

hadoop job -list

hadoop job -kill job_id

12、建数据库

create database lilu;

13、

select tel,year,month,if(imsis>10,1,0) as imsis10 from tb_323210_imsis;

select case when 1=1 then 'tom' when 2=2 then 'mary' else 'tim' end from lxw_dual;

14、根据key_number列去重

create table tb_all_tel as

select *,row_number() over(distribute by key_number) rn from zhangwj.lilu_all_tel;

create table tb_tel1 as
select * from tb_all_tel where rn=1;

15、streaming理解为将hive表里的数据按行交到python程序中执行,最终将结果还按行写入到一张hive表里面。

16、修改表名、列名:

修改表名:ALTER TABLE old_name RENAME TO new_name;

修改列名:ALTER TABLE name CHANGE column_name new_name new_type;

17、需要中间结果命名一下

select distinct collection_number

from (select distinct collection_number from tmp1
union all select distinct collection_number from tmp2_1
union all select distinct collection_number from tmp3

union all select distinct collection_number from tmp5) tt;

18、select *,
case when sms_splitter_str like "%|催收企业名%" then 1
else 0 end as have_org
from tb_sms_sample_dun_telandphone_rule_4;

19、no  score

1  2

1  3

1  3

2  2

2  4

select no,collect_set(score) as score_set from tablss group by no;

结果:no  score_set

1  [2,3]

2  [2,4]

select no,collect_set(score)[0] from tablss group by no;

no  score

1  2

2  2

20、group by放在where后面,向hive表中插入数据。insert overwrite table tb_result 若表中本来存在数据,覆盖表中的数据。

insert overwrite table tb_result_rule_6_all 
select collection_number,min(pt_dt) as min_date,max(pt_dt) as max_date 
from tb_sms_sample_dun_telandphone
where (rule_tag=3 or come_from=1) and length(collection_number)>3
group by collection_number;

21、hive表直接导入到mysql表

sqoop export \
--connect 'jdbc:mysql://192.168.109.200:3306/db_tel_manage?useUnicode=true&characterEncoding=utf8' \
--username root --password root123 \
--table tb_result_rule_union_all \
--export-dir /user/hive/warehouse/qihu_sms_4.db/tb_result_rule_union_all \
--fields-terminated-by '\001' \
--lines-terminated-by '\n'

22、字符串截取函数:substr

语法: substr(string A, int start, int len)

hive> select substr('abcde',3,2) from lxw_dual;

cd

23、正则表达式替换函数:regexp_replace

语法: regexp_replace(string A, string B, string C)  说明:将字符串A中的符合java正则表达式B的部分替换为C。

hive> select regexp_replace('foobar', 'oo|ar', '') from lxw_dual;

fb

24、正则表达式解析函数:regexp_extract

hive> select regexp_extract('foothebar', 'foo(.*?)(bar)', 0) from lxw_dual;

foothebar

25、hive、java匹配手机号码

select regexp_extract('8618750930899', '1[34578]\\d{9}', 0);  -- hive字符串中\d需写成\\d

       移动:134、135、136、137、138、139、150、151、157(TD)、158、159、187、188

  联通:130、131、132、152、155、156、185、186

  电信:133、153、180、189

"^((13[0-9])|(15[^4,\\D])|(18[0,5-9]))\\d{8}$"
((17[0-9])|(14[0-9])|(13[0-9])|(15[^4,\\D])|(18[0,5-9]))\\d{8}
regexp_extract(sms_sender,'(1[34578]\\d{9}|0(10|2[0-5789]|\\d{3})\\d{7,8})',0) as sms_sender

0 0
原创粉丝点击