Hive Basics + Case Study

Hive shell
List all databases: show databases;
Create a database: create database database_name;
Switch to a database: use database_name;
List all tables: show tables;
Find tables matching a pattern: show tables like '*name*';
List all Hive functions: show functions;
Show a table's schema: desc table_name;  desc formatted table_name;
Show the DDL used to create a table: show create table uv_etltest;
Drop a table: drop table uv_etltest;
Show a table's partitions: show partitions table_name;
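
A couple of optional clauses worth knowing: CREATE DATABASE also accepts IF NOT EXISTS, a COMMENT, and an explicit HDFS LOCATION. A minimal sketch, with demo_db and the path as illustrative placeholders:

CREATE DATABASE IF NOT EXISTS demo_db
COMMENT 'scratch database for log analysis'     -- placeholder comment
LOCATION '/user/hive/warehouse/demo_db.db';     -- placeholder HDFS path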

Case study: for each article id, count the PV and UV of log records whose l= field (the landing URL) matches the regex m\.fang\.com/news/\w+/03_\d+.html and does not contain sf_source=. The id is the number embedded in the URL path (e.g. 25116147 in 03_25116147.html).
Log format: 8801b688-1492445566729-8c8230bd^u_50cfe454-1499154971810-2c970e6d^0^http^m.fang.com^/news/sh/03_25116147.html^sf_source=ttcollaborate^5^无^无^/^^0^无^122.194.3.70^0^1^0^1^[1]^无^2017-07-04^15^56^11^2017-07-04^15^56^11^^^
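
Before building anything, the target regex can be sanity-checked against the sample URL above; a minimal sketch using RLIKE (Hive's regex-match operator), with the same pattern the queries below use:

SELECT 'm.fang.com/news/sh/03_25116147.html' RLIKE 'm\\.fang\\.com/news/\\w+/03_\\d+.html';   -- expected: true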

show databases;
create database uvtest;
use uvtest;
show tables;
desc uv_etltest;
show create table uv_etltest;
show partitions uv_etltest;

Create the table:
CREATE EXTERNAL TABLE IF NOT EXISTS uv_etltest (
Visitor String
,Visit String
,IsFirstVisit int
,LocationProtocol String
,LocationDomain String
,LocationPath String
,LocationParametersString String
,RefererType String
,RefererProtocol String
,RefererDomain String
,RefererPath String
,RefererParametersString String
,SearchGroupType String
,SearchKeyword String
,IP String
,OpenPageType String
,UserType String
,UserInfo String
,Mouse int
,BIDs String
,BBSSign String
,ServerTimeDate date
,ServerTimeHour String
,ServerTimeMinute String
,ServerTimeSecond String
,ClientTimeDate date
,ClientTimeHour String
,ClientTimeMinute String
,ClientTimeSecond String
,Title String
,UrlOwner String
,PageID String
)
PARTITIONED BY (logdate INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '^'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

Add a partition: ALTER TABLE uv_etltest ADD PARTITION (logdate=20170704) LOCATION '/logs/uv/etlbeginchar/20170704/';
Change a partition's location: ALTER TABLE uv_etltest PARTITION (logdate='20170704') SET LOCATION '/logs/uv/etlbeginchar/20170704/';
Rename a partition: ALTER TABLE table_name PARTITION (logdate='20170704') RENAME TO PARTITION (logdate='20170504');
Drop a partition: ALTER TABLE uv_etltest DROP IF EXISTS PARTITION (logdate='20170704');
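
After adding the partition it is worth spot-checking both the metadata and the data; a minimal sketch (MSCK REPAIR TABLE uv_etltest; is an alternative when many partition directories already exist on HDFS):

SHOW PARTITIONS uv_etltest;
DESCRIBE FORMATTED uv_etltest PARTITION (logdate=20170704);   -- shows the partition's HDFS location
SELECT * FROM uv_etltest WHERE logdate = 20170704 LIMIT 5;    -- confirm rows parse into the columns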

Run the query and print the results to the console:
SELECT split(split(LocationPath, '03_')[1], '.html')[0] AS id,
       count(Visitor) AS pv,
       count(DISTINCT Visitor) AS uv
FROM uv_etltest
WHERE logdate = 20170704
  AND concat(LocationDomain, LocationPath, LocationParametersString) NOT LIKE '%sf_source=%'
  AND concat(LocationDomain, LocationPath) REGEXP 'm\\.fang\\.com/news/\\w+/03_\\d+.html'
GROUP BY split(split(LocationPath, '03_')[1], '.html')[0]
ORDER BY pv DESC;
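
The nested split() calls are what extract the article id; checked in isolation with the path from the sample log line (split() takes a regex, so the unescaped dot in '.html' matches any character, which is harmless here because the character before 'html' is always a literal dot):

SELECT split(split('/news/sh/03_25116147.html', '03_')[1], '.html')[0];   -- returns 25116147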

Write the results to HDFS, with fields separated by tabs:
set hive.merge.mapredfiles=true;
INSERT OVERWRITE DIRECTORY '/user/hive/warehouse/uvtest.db/uv_etltest'
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
SELECT split(split(LocationPath, '03_')[1], '.html')[0] AS id,
       count(Visitor) AS pv,
       count(DISTINCT Visitor) AS uv
FROM uv_etltest
WHERE logdate = 20170704
  AND concat(LocationDomain, LocationPath, LocationParametersString) NOT LIKE '%sf_source=%'
  AND concat(LocationDomain, LocationPath) REGEXP 'm\\.fang\\.com/news/\\w+/03_\\d+.html'
GROUP BY split(split(LocationPath, '03_')[1], '.html')[0]
ORDER BY pv DESC;
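
The exported files can then be inspected without leaving the Hive shell; a minimal sketch (the exact file names, e.g. 000000_0, depend on the job, hence the glob):

dfs -cat /user/hive/warehouse/uvtest.db/uv_etltest/*;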

Store the results in a new Hive table:
set hive.merge.mapredfiles=true;
CREATE TABLE uv_etltestresult AS
SELECT split(split(LocationPath, '03_')[1], '.html')[0] AS id,
       count(Visitor) AS pv,
       count(DISTINCT Visitor) AS uv
FROM uv_etltest
WHERE logdate = 20170704
  AND concat(LocationDomain, LocationPath, LocationParametersString) NOT LIKE '%sf_source=%'
  AND concat(LocationDomain, LocationPath) REGEXP 'm\\.fang\\.com/news/\\w+/03_\\d+.html'
GROUP BY split(split(LocationPath, '03_')[1], '.html')[0]
ORDER BY pv DESC;
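
A quick sanity check on the new table: by construction uv (distinct visitors) can never exceed pv, so this should return no rows:

SELECT * FROM uv_etltestresult WHERE uv > pv LIMIT 10;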

Rename a table: alter table uv_etltest1 rename to uv_etltestresult;
Select * from uv_etltestresult where pv > 500;