CSV数据导入Parquet表
来源:互联网 发布:mac口红代理价格表 编辑:程序博客网 时间:2024/05/18 02:10
1,执行sql获取csv文件
select * from ad.click where month = 201412 and referralpage like 'http://weibo.com%'
2,将csv文件上传到服务器local:
/home/maoxiao/ad_weibo/data/click/201412.csv
3,创建csv表
CREATE TABLE maoxiao.click_csv(
profile_id int ,
server_time STRING ,
gridsum_id string ,
client_ip string ,
referral_page string ,
referral_host string ,
ad_id string ,
os_brief string ,
os_detail string ,
browser_brief string ,
browser_detail string ,
dotnet_version string ,
is_mobile boolean ,
device_brand string ,
device_type string ,
device_name string ,
geo_code int ,
country string ,
province string ,
city string ,
district string ,
longitude float ,
latitude float ,
isp string ,
year int,
month int,
day int
) row format delimited fields terminated by ',';
CREATE TABLE maoxiao.impression_csv(
profile_id int ,
server_time STRING ,
gridsum_id string ,
client_ip string ,
referral_page string ,
referral_host string ,
ad_id string ,
os_brief string ,
os_detail string ,
browser_brief string ,
browser_detail string ,
dotnet_version string ,
is_mobile boolean ,
device_brand string ,
device_type string ,
device_name string ,
geo_code int ,
country string ,
province string ,
city string ,
district string ,
longitude float ,
latitude float ,
isp string ,
year int,
month int,
day int
)row format delimited fields terminated by ',';
4 ,将csv文件导入csv表
load data local inpath '/home/maoxiao/weibo/data/click/201412.csv' into table maoxiao.click_csv;
load data local inpath '/home/maoxiao/weibo/data/impression/201412.csv' into table maoxiao.impression_csv;
5,创建sequence file table
CREATE TABLE maoxiao.click_seq(
profile_id int ,
server_time STRING ,
gridsum_id string ,
client_ip string ,
referral_page string ,
referral_host string ,
ad_id string ,
os_brief string ,
os_detail string ,
browser_brief string ,
browser_detail string ,
dotnet_version string ,
is_mobile boolean ,
device_brand string ,
device_type string ,
device_name string ,
geo_code int ,
country string ,
province string ,
city string ,
district string ,
longitude float ,
latitude float ,
isp string ,
year int,
month int,
day int ) stored as sequencefile;
CREATE TABLE maoxiao.impression_seq(
profile_id int ,
server_time STRING ,
gridsum_id string ,
client_ip string ,
referral_page string ,
referral_host string ,
ad_id string ,
os_brief string ,
os_detail string ,
browser_brief string ,
browser_detail string ,
dotnet_version string ,
is_mobile boolean ,
device_brand string ,
device_type string ,
device_name string ,
geo_code int ,
country string ,
province string ,
city string ,
district string ,
longitude float ,
latitude float ,
isp string ,
year int,
month int,
day int
) stored as sequencefile;
6,将csv表数据导入seq表
insert overwrite table maoxiao.click_seq select profile_id ,server_time ,gridsum_id,client_ip,referral_page,referral_host ,ad_id ,
os_brief ,os_detail ,browser_brief ,browser_detail ,dotnet_version ,is_mobile ,device_brand ,device_type ,device_name ,geo_code ,
country ,province ,city ,district ,longitude ,latitude ,isp ,year,month,day
from maoxiao.click_csv;
insert overwrite table maoxiao.impression_seq select profile_id ,server_time ,gridsum_id,client_ip,referral_page,referral_host ,ad_id ,
os_brief ,os_detail ,browser_brief ,browser_detail ,dotnet_version ,is_mobile ,device_brand ,device_type ,device_name ,geo_code ,
country ,province ,city ,district ,longitude ,latitude ,isp ,year,month,day
from maoxiao.impression_csv;
6.2,简单验证
select servertime,year,month,day from maoxiao.click_seq limit 1;
7,登陆impala,创建maoxiao.click_parquet
CREATE TABLE maoxiao.click_parquet(
profile_id int ,
server_time STRING ,
gridsum_id string ,
client_ip string ,
referral_page string ,
referral_host string ,
ad_id string ,
os_brief string ,
os_detail string ,
browser_brief string ,
browser_detail string ,
dotnet_version string ,
is_mobile boolean ,
device_brand string ,
device_type string ,
device_name string ,
geo_code int ,
country string ,
province string ,
city string ,
district string ,
longitude float ,
latitude float ,
isp string )
PARTITIONED BY (
year int,
month int,
day int)
ROW FORMAT SERDE
'parquet.hive.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'parquet.hive.DeprecatedParquetInputFormat'
OUTPUTFORMAT
'parquet.hive.DeprecatedParquetOutputFormat';
CREATE TABLE maoxiao.impression_parquet(
profile_id int ,
server_time STRING ,
gridsum_id string ,
client_ip string ,
referral_page string ,
referral_host string ,
ad_id string ,
os_brief string ,
os_detail string ,
browser_brief string ,
browser_detail string ,
dotnet_version string ,
is_mobile boolean ,
device_brand string ,
device_type string ,
device_name string ,
geo_code int ,
country string ,
province string ,
city string ,
district string ,
longitude float ,
latitude float ,
isp string ,
)
PARTITIONED BY (
year int,
month int,
day int)
ROW FORMAT SERDE
'parquet.hive.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'parquet.hive.DeprecatedParquetInputFormat'
OUTPUTFORMAT
'parquet.hive.DeprecatedParquetOutputFormat';
8,
动态分区:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nostrick;
set hive.exec.max.dynamic.partitions.pernode=1000;
9,将_seq表中的数据导入到_parquet表
insert overwrite table maoxiao.click_parquet partition(year,month,day)
select profile_id ,server_time ,gridsum_id,client_ip,referral_page,referral_host ,ad_id ,
os_brief ,os_detail ,browser_brief ,browser_detail ,dotnet_version ,is_mobile ,device_brand ,device_type ,device_name ,geo_code ,
country ,province ,city ,district ,longitude ,latitude ,isp ,year,month,day from maoxiao.click_seq where day <=20141215;
insert overwrite table maoxiao.click_parquet partition(year,month,day)
select profile_id ,server_time ,gridsum_id,client_ip,referral_page,referral_host ,ad_id ,
os_brief ,os_detail ,browser_brief ,browser_detail ,dotnet_version ,is_mobile ,device_brand ,device_type ,device_name ,geo_code ,
country ,province ,city ,district ,longitude ,latitude ,isp ,year,month,day from maoxiao.click_seq where day >20141215;
insert overwrite table maoxiao.impression_parquet partition(year,month,day)
select profile_id ,server_time ,gridsum_id,client_ip,referral_page,referral_host ,ad_id ,
os_brief ,os_detail ,browser_brief ,browser_detail ,dotnet_version ,is_mobile ,device_brand ,device_type ,device_name ,geo_code ,
country ,province ,city ,district ,longitude ,latitude ,isp ,year,month,day from maoxiao.impression_seq where day <=20141215;
insert overwrite table maoxiao.impression_parquet partition(year,month,day)
select profile_id ,server_time ,gridsum_id,client_ip,referral_page,referral_host ,ad_id ,
os_brief ,os_detail ,browser_brief ,browser_detail ,dotnet_version ,is_mobile ,device_brand ,device_type ,device_name ,geo_code ,
country ,province ,city ,district ,longitude ,latitude ,isp ,year,month,day from maoxiao.impression_seq where day >20141215;
9.2,简单验证
select * from maoxiao.click_parquet limit 5;
0 0
- CSV数据导入Parquet表
- SQLSERVER导入CSV表数据
- 导入导出mysql表数据到csv
- MySql 导入CSV数据
- Android SQLite导入CSV数据
- MySQL CSV 数据导入导出
- 将CSV数据导入mysql
- MongoBD导入csv格式数据
- mongdb导入csv格式数据
- oracle-csv数据导入方法
- 61 mysql csv导入数据
- ActiveAdmin扩展:CSV数据导入
- mysql导入csv格式数据
- MongoDB 导入数据(CSV文件)
- mongodb 导入CSV格式数据
- CSV数据导入mysql数据库
- mysql导入csv文件数据
- csv中的数据导入数据库
- 权限系统与UI交互心得
- play! 关于在 Eclipse 中找不到 CRUD 类的解决办法
- 有道词典 Andriod 版本数据格式分析
- 自考路上,我们更加精彩
- HDU2825 Wireless Password【AC自动机】
- CSV数据导入Parquet表
- java语言实现创建型模式—简单工厂模式
- Play通过message转换窗体显示的Label
- XS_C 学生类的成绩输入比较
- Linux高级编程复习笔记 第五章 exec 进程 父子进程 fork()
- maven配置scala环境
- eclipse 不识别jdk1.5以上版本自动装箱问题的解决
- linux下的四个简单函数介绍:getpid, getppid, getuid, getgid
- UVA11624 Fire! 两次BFS 读懂题意很重要