CSV数据导入Parquet表

来源:互联网 发布:mac口红代理价格表 编辑:程序博客网 时间:2024/05/18 02:10
1,执行sql获取csv文件
select * from ad.click where month = 201412 and referralpage like 'http://weibo.com%'
2,将csv文件上传到服务器local:
/home/maoxiao/ad_weibo/data/click/201412.csv
3,创建csv表
CREATE TABLE maoxiao.click_csv(
  profile_id int  ,
  server_time STRING  ,
  gridsum_id string  ,
  client_ip string  ,
  referral_page string  ,
  referral_host string  ,
  ad_id string  ,
  os_brief string  ,
  os_detail string  ,
  browser_brief string  ,
  browser_detail string  ,
  dotnet_version string  ,
  is_mobile boolean  ,
  device_brand string  ,
  device_type string  ,
  device_name string  ,
  geo_code int  ,
  country string  ,
  province string  ,
  city string  ,
  district string  ,
  longitude float  ,
  latitude float  ,
  isp string ,
  year int,
  month int,
  day int
 ) row format delimited fields terminated by ',';

CREATE TABLE maoxiao.impression_csv(
  profile_id int  ,
  server_time STRING  ,
  gridsum_id string  ,
  client_ip string  ,
  referral_page string  ,
  referral_host string  ,
  ad_id string  ,
  os_brief string  ,
  os_detail string  ,
  browser_brief string  ,
  browser_detail string  ,
  dotnet_version string  ,
  is_mobile boolean  ,
  device_brand string  ,
  device_type string  ,
  device_name string  ,
  geo_code int  ,
  country string  ,
  province string  ,
  city string  ,
  district string  ,
  longitude float  ,
  latitude float  ,
  isp string ,
  year int,
  month int,
  day int
  )row format delimited fields terminated by ',';


4 ,将csv文件导入csv表
load data local inpath '/home/maoxiao/weibo/data/click/201412.csv' into table maoxiao.click_csv;
load data local inpath '/home/maoxiao/weibo/data/impression/201412.csv' into table maoxiao.impression_csv;

5,创建sequence file table
CREATE TABLE maoxiao.click_seq(
   profile_id int  ,
  server_time STRING  ,
  gridsum_id string  ,
  client_ip string  ,
  referral_page string  ,
  referral_host string  ,
  ad_id string  ,
  os_brief string  ,
  os_detail string  ,
  browser_brief string  ,
  browser_detail string  ,
  dotnet_version string  ,
  is_mobile boolean  ,
  device_brand string  ,
  device_type string  ,
  device_name string  ,
  geo_code int  ,
  country string  ,
  province string  ,
  city string  ,
  district string  ,
  longitude float  ,
  latitude float  ,
  isp string ,
  year int,
  month int,
  day int  ) stored as sequencefile;

CREATE TABLE maoxiao.impression_seq(
   profile_id int  ,
  server_time STRING  ,
  gridsum_id string  ,
  client_ip string  ,
  referral_page string  ,
  referral_host string  ,
  ad_id string  ,
  os_brief string  ,
  os_detail string  ,
  browser_brief string  ,
  browser_detail string  ,
  dotnet_version string  ,
  is_mobile boolean  ,
  device_brand string  ,
  device_type string  ,
  device_name string  ,
  geo_code int  ,
  country string  ,
  province string  ,
  city string  ,
  district string  ,
  longitude float  ,
  latitude float  ,
  isp string ,
  year int,
  month int,
  day int
  ) stored as sequencefile;

6,将csv表数据导入seq表
insert overwrite table maoxiao.click_seq select profile_id ,server_time ,gridsum_id,client_ip,referral_page,referral_host ,ad_id ,
os_brief ,os_detail ,browser_brief ,browser_detail ,dotnet_version ,is_mobile ,device_brand ,device_type ,device_name ,geo_code ,
country ,province ,city ,district ,longitude ,latitude ,isp ,year,month,day
 from maoxiao.click_csv;

insert overwrite table maoxiao.impression_seq select profile_id ,server_time ,gridsum_id,client_ip,referral_page,referral_host ,ad_id ,
os_brief ,os_detail ,browser_brief ,browser_detail ,dotnet_version ,is_mobile ,device_brand ,device_type ,device_name ,geo_code ,
country ,province ,city ,district ,longitude ,latitude ,isp ,year,month,day
 from maoxiao.impression_csv;

6.2,简单验证
select servertime,year,month,day from maoxiao.click_seq limit 1;
7,登陆impala,创建maoxiao.click_parquet
CREATE TABLE maoxiao.click_parquet(
  profile_id int  ,
  server_time STRING  ,
  gridsum_id string  ,
  client_ip string  ,
  referral_page string  ,
  referral_host string  ,
  ad_id string  ,
  os_brief string  ,
  os_detail string  ,
  browser_brief string  ,
  browser_detail string  ,
  dotnet_version string  ,
  is_mobile boolean  ,
  device_brand string  ,
  device_type string  ,
  device_name string  ,
  geo_code int  ,
  country string  ,
  province string  ,
  city string  ,
  district string  ,
  longitude float  ,
  latitude float  ,
  isp string ) 
  PARTITIONED BY (
  year int,
  month int,
  day int)
ROW FORMAT SERDE
  'parquet.hive.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
  'parquet.hive.DeprecatedParquetInputFormat'
OUTPUTFORMAT
  'parquet.hive.DeprecatedParquetOutputFormat';

CREATE TABLE maoxiao.impression_parquet(
 profile_id int  ,
  server_time STRING  ,
  gridsum_id string  ,
  client_ip string  ,
  referral_page string  ,
  referral_host string  ,
  ad_id string  ,
  os_brief string  ,
  os_detail string  ,
  browser_brief string  ,
  browser_detail string  ,
  dotnet_version string  ,
  is_mobile boolean  ,
  device_brand string  ,
  device_type string  ,
  device_name string  ,
  geo_code int  ,
  country string  ,
  province string  ,
  city string  ,
  district string  ,
  longitude float  ,
  latitude float  ,
  isp string ,
  )
 PARTITIONED BY (
  year int,
  month int,
  day int)
ROW FORMAT SERDE
  'parquet.hive.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
  'parquet.hive.DeprecatedParquetInputFormat'
OUTPUTFORMAT
  'parquet.hive.DeprecatedParquetOutputFormat';

8,
动态分区:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nostrick;
set hive.exec.max.dynamic.partitions.pernode=1000;
9,将_seq表中的数据导入到_parquet表
insert overwrite   table maoxiao.click_parquet partition(year,month,day) 
select profile_id ,server_time ,gridsum_id,client_ip,referral_page,referral_host ,ad_id ,
os_brief ,os_detail ,browser_brief ,browser_detail ,dotnet_version ,is_mobile ,device_brand ,device_type ,device_name ,geo_code ,
country ,province ,city ,district ,longitude ,latitude ,isp ,year,month,day from maoxiao.click_seq where day <=20141215;


insert overwrite   table maoxiao.click_parquet partition(year,month,day) 
select profile_id ,server_time ,gridsum_id,client_ip,referral_page,referral_host ,ad_id ,
os_brief ,os_detail ,browser_brief ,browser_detail ,dotnet_version ,is_mobile ,device_brand ,device_type ,device_name ,geo_code ,
country ,province ,city ,district ,longitude ,latitude ,isp ,year,month,day from maoxiao.click_seq where day >20141215;

insert overwrite   table maoxiao.impression_parquet partition(year,month,day) 
select profile_id ,server_time ,gridsum_id,client_ip,referral_page,referral_host ,ad_id ,
os_brief ,os_detail ,browser_brief ,browser_detail ,dotnet_version ,is_mobile ,device_brand ,device_type ,device_name ,geo_code ,
country ,province ,city ,district ,longitude ,latitude ,isp ,year,month,day from maoxiao.impression_seq where day <=20141215;

insert overwrite   table maoxiao.impression_parquet partition(year,month,day) 
select profile_id ,server_time ,gridsum_id,client_ip,referral_page,referral_host ,ad_id ,
os_brief ,os_detail ,browser_brief ,browser_detail ,dotnet_version ,is_mobile ,device_brand ,device_type ,device_name ,geo_code ,
country ,province ,city ,district ,longitude ,latitude ,isp ,year,month,day from maoxiao.impression_seq where day >20141215;


9.2,简单验证
select * from maoxiao.click_parquet limit 5;






0 0