parquet-thrift: reading and writing data, and querying it from a Hive table
Blog: http://www.fanlegefan.com
Article: http://www.fanlegefan.com/archives/parquet-thrfit-hive/
Abstract
This post writes and reads parquet-thrift files with MapReduce, then loads the data into a Hive table and queries it.
Define the person.thrift file
namespace java com.fan.hadoop.parquet.thrift

struct Person {
    1: optional string name;
    2: optional i32 age;
}
Generate the Java bean
thrift --gen java person.thrift
The generated files are laid out as follows:
├── gen-java
│   └── com
│       └── fan
│           └── hadoop
│               └── parquet
│                   └── thrift
│                       └── Person.java
└── person.thrift
Copy Person.java into the project.
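Before wiring the bean into a MapReduce job, it can be worth sanity-checking it with a small standalone writer. The sketch below assumes the ThriftParquetWriter class from parquet-mr's parquet-thrift module; the class name PersonWriterCheck and the output path are made up for illustration.

package com.fan.hadoop.parquet.thrift;

import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.thrift.ThriftParquetWriter;

// Minimal standalone sanity check for the generated Person bean, assuming
// parquet-thrift's ThriftParquetWriter(Path, Class, CompressionCodecName)
// constructor; the output path is just an example.
public class PersonWriterCheck {
    public static void main(String[] args) throws Exception {
        Path path = new Path("hdfs://localhost:9000/test/person-check.parquet");
        ThriftParquetWriter<Person> writer =
                new ThriftParquetWriter<Person>(path, Person.class, CompressionCodecName.GZIP);
        Person p = new Person();
        p.setName("leo");
        p.setAge(20);
        writer.write(p);   // serialize the Thrift bean into the Parquet file
        writer.close();    // flush data and write the Parquet footer
    }
}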
Writing parquet-thrift files with MapReduce
package com.fan.hadoop.parquet.thrift;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.thrift.ParquetThriftOutputFormat;

import java.io.IOException;

/**
 * Created by http://www.fanlegefan.com on 17-7-21.
 */
public class ParquetThriftWriterMR {

    public static class ParthriftMap extends Mapper<LongWritable, Text, Void, Person> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is "name,age"; parse it into the generated Thrift bean.
            String[] arr = value.toString().split(",");
            Person p = new Person();
            p.setName(arr[0]);
            p.setAge(Integer.valueOf(arr[1].trim()));
            context.write(null, p);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(ParquetThriftWriterMR.class);
        job.setJobName("parquetthrift");

        String in = "hdfs://localhost:9000/test/user.txt";
        String out = "hdfs://localhost:9000/test/wdthrfit";

        job.setMapperClass(ParthriftMap.class);
        TextInputFormat.addInputPath(job, new Path(in));
        job.setInputFormatClass(TextInputFormat.class);

        // Map-only job: mapper output goes straight into GZIP-compressed Parquet files.
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(ParquetThriftOutputFormat.class);
        ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.GZIP);
        ParquetThriftOutputFormat.setOutputPath(job, new Path(out));
        ParquetThriftOutputFormat.setThriftClass(job, Person.class);

        job.waitForCompletion(true);
    }
}
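For reference, the job expects /test/user.txt to contain one comma-separated name,age pair per line; judging from the query results later in the post, it looks like this:

leo,20
jack,10
jim,30
andy,30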
The job produces the following files:
hadoop dfs -ls /test/wdthrfit
Found 4 items
-rw-r--r--   3 work supergroup          0 2017-07-21 11:51 /test/wdthrfit/_SUCCESS
-rw-r--r--   3 work supergroup        529 2017-07-21 11:51 /test/wdthrfit/_common_metadata
-rw-r--r--   3 work supergroup        680 2017-07-21 11:51 /test/wdthrfit/_metadata
-rw-r--r--   3 work supergroup        776 2017-07-21 11:51 /test/wdthrfit/part-m-00000.gz.parquet
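To double-check the schema that parquet-thrift derived from the IDL, the footer can be dumped with parquet-tools; this is a sketch, and the exact jar name/version is an assumption:

hadoop jar parquet-tools-1.8.1.jar schema /test/wdthrfit/part-m-00000.gz.parquet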
Reading parquet-thrift files with MapReduce
package com.fan.hadoop.parquet.thrift;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.parquet.hadoop.thrift.ParquetThriftInputFormat;

import java.io.IOException;

/**
 * Created by http://www.fanlegefan.com on 17-7-21.
 */
public class ParquetThriftReaderMR {

    public static class ParthriftMap extends Mapper<Void, Person, LongWritable, Text> {

        private Text p = new Text();

        @Override
        public void map(Void key, Person value, Context context)
                throws IOException, InterruptedException {
            // Each record arrives as a deserialized Thrift Person bean; emit it as CSV.
            p.set(value.getName() + "," + value.getAge());
            context.write(null, p);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(ParquetThriftReaderMR.class);
        job.setJobName("parquetthrift");

        String in = "hdfs://localhost:9000/test/wdthrfit";
        String out = "hdfs://localhost:9000/test/wdthriftread";

        job.setMapperClass(ParthriftMap.class);
        job.setInputFormatClass(ParquetThriftInputFormat.class);
        ParquetThriftInputFormat.setInputPaths(job, new Path(in));

        // Map-only job: dump the records back out as plain text.
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(out));

        job.waitForCompletion(true);
    }
}
Check the contents of the generated text file:
hadoop dfs -cat /test/wdthriftread/part-m-00000
leo,20
jack,10
jim,30
andy,30
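MapReduce aside, the same files can also be read directly in a driver program. The following is a minimal sketch, assuming parquet-thrift's ThriftParquetReader builder API (treat the exact builder methods as an assumption and check the parquet-mr version on your classpath); PersonReaderCheck is a made-up class name.

package com.fan.hadoop.parquet.thrift;

import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.thrift.ThriftParquetReader;

// Minimal standalone read of the files written above, assuming
// ThriftParquetReader.build(Path).withThriftClass(...) exists in the
// parquet-thrift version on the classpath.
public class PersonReaderCheck {
    public static void main(String[] args) throws Exception {
        Path path = new Path("hdfs://localhost:9000/test/wdthrfit");
        ParquetReader<Person> reader = ThriftParquetReader.<Person>build(path)
                .withThriftClass(Person.class)
                .build();
        Person p;
        while ((p = reader.read()) != null) {   // read() returns null at end of input
            System.out.println(p.getName() + "," + p.getAge());
        }
        reader.close();
    }
}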
Reading parquet-thrift files through a Hive table
Create a Parquet-format table:
create table test_parquet(
    name string,
    age int
) stored as parquet;
Show the table definition:
hive> show create table test_parquet;
OK
CREATE TABLE `test_parquet`(
  `name` string,
  `age` int)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://localhost:9000/user/hive/warehouse/test_parquet'
TBLPROPERTIES (
  'transient_lastDdlTime'='1495038003')
Copy the files generated earlier into the Hive table's data directory:
hadoop dfs -cp /test/wdthrfit/* /user/hive/warehouse/test_parquet/
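Copying into the warehouse directory works here because the job's schema (name, age) matches the table. Alternatively, Hive can read the data in place through an external table; the sketch below is one way to do that (the table name test_parquet_ext is hypothetical):

create external table test_parquet_ext(
    name string,
    age int
) stored as parquet
location 'hdfs://localhost:9000/test/wdthrfit';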
Query the data from Hive:
hive> select name,age from test_parquet;
OK
leo     20
jack    10
jim     30
andy    30
Time taken: 0.049 seconds, Fetched: 4 row(s)