Reading and Writing parquet-thrift Data, and Querying It from a Hive Table


Blog: http://www.fanlegefan.com
Article: http://www.fanlegefan.com/archives/parquet-thrfit-hive/


Summary

This article uses MapReduce to read and write parquet-thrift files, then loads the data into a Hive table and queries it.

Define the person.thrift file

namespace java com.fan.hadoop.parquet.thrift

struct Person {
    1: optional string name;
    2: optional i32 age;
}

Generate the Java bean

thrift --gen java person.thrift

The generated files are laid out as follows:

├── gen-java
│   └── com
│       └── fan
│           └── hadoop
│               └── parquet
│                   └── thrift
│                       └── Person.java
└── person.thrift

Copy the generated Person.java into the project.
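The generated Person.java is a full Thrift struct implementation with many members; for this article all that matters is its simple getters and setters. A minimal usage sketch (names follow standard Thrift codegen conventions):

    // Exercising the thrift-generated bean
    Person p = new Person();
    p.setName("leo");   // field 1: optional string name
    p.setAge(20);       // field 2: optional i32 age
    System.out.println(p.getName() + "," + p.getAge());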

Writing parquet-thrift files with MapReduce

package com.fan.hadoop.parquet.thrift;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.thrift.ParquetThriftOutputFormat;

import java.io.IOException;

/**
 * Created by http://www.fanlegefan.com on 17-7-21.
 */
public class ParquetThriftWriterMR {

    public static class ParthriftMap extends
            Mapper<LongWritable, Text, Void, Person> {

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is a "name,age" record
            String[] arr = value.toString().split(",");
            Person p = new Person();
            p.setName(arr[0]);
            p.setAge(Integer.valueOf(arr[1].trim()));
            // ParquetThriftOutputFormat takes Void keys, so the key is null
            context.write(null, p);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(ParquetThriftWriterMR.class);
        job.setJobName("parquetthrfit");

        String in = "hdfs://localhost:9000/test/user.txt";
        String out = "hdfs://localhost:9000/test/wdthrfit";

        job.setMapperClass(ParthriftMap.class);
        TextInputFormat.addInputPath(job, new Path(in));
        job.setInputFormatClass(TextInputFormat.class);

        // Map-only job: the mappers write the Parquet files directly
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(ParquetThriftOutputFormat.class);
        ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.GZIP);
        ParquetThriftOutputFormat.setOutputPath(job, new Path(out));
        ParquetThriftOutputFormat.setThriftClass(job, Person.class);

        job.waitForCompletion(true);
    }
}
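The job expects /test/user.txt to be a plain CSV of name,age records. Judging from the query results shown later in the article, the input presumably looks like this:

    leo,20
    jack,10
    jim,30
    andy,30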

The job produces the following files:

hadoop dfs -ls /test/wdthrfit
Found 4 items
-rw-r--r--   3 work supergroup          0 2017-07-21 11:51 /test/wdthrfit/_SUCCESS
-rw-r--r--   3 work supergroup        529 2017-07-21 11:51 /test/wdthrfit/_common_metadata
-rw-r--r--   3 work supergroup        680 2017-07-21 11:51 /test/wdthrfit/_metadata
-rw-r--r--   3 work supergroup        776 2017-07-21 11:51 /test/wdthrfit/part-m-00000.gz.parquet
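Besides the data file (part-m-00000.gz.parquet), the job writes _metadata and _common_metadata summary files holding footer and schema information. If you have a parquet-tools jar handy (the jar name and version below are placeholders), you can inspect the schema directly:

    hadoop jar parquet-tools-1.8.1.jar schema /test/wdthrfit/part-m-00000.gz.parquet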

Reading parquet-thrift files with MapReduce

package com.fan.hadoop.parquet.thrift;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.parquet.hadoop.thrift.ParquetThriftInputFormat;

import java.io.IOException;

/**
 * Created by http://www.fanlegefan.com on 17-7-21.
 */
public class ParquetThriftReaderMR {

    public static class ParthriftMap extends
            Mapper<Void, Person, LongWritable, Text> {

        private Text p = new Text();

        public void map(Void key, Person value, Context context)
                throws IOException, InterruptedException {
            // Turn each Person record back into a "name,age" text line
            p.set(value.getName() + "," + value.getAge());
            context.write(null, p);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(ParquetThriftReaderMR.class);
        job.setJobName("parquetthrfit");

        String in = "hdfs://localhost:9000/test/wdthrfit";
        String out = "hdfs://localhost:9000/test/wdthriftread";

        job.setMapperClass(ParthriftMap.class);
        job.setInputFormatClass(ParquetThriftInputFormat.class);
        ParquetThriftInputFormat.setInputPaths(job, new Path(in));

        // Map-only job: no reducers needed
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(out));

        job.waitForCompletion(true);
    }
}
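Both the writer and the reader are plain hadoop jar submissions; a sketch, assuming the classes above are packaged into a jar named parquet-thrift-demo.jar (hypothetical name):

    hadoop jar parquet-thrift-demo.jar com.fan.hadoop.parquet.thrift.ParquetThriftWriterMR
    hadoop jar parquet-thrift-demo.jar com.fan.hadoop.parquet.thrift.ParquetThriftReaderMR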

Check the contents of the generated text file:

hadoop dfs -cat /test/wdthriftread/part-m-00000
leo,20
jack,10
jim,30
andy,30

Reading parquet-thrift files through a Hive table

Create a Parquet-format table

create table test_parquet(name string, age int) stored as parquet;

View the table definition

hive> show create table test_parquet;
OK
CREATE TABLE `test_parquet`(
  `name` string,
  `age` int)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://localhost:9000/user/hive/warehouse/test_parquet'
TBLPROPERTIES (
  'transient_lastDdlTime'='1495038003')

Copy the files generated earlier into the Hive table's data directory:

hadoop dfs -cp /test/wdthrfit/*  /user/hive/warehouse/test_parquet/
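Copying works, but it duplicates the data. An alternative is an external table pointed directly at the MapReduce output directory; a sketch (the table name test_parquet_ext is made up for illustration):

    create external table test_parquet_ext(name string, age int)
    stored as parquet
    location 'hdfs://localhost:9000/test/wdthrfit';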

Query the data from Hive

hive> select name,age from test_parquet;
OK
leo	20
jack	10
jim	30
andy	30
Time taken: 0.049 seconds, Fetched: 4 row(s)
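From here test_parquet behaves like any other Hive table; for example, an aggregate query (illustrative only, output not from the original run):

    select count(*), avg(age) from test_parquet;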