MapReduce 读取ORC格式文件
来源:互联网 发布:怎么登录熊片数据库 编辑:程序博客网 时间:2024/05/19 17:04
1、创建orc格式hive表:
create table test_orc(name string,age int) stored as orc
2、查看表结构:
show create table test_orcCREATE TABLE `test_orc`( `name` string, `age` int)ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'LOCATION 'hdfs://localhost:9000/user/work/warehouse/test_orc'TBLPROPERTIES ( 'transient_lastDdlTime'='1502868725')
3、插入测试数据:
insert into table test_orc select name ,age from test limit 10;
4、读取mr:
1)pom.xml:
<dependency> <groupId>org.apache.orc</groupId> <artifactId>orc-core</artifactId> <version>1.2.3</version></dependency><dependency> <groupId>org.apache.orc</groupId> <artifactId>orc-mapreduce</artifactId> <version>1.1.0</version></dependency><dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-mapreduce-client-core</artifactId> <version>2.6.0</version></dependency>2)代码:
package com.fan.hadoop.orc;import com.fan.hadoop.parquet.thrift.ParquetThriftWriterMR;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.orc.mapred.OrcStruct;import org.apache.orc.mapreduce.OrcInputFormat;import java.io.IOException;public class OrcReaderMR { public static class OrcMap extends Mapper<NullWritable,OrcStruct,Text,IntWritable> { // Assume the ORC file has type: struct<s:string,i:int> public void map(NullWritable key, OrcStruct value, Context output) throws IOException, InterruptedException { // take the first field as the key and the second field as the value output.write((Text) value.getFieldValue(0), (IntWritable) value.getFieldValue(1)); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf); job.setJarByClass(ParquetThriftWriterMR.class); job.setJobName("parquetthrfit"); String in = "hdfs://localhost:9000/user/work/warehouse/test_orc"; String out = "hdfs://localhost:9000/test/orc"; job.setMapperClass(OrcMap.class); OrcInputFormat.addInputPath(job, new Path(in)); job.setInputFormatClass(OrcInputFormat.class); job.setNumReduceTasks(0); job.setOutputFormatClass(TextOutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(out)); job.waitForCompletion(true); }}3)查看生成的文件:
hadoop dfs -cat /test/orc/part-m-00000
kafka 14
tensflow 98
hadoop 34
hbase 68
flume 57
kafka 99
kafka 28
flume 24
tensflow 35
flume 44
5、mr写orc文件:
1)代码:
package com.fan.hadoop.orc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.orc.OrcConf;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcOutputFormat;

import java.io.IOException;

/**
 * Map-only job that reads tab-separated text lines (name TAB age)
 * and writes them out as an ORC file with schema struct&lt;name:string,age:int&gt;.
 */
public class OrcWriterMR {

    public static class OrcWriterMapper extends Mapper<LongWritable, Text, NullWritable, OrcStruct> {

        // Output row schema; must match OrcConf.MAPRED_OUTPUT_SCHEMA set in main().
        private TypeDescription schema =
                TypeDescription.fromString("struct<name:string,age:int>");
        // Reusable row and field objects to avoid per-record allocation.
        private OrcStruct pair = (OrcStruct) OrcStruct.createValue(schema);
        private final NullWritable nada = NullWritable.get();
        private Text name = new Text();
        private IntWritable age = new IntWritable();

        @Override
        public void map(LongWritable key, Text value, Context output)
                throws IOException, InterruptedException {
            String line = value.toString();
            if (line.isEmpty()) {
                return;
            }
            String[] arr = line.split("\t");
            // Fixed: guard against lines without a tab or with a non-numeric
            // age; the original indexed arr[1] unconditionally and would fail
            // the whole task on the first malformed input line.
            if (arr.length < 2) {
                output.getCounter("OrcWriterMR", "MALFORMED_LINES").increment(1);
                return;
            }
            try {
                age.set(Integer.parseInt(arr[1].trim()));
            } catch (NumberFormatException e) {
                output.getCounter("OrcWriterMR", "MALFORMED_LINES").increment(1);
                return;
            }
            name.set(arr[0]);
            pair.setFieldValue(0, name);
            pair.setFieldValue(1, age);
            output.write(nada, pair);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // OrcOutputFormat reads the row schema from the job configuration.
        OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, "struct<name:string,age:int>");

        Job job = Job.getInstance(conf);
        job.setJarByClass(OrcWriterMR.class);
        job.setJobName("OrcWriterMR");

        String in = "hdfs://localhost:9000/user/work/warehouse/test/ddd.txt";
        String out = "hdfs://localhost:9000/test/orc2";

        job.setMapperClass(OrcWriterMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        // Map-only job: mapper output is written directly as ORC.
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(OrcOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(in));
        OrcOutputFormat.setOutputPath(job, new Path(out));

        job.waitForCompletion(true);
    }
}
2)查看:
#### 生成orc文件 hadoop dfs -ls /test/orc2-rw-r--r-- 3 work supergroup 0 2017-08-16 17:45 /test/orc2/_SUCCESS-rw-r--r-- 3 work supergroup 6314874 2017-08-16 17:45 /test/orc2/part-m-00000.orc3)导入到hive:
hadoop fs -cp /test/orc2/part-m-00000.orc /user/work/warehouse/test_orc/hive> select * from test_orc limit 13;OKkafka 14tensflow 98hadoop 34hbase 68flume 57kafka 99kafka 28flume 24tensflow 35flume 44flume 44tensflow 35flume 24Time taken: 0.045 seconds, Fetched: 13 row(s)
阅读全文
0 0
- MapReduce 读取ORC格式文件
- ORC格式文件读取
- MapReduce读写orc文件
- MapReduce输出压缩格式文件
- Hive ORC数据格式的MapReduce读写
- Hive ORC数据格式的MapReduce Shuffle
- orc
- Java API 读取Hive Orc文件
- presto源码分析(hive orc读取)
- presto对orc文件的读取
- Java API 读取Hive Orc文件
- PSD格式文件的读取
- PSD格式文件的读取
- 读取DXF格式文件
- 读取shapefile格式文件
- 读取shapefile格式文件
- 如何读取DXF格式文件?
- 读取DXF格式文件
- 简单的SpringBoot工程搭建
- python中defaultdict方法的使用
- HTML标签marquee实现滚动效果
- UDP 实例
- BZOJ3827: [Poi2014]Around the world
- MapReduce 读取ORC格式文件
- xss安全漏洞分析以及项目实施解决方案
- 7.5
- Java类加载机制与Tomcat类加载器架构
- NoSQLUnit Core
- 指针,数组指针,指针数组的剖析
- jvm四:jvm内存说明
- 1035. 插入与归并(25)---Python
- JVM 类加载机制