Reading RCFile files


To read an RCFile in a MapReduce job you need to supply your own InputFormat and RecordReader implementations. At run time the job depends on hive-exec-*.jar, mainly for the classes under org.apache.hadoop.hive.ql.io.RCFile.*. The custom input format is then registered with: MultipleInputs.addInputPath(job, input, RCFileInputFormat.class).
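
Before the MapReduce wiring, it may help to see the underlying API on its own. The following is a minimal standalone sketch, not from the original post: RCFileDump is a hypothetical class name and the argument handling is illustrative, but it uses the same RCFile.Reader calls (next, getCurrentRow, close) that the RecordReader below relies on to dump an RCFile to stdout with tab-separated columns.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class RCFileDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);               // path to an RCFile (illustrative)
        FileSystem fs = path.getFileSystem(conf);
        RCFile.Reader reader = new RCFile.Reader(fs, path, conf);
        LongWritable rowId = new LongWritable();
        BytesRefArrayWritable row = new BytesRefArrayWritable();
        Text column = new Text();
        while (reader.next(rowId)) {                 // advance to the next row
            reader.getCurrentRow(row);               // materialize the row's columns
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < row.size(); i++) {
                BytesRefWritable ref = row.get(i);
                column.set(ref.getData(), ref.getStart(), ref.getLength());
                if (i > 0) {
                    sb.append('\t');
                }
                sb.append(column.toString());
            }
            System.out.println(sb);
        }
        reader.close();
    }
}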

Below are the custom InputFormat and RecordReader implementations, followed by an example that reads an RCFile and joins the columns of each row with tab separators.


RCFileInputFormat.java

import java.io.IOException;

import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

/**
 * RCFileInputFormat.
 *
 * @param <K>
 * @param <V>
 */
public class RCFileInputFormat<K extends LongWritable, V extends BytesRefArrayWritable>
        extends FileInputFormat<K, V> {

    public RCFileInputFormat() {
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    @Override
    public org.apache.hadoop.mapreduce.RecordReader<K, V> createRecordReader(
            org.apache.hadoop.mapreduce.InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new RCFileRecordReader();
    }
}


RCFileRecordReader.java

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.ql.io.RCFile.Reader;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * RCFileRecordReader.
 *
 * @param <K>
 * @param <V>
 */
public class RCFileRecordReader<K extends LongWritable, V extends BytesRefArrayWritable>
        extends RecordReader<LongWritable, BytesRefArrayWritable> {

    private Reader in;
    private long start;
    private long end;
    private boolean more = true;
    private LongWritable key = null;
    private BytesRefArrayWritable value = null;
    protected Configuration conf;

    /**
     * Return the progress within the input split.
     *
     * @return 0.0 to 1.0 of the input byte range
     */
    @Override
    public float getProgress() throws IOException {
        if (end == start) {
            return 0.0f;
        } else {
            return Math.min(1.0f, (in.getPosition() - start) / (float) (end - start));
        }
    }

    @Override
    public void close() throws IOException {
        in.close();
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public BytesRefArrayWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit fileSplit = (FileSplit) split;
        conf = context.getConfiguration();
        Path path = fileSplit.getPath();
        FileSystem fs = path.getFileSystem(conf);
        this.in = new RCFile.Reader(fs, path, conf);
        this.end = fileSplit.getStart() + fileSplit.getLength();
        if (fileSplit.getStart() > in.getPosition()) {
            in.sync(fileSplit.getStart()); // seek to the first sync marker at or after the split start
        }
        this.start = in.getPosition();
        more = start < end;
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!more) {
            return false;
        }
        if (key == null) {
            key = new LongWritable();
        }
        if (value == null) {
            value = new BytesRefArrayWritable();
        }
        more = in.next(key);
        if (!more) {
            return false;
        }
        long lastSeenSyncPos = in.lastSeenSyncPos();
        if (lastSeenSyncPos >= end) {
            // this row belongs to the next split; stop reading here
            more = false;
            return more;
        }
        in.getCurrentRow(value);
        return more;
    }
}

The following is an example that reads an RCFile:


RcFileMapper.java

import java.io.IOException;

import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class RcFileMapper extends Mapper<Object, BytesRefArrayWritable, Text, NullWritable> {

    @Override
    protected void map(Object key, BytesRefArrayWritable value, Context context)
            throws IOException, InterruptedException {
        // RCFile stores each row as a group of columns, so each incoming value is a column
        // group: iterate over the columns and join them with tab separators before emitting.
        Text txt = new Text();
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < value.size(); i++) {
            BytesRefWritable v = value.get(i);
            txt.set(v.getData(), v.getStart(), v.getLength());
            if (i == value.size() - 1) {
                sb.append(txt.toString());
            } else {
                sb.append(txt.toString() + "\t");
            }
        }
        context.write(new Text(sb.toString()), NullWritable.get());
    }
}

RcFileReduce.java

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class RcFileReduce extends Reducer<Text, NullWritable, Text, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // emit each distinct line once
        context.write(key, NullWritable.get());
    }
}

RcFileDriver.java

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.autonavi.dxp.util.RCFileInputFormat;

public class RcFileDriver {

    public static boolean runLoadMapReduce(Configuration conf, Path input, Path output)
            throws IOException, ClassNotFoundException, InterruptedException {
        Job job = new Job(conf, "RcFile log");
        job.setJarByClass(RcFileDriver.class);
        job.setJobName("rcFile");
        job.setNumReduceTasks(1);
        job.setMapperClass(RcFileMapper.class);
        job.setReducerClass(RcFileReduce.class);
        // register the custom RCFileInputFormat for the input path
        MultipleInputs.addInputPath(job, input, RCFileInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileOutputFormat.setOutputPath(job, output);
        return job.waitForCompletion(true);
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("mapred.job.queue.name", "base");
        // These jars must be uploaded to HDFS first; they can be found under the Hadoop or
        // Hive lib directory (CDH distributions always bundle them).
        conf.set("tmpjars", "hive-exec-0.10.0.jar,hive-serde-0.10.0-cdh4.2.0.jar");
        if (args.length != 2) {
            System.err.println("Usage: rcfile <in> <out>");
            System.exit(2);
        }
        RcFileDriver.runLoadMapReduce(conf, new Path(args[0]), new Path(args[1]));
    }
}
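
A short usage note: the driver takes the RCFile input directory and an output directory as its two arguments, and the tmpjars property expects paths the job client can resolve, which is why the comment in main assumes the Hive jars have already been uploaded to HDFS. The client machine itself also needs hive-exec and hive-serde on its classpath in order to compile and launch the job.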

