读取RcFile文件
来源:互联网 发布:windows apache 启动 编辑:程序博客网 时间:2024/05/16 06:59
读取RCFile文件时,需要自行实现InputFormat和RecordReader这两个类;编译和运行时需要依赖hive-exec-*.jar,主要用到其中的org.apache.hadoop.hive.ql.io.RCFile.*。在作业中可以通过如下方式指定输入格式:MultipleInputs.addInputPath(job, input, RCFileInputFormat.class)。
下面给出InputFormat和RecordReader的实现示例,以及一个读取RCFile文件并以Tab字符作为列分隔符输出的示例。
RCFileInputFormat.java
import java.io.IOException;import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.mapreduce.TaskAttemptContext;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;/** * RCFileInputFormat. * * @param <K> * @param <V> */public class RCFileInputFormat<K extends LongWritable, V extends BytesRefArrayWritable> extends FileInputFormat<K, V> {public RCFileInputFormat() {}@SuppressWarnings({ "unchecked", "rawtypes" })@Overridepublic org.apache.hadoop.mapreduce.RecordReader<K, V> createRecordReader(org.apache.hadoop.mapreduce.InputSplit arg0, TaskAttemptContext arg1)throws IOException, InterruptedException { return new RCFileRecordReader();}}
import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.hive.ql.io.RCFile;import org.apache.hadoop.hive.ql.io.RCFile.Reader;import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.mapreduce.InputSplit;import org.apache.hadoop.mapreduce.RecordReader;import org.apache.hadoop.mapreduce.TaskAttemptContext;import org.apache.hadoop.mapreduce.lib.input.FileSplit;/** * RCFileRecordReader. * * @param <K> * @param <V> */public class RCFileRecordReader<K extends LongWritable, V extends BytesRefArrayWritable>extends RecordReader<LongWritable, BytesRefArrayWritable> {private Reader in;private long start;private long end;private boolean more = true;private LongWritable key = null;private BytesRefArrayWritable value = null;protected Configuration conf;/** * * Return the progress within the input split. * * * @return 0.0 to 1.0 of * the input byte range */public float getProgress() throws IOException {if (end == start) {return 0.0f;} else {return Math.min(1.0f, (in.getPosition() - start)/ (float) (end - start));}}public void close() throws IOException {in.close();}@Overridepublic LongWritable getCurrentKey() throws IOException,InterruptedException {return key;}@Overridepublic BytesRefArrayWritable getCurrentValue() throws IOException,InterruptedException {return value;}@Overridepublic void initialize(InputSplit split, TaskAttemptContext context)throws IOException, InterruptedException {FileSplit fileSplit = (FileSplit) split;conf = context.getConfiguration();Path path = fileSplit.getPath();FileSystem fs = path.getFileSystem(conf);this.in = new RCFile.Reader(fs, path, conf);this.end = fileSplit.getStart() + fileSplit.getLength();if (fileSplit.getStart() > in.getPosition()) {in.sync(fileSplit.getStart()); // sync to start}this.start = in.getPosition();more = start < end;}@Overridepublic boolean 
nextKeyValue() throws IOException, InterruptedException {if (!more) {return false;}if (key == null) {key = new LongWritable();}if (value == null) {value = new BytesRefArrayWritable();}more = in.next(key);if (!more) {return false;}long lastSeenSyncPos = in.lastSeenSyncPos();if (lastSeenSyncPos >= end) {more = false;return more;}in.getCurrentRow(value);return more;}}
以下是一个读取rcfile的示例:
RcFileMapper.java
import java.io.IOException;import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.JobID;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.TaskAttemptID;public class RcFileMapper extends Mapper<Object, BytesRefArrayWritable, Text, NullWritable>{long linecount=0;String fileName = "";JobID jobID;TaskAttemptID taskID; long length = 0;@Overrideprotected void map(Object key, BytesRefArrayWritable value,Context context)throws IOException, InterruptedException { Text txt = new Text(); //因为RcFile行存储和列存储,所以每次进来的一行数据,Value是个列簇,遍历,输出。 StringBuffer sb = new StringBuffer(); for (int i = 0; i < value.size(); i++) { BytesRefWritable v = value.get(i); txt.set(v.getData(), v.getStart(), v.getLength()); if(i == value.size()-1){ sb.append(txt.toString()); }else{ sb.append(txt.toString()+"\t"); } } context.write(new Text(sb.toString()),NullWritable.get());}@Overrideprotected void cleanup(Context context) throws IOException,InterruptedException {super.cleanup(context);//context.write(new Text(String.valueOf(fileName)), new Text(String.valueOf(length)));//System.out.println(linecount);}@Overrideprotected void setup(Context context) throws IOException,InterruptedException {super.setup(context);}}
RcFileReduce.java
import java.io.IOException;import java.util.ArrayList;import java.util.Iterator;import java.util.List;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.Reducer.Context;public class RcFileReduce extends Reducer<Text, NullWritable, Text, NullWritable>{private IntWritable result = new IntWritable();@Overrideprotected void reduce(Text key, Iterable<NullWritable> values,Context context)throws IOException, InterruptedException {context.write(key, NullWritable.get());/*int sum = 0;for (IntWritable val : values) {sum += val.get();}result.set(sum);context.write(key, result);*/}}
RcFileDriver.java
import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;import com.autonavi.dxp.util.RCFileInputFormat;public class RcFileDriver {public static boolean runLoadMapReducue(Configuration conf, Path input, Path output) throws IOException, ClassNotFoundException, InterruptedException {Job job = new Job(conf, "RcFile log");job.setJarByClass(RcFileDriver.class);job.setJobName("rcFile");job.setNumReduceTasks(1);job.setMapperClass(RcFileMapper.class);job.setReducerClass(RcFileReduce.class); MultipleInputs.addInputPath(job, input, RCFileInputFormat.class); job.setOutputKeyClass(Text.class);job.setOutputValueClass(NullWritable.class);FileOutputFormat.setOutputPath(job, output);return job.waitForCompletion(true);}public static void main(String[] args) throws Exception {Configuration conf = new Configuration();conf.set("mapred.job.queue.name","base");//需要将jar上传到hdfs,这个jar可以在hadoop或者hive的lib中找到,一般cdh肯定会有的。conf.set("tmpjars","hive-exec-0.10.0.jar,hive-serde-0.10.0-cdh4.2.0.jar");if (args.length != 2) {System.err.println("Usage: rcfile <in> <out>");System.exit(2);}RcFileDriver.runLoadMapReducue(conf, new Path(args[0]), new Path(args[1])); }}
0 0
- 读取RcFile文件
- 在pig中读取RCFILE文件
- rcfile 读取数据
- rcfile
- RcFile
- MapReduce读/写RCFile文件
- Hive-RCFile文件存储格式
- RCFile的文件格式及创建与读取
- Mapreduce RCFile写入和读取API示例
- Mapreduce RCFile写入和读取API示例
- MapReduce产生RCFile文件在HDFS,HIve将RCFile文件加载到hive的表中
- RCFile文件格式
- HIve的rcfile文件存储格式的介绍以及如何将HIve的textfile文件存储格式转化为rcfile文件格式
- 文件读取(读取图片)
- 读取文件
- 读取文件
- 文件读取
- 文件读取
- 字符,字节和编码
- 写在第一次离职
- TI-AM1808_LINUX开发笔记
- 敏感词库 包含中英文
- Android_相对布局(中)第一季重制版
- 读取RcFile文件
- Log4D for Delphi XE7
- 解决UITableView分割线距左边有距离的办法
- 泛型理解之元组类库
- picasso框架的使用
- ubuntu下python2.7 安装 xlrd、xlwt、pip、 第三方库(library)的简易方法、pip 安装 selenium
- 《大话数据结构》笔记
- 新立得软件下载安装包
- Android服务之Service(其一)