MapReduce 中的 RCFile 输入:RCFileInputFormat 的实现及其应用
来源:互联网 发布:淘宝客手机网站搭建 编辑:程序博客网 时间:2024/05/18 14:14
import java.io.IOException;import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.mapreduce.TaskAttemptContext;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;/** * RCFileInputFormat. * * @param <K> * @param <V> */public class RCFileInputFormat<K extends LongWritable, V extends BytesRefArrayWritable> extends FileInputFormat<K, V> { public RCFileInputFormat() { }@SuppressWarnings("unchecked")@Overridepublic org.apache.hadoop.mapreduce.RecordReader<K, V> createRecordReader(org.apache.hadoop.mapreduce.InputSplit arg0, TaskAttemptContext arg1)throws IOException, InterruptedException { return new RCFileRecordReader();}}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.hive.ql.io.RCFile;import org.apache.hadoop.hive.ql.io.RCFile.Reader;import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.mapreduce.InputSplit;import org.apache.hadoop.mapreduce.RecordReader;import org.apache.hadoop.mapreduce.TaskAttemptContext;import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/** * RCFileRecordReader. * * @param <K> * @param <V> */public class RCFileRecordReader<K extends LongWritable, V extends BytesRefArrayWritable> extends RecordReader<LongWritable, BytesRefArrayWritable> {
private Reader in; private long start; private long end; private boolean more = true; private LongWritable key = null; private BytesRefArrayWritable value = null; protected Configuration conf;
/** * Return the progress within the input split. * * @return 0.0 to 1.0 of the input byte range */ public float getProgress() throws IOException { if (end == start) { return 0.0f; } else { return Math.min(1.0f, (in.getPosition() - start) / (float) (end - start)); } }
public void close() throws IOException { in.close(); }
@Override public LongWritable getCurrentKey() throws IOException, InterruptedException {
return key; }
@Override public BytesRefArrayWritable getCurrentValue() throws IOException, InterruptedException {
return value; }
@Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit fileSplit = (FileSplit) split; conf = context.getConfiguration(); Path path = fileSplit.getPath(); FileSystem fs = path.getFileSystem(conf); this.in = new RCFile.Reader(fs, path, conf); this.end = fileSplit.getStart() + fileSplit.getLength();
if (fileSplit.getStart() > in.getPosition()) { in.sync(fileSplit.getStart()); // sync to start }
this.start = in.getPosition(); more = start < end; }
@Override public boolean nextKeyValue() throws IOException, InterruptedException { if (!more) { return false; } if (key == null) { key = new LongWritable(); } if (value == null) { value = new BytesRefArrayWritable(); } more = in.next(key); if (!more) { return false; } long lastSeenSyncPos = in.lastSeenSyncPos(); if (lastSeenSyncPos >= end) { more = false; return more; } in.getCurrentRow(value); return more; }}
应用方式(在作业中设置输入格式,并在 Mapper 中按列下标读取字段):
job.setInputFormatClass(RCFileInputFormat.class);
public static class Map extends Mapper<LongWritable, BytesRefArrayWritable, Text, NullWritable> {
@Override
protected void map(LongWritable key, BytesRefArrayWritable value, Context context) throws IOException, InterruptedException {
String top = new String(value.get(32).getBytesCopy());
byte[] channel = value.get(12).getBytesCopy();
......
- Mapreduce中的RCFile输入RCFileInputFormat实现及其应用
- Mapreduce中的RCFile输出RCFileOutputFormat实现及其应用
- Mapreduce中的RCFile输出RCFileOutputFormat实现及其应用
- RCFile 简介及其应用
- 基于 Hive 的文件格式:RCFile 简介及其应用
- 基于Hive 的文件格式:RCFile 简介及其应用
- 基于 Hive 的文件格式:RCFile 简介及其应用
- PageRank及其MapReduce实现
- MapReduce读/写RCFile文件
- Hive中的RCFile
- mapreduce来清洗数据生成RCFile
- Mapreduce RCFile写入和读取API示例
- Mapreduce RCFile写入和读取API示例
- Hadoop MapReduce 二次排序原理及其应用
- Hadoop MapReduce 二次排序原理及其应用
- Hadoop MapReduce 二次排序原理及其应用
- 排序算法及其在MapReduce的应用
- 排序算法及其在MapReduce的应用
- 俞敏洪说
- 【PB】指定某行某列的tooltip属性值uf_setcell_tooltip
- 隐式类型转换(C++学习)
- 编译OSG及OsgOcean
- 《IT小小鸟》笔记
- Mapreduce中的RCFile输入RCFileInputFormat实现及其应用
- fork and signal
- 龙之谷跟背包有关的代码
- 【PB】指定某行某列的颜色属性值uf_setcell_color
- 初步撸a
- C语言调用方式
- 农业银行 CIE7 : 指定的两码不存在?
- 防止.NET木马列所有站物理路径,aspxspy木马简单防范方法 防止.JPG类型木马
- 【PB】颜色转换公式