Hadoop: Handling Large Numbers of Small Files with a CombineFileInputFormat Implementation (Hadoop-1.0.4)


First, use the Configuration to cap combined splits at 64 MB (one HDFS block):

Configuration conf = new Configuration();
conf.setLong(MyCombineFileInputFormat.SPLIT_MINSIZE_PERNODE, 64 * 1024 * 1024);
conf.setLong(MyCombineFileInputFormat.SPLIT_MINSIZE_PERRACK, 64 * 1024 * 1024);
conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 64 * 1024 * 1024);
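
For context, here is a minimal driver sketch showing where these settings plug into a job. The SmallFilesJob and MyMapper names are illustrative only and not from the original post (MyMapper is sketched after the input-format code below); the rest is standard Hadoop 1.x new-API job wiring:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver: wires MyCombineFileInputFormat and the 64 MB split
// settings into a job. Class and mapper names are illustrative.
public class SmallFilesJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.setLong(MyCombineFileInputFormat.SPLIT_MINSIZE_PERNODE, 64 * 1024 * 1024);
        conf.setLong(MyCombineFileInputFormat.SPLIT_MINSIZE_PERRACK, 64 * 1024 * 1024);
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 64 * 1024 * 1024);

        Job job = new Job(conf, "combine-small-files");
        job.setJarByClass(SmallFilesJob.class);
        // Pack many small files into ~64 MB CombineFileSplits.
        job.setInputFormatClass(MyCombineFileInputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}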

The concrete CombineFileInputFormat implementation (the internal RecordReader is modeled on TextInputFormat's LineRecordReader):

The custom key is the InputSplitFile class, with two fields, offset and fileName; it serves as the map input key.

package com.****.hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.util.LineReader;

public class MyCombineFileInputFormat
        extends CombineFileInputFormat<InputSplitFile, Text> {

    @Override
    public RecordReader<InputSplitFile, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException {
        // CombineFileRecordReader creates one MyCombineFileRecordReader per
        // file chunk packed into the CombineFileSplit.
        return new CombineFileRecordReader<InputSplitFile, Text>(
                (CombineFileSplit) split, context, MyCombineFileRecordReader.class);
    }
}

class MyCombineFileRecordReader extends RecordReader<InputSplitFile, Text> {

    private static final Log LOG = LogFactory.getLog(MyCombineFileRecordReader.class);

    private CompressionCodecFactory compressionCodecs = null;
    private long start;
    private long pos;
    private long end;
    private Path path;
    private LineReader in;
    private int maxLineLength;
    private InputSplitFile key = null;
    private Text value = null;

    // CombineFileRecordReader requires exactly this (split, context, index)
    // constructor; index selects which file chunk of the split to read.
    public MyCombineFileRecordReader(CombineFileSplit split,
            TaskAttemptContext context, Integer index) throws IOException {
        Configuration job = context.getConfiguration();
        this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
                Integer.MAX_VALUE);
        this.path = split.getPath(index);
        this.start = split.getOffset(index);
        this.end = start + split.getLength(index);

        compressionCodecs = new CompressionCodecFactory(job);
        final CompressionCodec codec = compressionCodecs.getCodec(path);

        boolean skipFirstLine = false;
        FileSystem fs = path.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(path);
        if (codec != null) {
            // Compressed input is not splittable: read the whole stream.
            in = new LineReader(codec.createInputStream(fileIn), job);
            end = Long.MAX_VALUE;
        } else {
            if (start != 0) {
                // Not at the start of the file: back up one byte so the
                // partial first line can be skipped (it belongs to the
                // previous chunk's reader).
                skipFirstLine = true;
                --start;
                fileIn.seek(start);
            }
            in = new LineReader(fileIn, job);
        }
        if (skipFirstLine) {
            // Skip the first line and re-establish "start".
            start += in.readLine(new Text(), 0,
                    (int) Math.min((long) Integer.MAX_VALUE, end - start));
        }
        this.pos = start;
    }

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // Intentionally empty: CombineFileRecordReader drives per-chunk
        // readers through the constructor above.
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new InputSplitFile();
            key.setFileName(path.getName());
        }
        key.setOffset(pos);
        if (value == null) {
            value = new Text();
        }
        int newSize = 0;
        while (pos < end) {
            newSize = in.readLine(value, maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, end - pos),
                            maxLineLength));
            if (newSize == 0) {
                break;
            }
            pos += newSize;
            if (newSize < maxLineLength) {
                break;
            }
            // Line too long: skip it and try again.
            LOG.info("Skipped line of size " + newSize + " at pos "
                    + (pos - newSize));
        }
        if (newSize == 0) {
            key = null;
            value = null;
            return false;
        }
        return true;
    }

    @Override
    public InputSplitFile getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (start == end) {
            return 0.0f;
        }
        return Math.min(1.0f, (pos - start) / (float) (end - start));
    }

    @Override
    public void close() throws IOException {
        if (in != null) {
            in.close();
        }
    }
}

class InputSplitFile implements WritableComparable<InputSplitFile> {

    private long offset;
    private String fileName;

    public long getOffset() {
        return offset;
    }

    public void setOffset(long offset) {
        this.offset = offset;
    }

    public String getFileName() {
        return fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public void readFields(DataInput in) throws IOException {
        this.offset = in.readLong();
        this.fileName = Text.readString(in);
    }

    public void write(DataOutput out) throws IOException {
        out.writeLong(offset);
        Text.writeString(out, fileName);
    }

    public int compareTo(InputSplitFile that) {
        int f = this.fileName.compareTo(that.fileName);
        if (f == 0) {
            // Compare directly rather than subtracting, to avoid long overflow.
            return this.offset < that.offset ? -1
                    : (this.offset == that.offset ? 0 : 1);
        }
        return f;
    }

    @Override
    public boolean equals(Object obj) {
        // Must override Object.equals: the original overloaded
        // equals(InputSplitFile), which hash-based collections never call.
        if (obj instanceof InputSplitFile) {
            return this.compareTo((InputSplitFile) obj) == 0;
        }
        return false;
    }

    @Override
    public int hashCode() {
        // Consistent with equals: combine both fields.
        return 31 * fileName.hashCode() + (int) (offset ^ (offset >>> 32));
    }
}
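
To show how the (InputSplitFile, Text) pairs are consumed, here is a minimal mapper sketch. MyMapper is a hypothetical name, not part of the original post; it simply counts lines per source file using the file name carried in the key:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper: emits (file name, 1) for every line, so the reduce
// side can sum line counts per original small file.
public class MyMapper extends Mapper<InputSplitFile, Text, Text, LongWritable> {
    private static final LongWritable ONE = new LongWritable(1);
    private final Text outKey = new Text();

    @Override
    protected void map(InputSplitFile key, Text value, Context context)
            throws IOException, InterruptedException {
        // key.getFileName()/getOffset() identify where this line came from,
        // even though many small files share one map task.
        outKey.set(key.getFileName());
        context.write(outKey, ONE);
    }
}

Because many small files now share one CombineFileSplit, each map task reads several files in sequence, which cuts task-startup overhead; the key's fileName/offset pair is what preserves per-file identity inside the merged split.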