Hadoop MapReduce: Processing Massive Numbers of Small Files with CombineFileInputFormat (Reading Each Whole Small File into the Mapper)

By default, CombineFileInputFormat feeds the combined files into the map function line by line. Here it is modified so that each record delivered to the mapper is the entire contents of one small file.

----------------------------------------------------------------------------------------------------------------------------

CombineSmallfileInputFormat.java

package combinAllFile;


import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CombineSmallfileInputFormat extends CombineFileInputFormat<LongWritable, BytesWritable> {

    @Override
    public RecordReader<LongWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {

        CombineFileSplit combineFileSplit = (CombineFileSplit) split;
        // CombineFileRecordReader creates one CombineSmallfileRecordReader per file in the combined split.
        CombineFileRecordReader<LongWritable, BytesWritable> recordReader = new CombineFileRecordReader<LongWritable, BytesWritable>(combineFileSplit, context, CombineSmallfileRecordReader.class);
        try {
            recordReader.initialize(combineFileSplit, context);
        } catch (InterruptedException e) {
            throw new RuntimeException("Error initializing CombineSmallfileRecordReader.", e);
        }
        return recordReader;
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // Never split an individual small file; each one is read as a whole.
        return false;
    }
}
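
Note (not in the original code): without a maximum split size, CombineFileInputFormat may pack all of the small files into very few splits, so the setNumMapTasks(20) hint in the driver below has little effect. If the Hadoop release in use exposes the protected setMaxSplitSize() method of CombineFileInputFormat, a cap can be set in a constructor of the class above; the 128 MB figure is just an example value. Depending on the version, the mapred.max.split.size / mapreduce.input.fileinputformat.split.maxsize property can achieve the same thing from the driver.

    // Sketch only: cap each combined split at roughly 128 MB so the small files
    // are spread over several map tasks instead of landing in a single one.
    public CombineSmallfileInputFormat() {
        super();
        setMaxSplitSize(128 * 1024 * 1024); // example value; tune to the cluster
    }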

------------------------------------------------------------------------------------------------------------------------------------------------------

The RecordReader below defines how each record is read (one whole file per record) and how the current file name is made available to the mapper.

CombineSmallfileRecordReader.java

package combinAllFile;

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class CombineSmallfileRecordReader extends RecordReader<LongWritable, BytesWritable> {

    private CombineFileSplit combineFileSplit;
    private Path[] paths;
    private int totalLength;
    private int currentIndex;                          // index of the current small file within the combined split
    private LongWritable currentKey = new LongWritable();
    private BytesWritable currentValue = new BytesWritable();
    private FileSplit fileSplit;
    private boolean finishConverting = false;
    private JobContext jobContext;

    public CombineSmallfileRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException {
        super();
        this.combineFileSplit = combineFileSplit;
        // Index of the small-file block this reader is responsible for within the CombineFileSplit.
        this.currentIndex = index;
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.combineFileSplit = (CombineFileSplit) split;
        // The combined split holds an array of files (e.g. indexes 0-9 for 10 files),
        // each entry carrying the file's full path and its length.
        fileSplit = new FileSplit(combineFileSplit.getPath(currentIndex), combineFileSplit.getOffset(currentIndex), combineFileSplit.getLength(currentIndex), combineFileSplit.getLocations());
        this.jobContext = context;

        this.paths = combineFileSplit.getPaths();
        totalLength = paths.length;
        // Expose the current file name to the mapper through the configuration.
        context.getConfiguration().set("map.input.file.name", combineFileSplit.getPath(currentIndex).getName());
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return currentKey;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return currentValue;
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // Each reader instance produces exactly one record: the whole contents of its file.
        if (currentIndex >= 0 && currentIndex < totalLength && !finishConverting) {
            currentKey.set(combineFileSplit.getOffset(currentIndex));
            currentValue = new BytesWritable();
            // NOTE: the whole file is buffered in memory, so each small file must fit in the map task's heap.
            int len = (int) fileSplit.getLength();
            byte[] content = new byte[len];
            Path file = fileSplit.getPath();
            FileSystem fs = file.getFileSystem(jobContext.getConfiguration());
            FSDataInputStream in = null;
            try {
                in = fs.open(file);
                IOUtils.readFully(in, content, 0, len);
                currentValue.set(content, 0, len);
            } finally {
                if (in != null) {
                    IOUtils.closeStream(in);
                }
            }
            finishConverting = true;
            return true;
        }
        return false;
    }

    @Override
    public float getProgress() throws IOException {
        // The single record is either not read yet (0) or fully read (1).
        return finishConverting ? 1 : 0;
    }

    @Override
    public void close() throws IOException {
        // Nothing to close: the input stream is closed in nextKeyValue().
    }
}

-----------------------------------------------------------------------------------------------------------------------

CombineSmallfileMapper.java

package combinAllFile;

import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CombineSmallfileMapper extends Mapper<LongWritable, BytesWritable, Text, BytesWritable> {

    private Text file = new Text();

    @Override
    protected void map(LongWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // File name published by CombineSmallfileRecordReader.initialize().
        String fileName = context.getConfiguration().get("map.input.file.name");
        // Convert the value from BytesWritable to a String (only needed if the contents
        // are to be emitted as Text instead of bytes; unused in this version).
        String ybString = new String(value.getBytes(), 0, value.getLength());
        file.set(fileName);
        // Emit (file name, whole file contents).
        context.write(file, value);
    }

}
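
If, as the comment in the driver below points out, the file contents should be written out as plain text rather than as raw bytes, the mapper can emit Text values instead. The variant below is only a sketch and is not part of the original post: the class name CombineSmallfileTextMapper is made up for illustration, and it assumes the small files are UTF-8 text. The driver would then use Text.class for the map output value and output value classes and drop the SequenceFileOutputFormat line.

package combinAllFile;

import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CombineSmallfileTextMapper extends Mapper<LongWritable, BytesWritable, Text, Text> {

    private Text file = new Text();
    private Text contents = new Text();

    @Override
    protected void map(LongWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        String fileName = context.getConfiguration().get("map.input.file.name");
        // getLength() must be used because getBytes() may return a padded buffer.
        String text = new String(value.getBytes(), 0, value.getLength(), "UTF-8");
        file.set(fileName);
        contents.set(text);
        // Emit (file name, whole file contents as text).
        context.write(file, contents);
    }
}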

-----------------------------------------------------------------------------------------

CombineSmallfiles.java (the job driver / program entry point)

package combinAllFile;

import java.io.IOException;
import java.util.Calendar;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class CombineSmallfiles {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        JobConf conf = new JobConf();
        long start = System.currentTimeMillis();
        // Input and output paths are hard-coded here; the output directory gets a
        // timestamp suffix so that repeated runs do not collide.
        args = new String[2];
        args[0] = "hdfs://master:9000/newdata/YB2";
        args[1] = "hdfs://master:9000/NoDelete1/tes-" + Calendar.getInstance().getTimeInMillis();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: combinesmallfiles <in> <out>");
            System.exit(2);
        }

        conf.set("fs.default.name", "hdfs://master:9000");
        conf.set("mapred.job.tracker", "master:9001");
        // Only a hint: with CombineFileInputFormat the actual number of map tasks
        // is determined by how the small files are grouped into splits.
        conf.setNumMapTasks(20);

        Job job = new Job(conf, "combine smallfiles");
        job.setJarByClass(CombineSmallfiles.class);
        job.setMapperClass(CombineSmallfileMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        job.setInputFormatClass(CombineSmallfileInputFormat.class);
        // To write the values as Text instead, remove this line and change the value
        // type in the mapper accordingly (see the Text variant sketch above).
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        boolean success = job.waitForCompletion(true);
        long end = System.currentTimeMillis();
        System.out.println("Job finished in " + (end - start) + " ms");
        System.exit(success ? 0 : 1);
    }

}
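
To spot-check the job output, the SequenceFile it produces can be read back record by record; each record is (file name, whole file contents). The standalone reader below is a sketch rather than part of the original post: the class name ReadCombinedOutput and the idea of passing a part file path (e.g. .../part-r-00000 under the output directory) as args[0] are made up for illustration. It uses the older SequenceFile.Reader(FileSystem, Path, Configuration) constructor to stay at the same API level as the job code.

package combinAllFile;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ReadCombinedOutput {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://master:9000");
        // args[0]: a part file under the job's output directory
        Path seqFile = new Path(args[0]);
        SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(conf), seqFile, conf);
        try {
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            while (reader.next(key, value)) {
                // key = original small file name, value = its complete contents
                System.out.println(key + " : " + value.getLength() + " bytes");
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}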

 
