Hadoop MapReduce: processing a huge number of small files (each map reads one whole small file), based on FileInputFormat

With FileInputFormat, each split contains exactly one small file, so the number of splits equals the number of files. This produces far too many splits (and map tasks), and reading is slower than with the CombineFileInputFormat approach.

--------------------------------------------------------------------------------------------------------

The code in WholeFileInputFormat.java:

package AllFileInput;

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// An InputFormat whose RecordReader reads an entire file as a single
// (NullWritable, BytesWritable) record.
public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        RecordReader<NullWritable, BytesWritable> recordReader = new WholeFileRecordReader();
        recordReader.initialize(split, context);
        return recordReader;
    }
}
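
Note: FileInputFormat's default isSplitable() returns true, so a file larger than one HDFS block could still be split and only partially read by the record reader below. As a safeguard (not part of the original code), the class above could additionally override isSplitable() to force one split per file, roughly as follows:

// Additional method for WholeFileInputFormat; requires importing
// org.apache.hadoop.fs.Path and org.apache.hadoop.mapreduce.JobContext.
// Never split a file, so each file always maps to exactly one split.
@Override
protected boolean isSplitable(JobContext context, Path file) {
    return false;
}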

----------------------------------------------------------------------------------------------

WholeFileRecordReader.java

For small files, each file corresponds to one InputSplit. Reading that InputSplit means reading the content of the entire file (which occupies a single block); the whole content is loaded into a BytesWritable, i.e. a byte array.

package AllFileInput;

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Reads the whole file backing an InputSplit as a single record:
// key = NullWritable, value = the file content in a BytesWritable.
public class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private FileSplit fileSplit;
    private JobContext jobContext;
    private NullWritable currentKey = NullWritable.get();
    private BytesWritable currentValue;
    // true once the single record has been produced
    private boolean finishConverting = false;

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return currentKey;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return currentValue;
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) split;
        this.jobContext = context;
        // expose the current file name to the mapper through the configuration
        context.getConfiguration().set("map.input.file", fileSplit.getPath().getName());
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!finishConverting) {
            currentValue = new BytesWritable();
            // the file is small, so its whole content fits into one byte array
            int len = (int) fileSplit.getLength();
            byte[] content = new byte[len];
            Path file = fileSplit.getPath();
            FileSystem fs = file.getFileSystem(jobContext.getConfiguration());
            FSDataInputStream in = null;
            try {
                in = fs.open(file);
                IOUtils.readFully(in, content, 0, len);
                currentValue.set(content, 0, len);
            } finally {
                if (in != null) {
                    IOUtils.closeStream(in);
                }
            }
            finishConverting = true;
            return true;
        }
        return false;
    }

    @Override
    public float getProgress() throws IOException {
        // either nothing or everything has been read
        return finishConverting ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // nothing to close: the input stream is closed in nextKeyValue()
    }
}

---------------------------------------------------------------------------------------------------------------

The mapper, in WholeSmallfilesMapper.java:

package AllFileInput;

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import bill323.core.Compare;

public class WholeSmallfilesMapper extends Mapper<NullWritable, BytesWritable, Text, Text> {

    private Text file = new Text();
    private static Compare cmp = new Compare();

    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // file name put into the configuration by WholeFileRecordReader.initialize()
        String fileName = context.getConfiguration().get("map.input.file");
        file.set(fileName);

        // convert the whole file content (BytesWritable) into a String;
        // getLength() is used because getBytes() may return a padded buffer
        String ybString = new String(value.getBytes(), 0, value.getLength());
        context.write(file, new Text(ybString));
    }
}
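
Note that the driver below never calls job.setReducerClass(), so Hadoop's default identity Reducer simply forwards each (file name, content) pair to the output. For illustration only, an explicit reducer doing the same thing would look roughly like this (a minimal sketch; the class name WholeSmallfilesReducer is not part of the original code):

package AllFileInput;

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical explicit reducer, equivalent to the default identity Reducer
// the driver relies on: it forwards every (file name, file content) pair
// emitted by the mapper straight to the job output.
public class WholeSmallfilesReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text value : values) {
            context.write(key, value);
        }
    }
}

If it were used, the driver would additionally call job.setReducerClass(WholeSmallfilesReducer.class).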

In the main function (the driver class, WholeCombinedSmallfiles.java):

package AllFileInput;

import java.io.IOException;
import java.util.Calendar;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import bill323.core.Compare;

public class WholeCombinedSmallfiles {

    @SuppressWarnings("static-access")
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        long start = System.currentTimeMillis();
        Compare cmp = new Compare();
        // input and output paths are hard-coded here for testing
        args = new String[2];
        args[0] = "hdfs://master:9000/newdata/YB2";
        args[1] = "hdfs://master:9000/NoDelete1/tes-" + Calendar.getInstance().getTimeInMillis();
        String aimFilePath = "/newdata/MB/SQ2015AA0100489.txt";
        int mapCount = 20;
        JobConf conf = new JobConf();
        conf.set("aimFilePath", aimFilePath);
        conf.set("mbString", cmp.getStringByTXT(aimFilePath, conf));
        conf.set("mapred.job.tracker", "master:9001");
        // a hint only: the actual number of map tasks equals the number of splits,
        // i.e. one per small file with WholeFileInputFormat
        conf.setNumMapTasks(mapCount);
        conf.set("fs.default.name", "hdfs://master:9000");
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: combinesmallfiles <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "compare map test");
        job.setJarByClass(WholeCombinedSmallfiles.class);
        job.setMapperClass(WholeSmallfilesMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(WholeFileInputFormat.class);
        // job.setOutputFormatClass(SequenceFileOutputFormat.class); // needed only if the map output value were BytesWritable
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        boolean success = job.waitForCompletion(true);
        long end = System.currentTimeMillis();
        System.out.println("elapsed time (ms): " + (end - start));
        System.exit(success ? 0 : 1);
    }
}
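
As the commented-out setOutputFormatClass line indicates, SequenceFileOutputFormat only becomes necessary when the map output value is BytesWritable rather than Text, because the default text output cannot hold raw bytes. A minimal sketch of that alternative configuration, assuming the mapper's signature were changed to Mapper<NullWritable, BytesWritable, Text, BytesWritable> accordingly:

// Alternative job configuration for a mapper that emits (Text, BytesWritable)
// pairs; the raw bytes are then stored in a binary SequenceFile.
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(BytesWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
job.setInputFormatClass(WholeFileInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);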

Using FileInputFormat this way multiplies the number of splits (one split per small file), so it is not recommended. A better approach is to merge the small files, i.e. use CombineFileInputFormat, which the next article will cover.
