Hadoop MapReduce: processing a huge number of small files (each map reads one whole small file), based on FileInputFormat

With FileInputFormat, each split contains exactly one small file, so the number of splits equals the number of files. This produces far too many splits (and map tasks), and reading is slower than with the CombineFileInputFormat approach.

--------------------------------------------------------------------------------------------------------

The code in WholeFileInputFormat.java:

package AllFileInput;

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// An InputFormat whose RecordReader reads an entire file as a single
// (NullWritable, BytesWritable) record.
public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        RecordReader<NullWritable, BytesWritable> recordReader = new WholeFileRecordReader();
        recordReader.initialize(split, context);
        return recordReader;
    }
}
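
Note: FileInputFormat's default isSplitable() returns true, so a file larger than one HDFS block could still be split and only partially read by the record reader below. As a safeguard (not part of the original code), the class above could additionally override isSplitable() to force one split per file, roughly as follows:

// Additional method for WholeFileInputFormat; requires importing
// org.apache.hadoop.fs.Path and org.apache.hadoop.mapreduce.JobContext.
// Never split a file, so each file always maps to exactly one split.
@Override
protected boolean isSplitable(JobContext context, Path file) {
    return false;
}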

----------------------------------------------------------------------------------------------

WholeFileRecordReader.java

For small files, each file corresponds to one InputSplit. Reading that InputSplit means reading the content of the entire file (which occupies a single block); the whole content is loaded into a BytesWritable, i.e. a byte array.

package AllFileInput;

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Reads the whole file backing an InputSplit as a single record:
// key = NullWritable, value = the file content in a BytesWritable.
public class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private FileSplit fileSplit;
    private JobContext jobContext;
    private NullWritable currentKey = NullWritable.get();
    private BytesWritable currentValue;
    // true once the single record has been produced
    private boolean finishConverting = false;

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return currentKey;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return currentValue;
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) split;
        this.jobContext = context;
        // expose the current file name to the mapper through the configuration
        context.getConfiguration().set("map.input.file", fileSplit.getPath().getName());
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!finishConverting) {
            currentValue = new BytesWritable();
            // the file is small, so its whole content fits into one byte array
            int len = (int) fileSplit.getLength();
            byte[] content = new byte[len];
            Path file = fileSplit.getPath();
            FileSystem fs = file.getFileSystem(jobContext.getConfiguration());
            FSDataInputStream in = null;
            try {
                in = fs.open(file);
                IOUtils.readFully(in, content, 0, len);
                currentValue.set(content, 0, len);
            } finally {
                if (in != null) {
                    IOUtils.closeStream(in);
                }
            }
            finishConverting = true;
            return true;
        }
        return false;
    }

    @Override
    public float getProgress() throws IOException {
        // either nothing or everything has been read
        return finishConverting ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // nothing to close: the input stream is closed in nextKeyValue()
    }
}

---------------------------------------------------------------------------------------------------------------

The mapper, in WholeSmallfilesMapper.java:

package AllFileInput;

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import bill323.core.Compare;

public class WholeSmallfilesMapper extends Mapper<NullWritable, BytesWritable, Text, Text> {

    private Text file = new Text();
    private static Compare cmp = new Compare();

    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // file name put into the configuration by WholeFileRecordReader.initialize()
        String fileName = context.getConfiguration().get("map.input.file");
        file.set(fileName);

        // convert the whole file content (BytesWritable) into a String;
        // getLength() is used because getBytes() may return a padded buffer
        String ybString = new String(value.getBytes(), 0, value.getLength());
        context.write(file, new Text(ybString));
    }
}
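
Note that the driver below never calls job.setReducerClass(), so Hadoop's default identity Reducer simply forwards each (file name, content) pair to the output. For illustration only, an explicit reducer doing the same thing would look roughly like this (a minimal sketch; the class name WholeSmallfilesReducer is not part of the original code):

package AllFileInput;

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical explicit reducer, equivalent to the default identity Reducer
// the driver relies on: it forwards every (file name, file content) pair
// emitted by the mapper straight to the job output.
public class WholeSmallfilesReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text value : values) {
            context.write(key, value);
        }
    }
}

If it were used, the driver would additionally call job.setReducerClass(WholeSmallfilesReducer.class).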

In the main function (the driver class, WholeCombinedSmallfiles.java):

package AllFileInput;

import java.io.IOException;
import java.util.Calendar;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import bill323.core.Compare;

public class WholeCombinedSmallfiles {

    @SuppressWarnings("static-access")
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        long start = System.currentTimeMillis();
        Compare cmp = new Compare();
        // input and output paths are hard-coded here for testing
        args = new String[2];
        args[0] = "hdfs://master:9000/newdata/YB2";
        args[1] = "hdfs://master:9000/NoDelete1/tes-" + Calendar.getInstance().getTimeInMillis();
        String aimFilePath = "/newdata/MB/SQ2015AA0100489.txt";
        int mapCount = 20;
        JobConf conf = new JobConf();
        conf.set("aimFilePath", aimFilePath);
        conf.set("mbString", cmp.getStringByTXT(aimFilePath, conf));
        conf.set("mapred.job.tracker", "master:9001");
        // a hint only: the actual number of map tasks equals the number of splits,
        // i.e. one per small file with WholeFileInputFormat
        conf.setNumMapTasks(mapCount);
        conf.set("fs.default.name", "hdfs://master:9000");
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: combinesmallfiles <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "compare map test");
        job.setJarByClass(WholeCombinedSmallfiles.class);
        job.setMapperClass(WholeSmallfilesMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(WholeFileInputFormat.class);
        // job.setOutputFormatClass(SequenceFileOutputFormat.class); // needed only if the map output value were BytesWritable
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        boolean success = job.waitForCompletion(true);
        long end = System.currentTimeMillis();
        System.out.println("elapsed time (ms): " + (end - start));
        System.exit(success ? 0 : 1);
    }
}
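
As the commented-out setOutputFormatClass line indicates, SequenceFileOutputFormat only becomes necessary when the map output value is BytesWritable rather than Text, because the default text output cannot hold raw bytes. A minimal sketch of that alternative configuration, assuming the mapper's signature were changed to Mapper<NullWritable, BytesWritable, Text, BytesWritable> accordingly:

// Alternative job configuration for a mapper that emits (Text, BytesWritable)
// pairs; the raw bytes are then stored in a binary SequenceFile.
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(BytesWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
job.setInputFormatClass(WholeFileInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);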

Using FileInputFormat this way multiplies the number of splits (one split per small file), so it is not recommended. A better approach is to merge the small files, i.e. use CombineFileInputFormat, which the next article will cover.
