MapReduce Input Formats: MultipleInputs

The inputs to a MapReduce job may come in different formats: some data may be tab-separated text files, other data binary SequenceFiles. Even inputs that share a format may represent their records differently, so each needs to be parsed separately.
MultipleInputs handles this cleanly: it lets you specify the InputFormat and the Mapper for each input path.
public static void addInputPath(JobConf conf, Path path, Class<? extends InputFormat> inputFormatClass)
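That signature is from the old mapred API. The demo below uses the new-API class org.apache.hadoop.mapreduce.lib.input.MultipleInputs, whose four-argument overload also binds a Mapper to the path:

public static void addInputPath(Job job, Path path,
        Class<? extends InputFormat> inputFormatClass,
        Class<? extends Mapper> mapperClass)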
Here is a demo with two inputs: one is a SequenceFile, the other a key-value text file. Both hold (year, temperature) records, and both feed the same reducer.
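For reference, a small generator along these lines could produce the two input files the demo expects. The GenData class name and the sample (year, temperature) records are assumptions for illustration; only the file paths and formats come from the App driver below.

package hadoop.mr.input.multiple;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

import java.io.FileWriter;

/**
 * GenData (hypothetical helper): writes temp.seq as a SequenceFile of
 * (IntWritable year, IntWritable temp) pairs and temp.dat as space-separated
 * "year temp" text lines, matching the two inputs App registers.
 */
public class GenData {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");

        // Binary input: SequenceFile of IntWritable pairs (sample values).
        SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(new Path("d:/java/mr/data/temp.seq")),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(IntWritable.class));
        writer.append(new IntWritable(1970), new IntWritable(32));
        writer.append(new IntWritable(1971), new IntWritable(28));
        writer.close();

        // Text input: one "year temp" pair per line, separated by a space
        // to match the separator App configures for KeyValueTextInputFormat.
        FileWriter fw = new FileWriter("d:/java/mr/data/temp.dat");
        fw.write("1970 30\n1971 35\n");
        fw.close();
    }
}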
1. SeqMapper
package hadoop.mr.input.multiple;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * SeqMapper: reads (year, temperature) pairs from the SequenceFile input.
 */
public class SeqMapper extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void map(IntWritable key, IntWritable value, Context context)
            throws IOException, InterruptedException {
        // The SequenceFile already stores IntWritable keys and values,
        // so each pair is emitted unchanged.
        context.write(key, value);
    }
}
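SeqMapper is an identity mapper: the SequenceFile already stores IntWritable year/temperature pairs, so nothing needs parsing. The text input, handled next, does need parsing, which is exactly why MultipleInputs lets each path carry its own Mapper.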


2. KeyValueTextMapper
package hadoop.mr.input.multiple;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * KeyValueTextMapper: parses the text input, where KeyValueTextInputFormat
 * has already split each line into a key (year) and a value (temperature).
 */
public class KeyValueTextMapper extends Mapper<Text, Text, IntWritable, IntWritable> {
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        // Convert both text fields to IntWritable so this mapper's output
        // matches SeqMapper's and both can feed the same reducer.
        IntWritable year = new IntWritable(Integer.parseInt(key.toString()));
        IntWritable temp = new IntWritable(Integer.parseInt(value.toString()));
        context.write(year, temp);
    }
}


3. MaxTempReducer
package hadoop.mr.input.multiple;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * MaxTempReducer: emits the maximum temperature seen for each year,
 * regardless of which input (and which mapper) the records came from.
 */
public class MaxTempReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        for (IntWritable iw : values) {
            max = Math.max(max, iw.get());
        }
        context.write(key, new IntWritable(max));
    }
}


4. App
package hadoop.mr.input.multiple;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.KeyValueLineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * App: wires both inputs to the same reducer via MultipleInputs.
 */
public class App {
    public static void main(String[] args) throws Exception {
        args = new String[]{"d:/java/mr/data", "d:/java/mr/out"};

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");           // run against the local file system
        conf.set("mapreduce.framework.name", "local");  // run in local (non-YARN) mode

        // Delete the output directory if it already exists.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }

        Job job = Job.getInstance(conf);
        job.setJobName("MaxTemp");
        job.setJarByClass(App.class);
        job.setReducerClass(MaxTempReducer.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Input 1: SequenceFile of (year, temp) pairs, handled by SeqMapper.
        MultipleInputs.addInputPath(job, new Path("d:/java/mr/data/temp.seq"),
                SequenceFileInputFormat.class, SeqMapper.class);

        // Input 2: text file split on a space separator, handled by
        // KeyValueTextMapper. (The SEPERATOR misspelling is Hadoop's own
        // constant name.)
        job.getConfiguration().set(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR, " ");
        MultipleInputs.addInputPath(job, new Path("d:/java/mr/data/temp.dat"),
                KeyValueTextInputFormat.class, KeyValueTextMapper.class);

        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(2);
        job.waitForCompletion(true);
    }
}
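Note that the driver never calls setMapperClass: MultipleInputs stores each path's InputFormat/Mapper binding in the job configuration and dispatches each split to the right mapper through its internal DelegatingInputFormat and DelegatingMapper. Because setNumReduceTasks(2) is used, the output directory will contain two part files (part-r-00000 and part-r-00001), each holding a disjoint subset of years with their maximum temperatures.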