Controlling reduce output in a MapReduce program


1. In Hadoop, the reduce stage can write to multiple output files, and the output file names are controllable: extend the MultipleTextOutputFormat class and override its generateFileNameForKeyValue method.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
import org.apache.hadoop.util.Tool;

import com.hadoop.compression.lzo.LzopCodec;   // from the hadoop-lzo library

public class LzoHandleLogMr extends Configured implements Tool {

    static class LzoHandleLogMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {

        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            try {
                // Use the first comma-separated field as the map output key.
                String[] sp = value.toString().split(",");
                output.collect(new Text(sp[0]), value);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    static class LzoHandleLogReducer extends MapReduceBase
            implements Reducer<Text, Text, Text, NullWritable> {

        @Override
        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, NullWritable> output, Reporter reporter)
                throws IOException {
            // Pass each record through unchanged; the output format decides the file name.
            while (values.hasNext()) {
                output.collect(values.next(), NullWritable.get());
            }
        }
    }

    public static class LogNameMultipleTextOutputFormat
            extends MultipleTextOutputFormat<Text, NullWritable> {

        @Override
        protected String generateFileNameForKeyValue(Text key,
                NullWritable value, String name) {
            // Derive the output file name from the first field of the key.
            String[] sp = key.toString().split(",");
            String filename = sp[0];
            if (sp[0].contains(".")) {
                filename = "000000000000";
            }
            return filename;
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf jobconf = new JobConf(LzoHandleLogMr.class);
        jobconf.setMapperClass(LzoHandleLogMapper.class);
        jobconf.setReducerClass(LzoHandleLogReducer.class);
        jobconf.setOutputFormat(LogNameMultipleTextOutputFormat.class);
        jobconf.setOutputKeyClass(Text.class);
        jobconf.setNumReduceTasks(12);

        FileInputFormat.setInputPaths(jobconf, new Path(args[0]));
        FileOutputFormat.setOutputPath(jobconf, new Path(args[1]));
        FileOutputFormat.setCompressOutput(jobconf, true);
        FileOutputFormat.setOutputCompressorClass(jobconf, LzopCodec.class);

        JobClient.runJob(jobconf);
        return 0;
    }
}


In the newer Hadoop API, job parameters are set through the Job class. However, when I called Job.setOutputFormatClass() to use MultipleTextOutputFormat, it threw an error, because the class passed in must extend org.apache.hadoop.mapreduce.OutputFormat, while MultipleTextOutputFormat belongs to the old mapred API. This is one of the more painful limitations of 0.20.2; upgrading to 0.21 resolves it.
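For reference, here is a minimal sketch (not from the original post) of how the same per-key file naming is commonly achieved with the new org.apache.hadoop.mapreduce API, assuming Hadoop 0.21 or later where MultipleOutputs supports write(key, value, baseOutputPath). The class name LogNameReducer is illustrative; the driver still needs a FileOutputFormat such as TextOutputFormat set as the job's output format, since MultipleOutputs borrows it to create one file per distinct base path.

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class LogNameReducer extends Reducer<Text, Text, Text, NullWritable> {

    private MultipleOutputs<Text, NullWritable> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<Text, NullWritable>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Use the key as the base output path, so each key's records land in their
        // own file (the framework appends a "-r-00000"-style suffix to the base name).
        for (Text value : values) {
            mos.write(value, NullWritable.get(), key.toString());
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }
}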


2. If the same input record needs to be written to several files at once, we can use the MultipleOutputs class:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleOutputs;
import org.apache.hadoop.util.Tool;

public class MultiFile extends Configured implements Tool {

    public static class MapClass extends MapReduceBase
            implements Mapper<LongWritable, Text, NullWritable, Text> {

        private MultipleOutputs mos;
        private OutputCollector<NullWritable, Text> collector;

        public void configure(JobConf conf) {
            mos = new MultipleOutputs(conf);
        }

        public void map(LongWritable key, Text value,
                OutputCollector<NullWritable, Text> output,
                Reporter reporter) throws IOException {
            String[] arr = value.toString().split(",", -1);
            String chrono = arr[0] + "," + arr[1] + "," + arr[2];
            String geo = arr[0] + "," + arr[4] + "," + arr[5];
            // Write projections of the same input record to two named outputs.
            collector = mos.getCollector("chrono", reporter);
            collector.collect(NullWritable.get(), new Text(chrono));
            collector = mos.getCollector("geo", reporter);
            collector.collect(NullWritable.get(), new Text(geo));
        }

        public void close() throws IOException {
            mos.close();
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        JobConf job = new JobConf(conf, MultiFile.class);
        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);
        job.setJobName("MultiFile");
        job.setMapperClass(MapClass.class);
        job.setInputFormat(TextInputFormat.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(0);
        // Register the two named outputs used in the mapper.
        MultipleOutputs.addNamedOutput(job, "chrono",
                TextOutputFormat.class, NullWritable.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "geo",
                TextOutputFormat.class, NullWritable.class, Text.class);
        JobClient.runJob(job);
        return 0;
    }
}

MultipleOutputs maintains a map of <name, OutputCollector>. We register the named outputs (collectors) in the job configuration, then in the map or reduce method we fetch the corresponding collector with getCollector() and call collect() on it.
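The example above does this in the mapper; a minimal sketch (assumed, not from the original post) of the same pattern on the reduce side with the old mapred API could look like the inner class below. It needs java.util.Iterator and org.apache.hadoop.mapred.Reducer in addition to the imports of the MultiFile example, and it only illustrates the reduce-side usage described above; it is not a drop-in change to the MultiFile job, whose mapper emits <NullWritable, Text> and runs with zero reducers.

    public static class MultiFileReducer extends MapReduceBase
            implements Reducer<Text, Text, NullWritable, Text> {

        private MultipleOutputs mos;

        public void configure(JobConf conf) {
            mos = new MultipleOutputs(conf);
        }

        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<NullWritable, Text> output, Reporter reporter)
                throws IOException {
            // Fetch the collector registered under "chrono" in the driver and
            // write every value of this key to that named output.
            OutputCollector<NullWritable, Text> chrono =
                    mos.getCollector("chrono", reporter);
            while (values.hasNext()) {
                chrono.collect(NullWritable.get(), values.next());
            }
        }

        public void close() throws IOException {
            mos.close();
        }
    }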

