Understanding OutputFormat in MapReduce


1 What OutputFormat Does

1 Validates the job's output specification; for file-based formats this means checking that the output path configured for the job does not already exist.

2 Provides the RecordWriter that writes the job's results to the output files.
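
These two responsibilities correspond directly to methods on the abstract org.apache.hadoop.mapreduce.OutputFormat class. Below is a simplified sketch of the new-API class for orientation (the real class also carries Javadoc and a third concern, output committing):

public abstract class OutputFormat<K, V> {

    // Responsibility 1: validate the output specification; FileOutputFormat,
    // for example, throws FileAlreadyExistsException if the output path exists.
    public abstract void checkOutputSpecs(JobContext context)
            throws IOException, InterruptedException;

    // Responsibility 2: hand back the RecordWriter that writes each
    // (key, value) pair to the output.
    public abstract RecordWriter<K, V> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException;

    // Commits or cleans up task and job output (temporary files, directories).
    public abstract OutputCommitter getOutputCommitter(TaskAttemptContext context)
            throws IOException, InterruptedException;
}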

 

2 OutputFormat Implementations

2.1 DBOutputFormat: writes the reduce output to a SQL table over JDBC.

2.2 FileOutputFormat: the base class for formats that write the reduce output to files.

2.2.1 MapFileOutputFormat: writes MapFiles (a sorted, indexed variant of SequenceFile).

2.2.2 SequenceFileOutputFormat: writes binary SequenceFiles.

2.2.3 TextOutputFormat: writes plain text, one key-value pair per line; this is the default implementation.

2.3 FilterOutputFormat: a convenience wrapper for delegating to another OutputFormat (I have not used it).

2.4 NullOutputFormat: discards all output, the /dev/null of output formats (I have not used it).
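
Switching between these implementations is a one-line driver setting. The fragment below is a minimal sketch assuming the new (org.apache.hadoop.mapreduce.lib.output) API; the helper class name OutputFormatSetup and the output path are illustrative only:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatSetup {
    static void configureOutput(Job job) {
        // TextOutputFormat is the default, so this call is usually implicit
        job.setOutputFormatClass(TextOutputFormat.class);

        // To write block-compressed binary SequenceFiles instead:
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        // All FileOutputFormat subclasses share the same output-path setting
        FileOutputFormat.setOutputPath(job, new Path("/user/hadoop/output")); // illustrative path
    }
}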


3 MultipleOutputs

In some scenarios we need to write MapReduce results out to multiple files; the MultipleOutputs class is made for this.

MultipleOutputs is used in four steps:

3.1 Instantiate MultipleOutputs in the Mapper's setup method.

3.2 In the map method, call write on the MultipleOutputs object, passing in the base file name you want for the record.

3.3 When processing finishes, close the MultipleOutputs instance in the cleanup method.

3.4 The generated files are named <your base file name>-m-00000 (or -r-00000 on the reduce side), one sequence per task.

The complete map-only example below walks through these steps:

import java.io.IOException;
import java.util.Random;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;

public class OutputMultipleFile extends Configured implements Tool {

    public static class OutputMultipleMapper extends Mapper<LongWritable, Text, Text, Text> {

        private Text key1 = new Text();
        private Text value1 = new Text();
        private MultipleOutputs<Text, Text> mos;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            // Step 3.1: instantiate MultipleOutputs in setup
            mos = new MultipleOutputs<Text, Text>(context);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (value == null) {
                return;
            }
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                String token = tokenizer.nextToken();
                key1.set(token);
                value1.set("=>" + key1);
                // Step 3.2: write through MultipleOutputs, passing the base file name
                mos.write(key1, value1, generateFileName(key1));
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
            // Step 3.3: close MultipleOutputs so all RecordWriters are flushed
            mos.close();
        }

        // Route tokens shorter than five characters to "primary",
        // everything else to "extended"
        private String generateFileName(Text key) {
            if (key == null) {
                return "default";
            }
            int len = key.toString().length();
            if (len < 5) {
                return "primary";
            }
            return "extended";
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        // Validate the argument count
        if (otherArgs.length < 2) {
            System.err.println("Usage: OutputMultipleFile <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        job.setJarByClass(OutputMultipleFile.class);

        // All arguments but the last are input paths; the last is the output path
        for (int i = 0; i < otherArgs.length - 1; i++) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));

        // The mapper emits Text/Text pairs, so both map output classes are Text
        job.setMapperClass(OutputMultipleMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Map-only job: no reduce phase
        job.setNumReduceTasks(0);

        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Randomize the default output directory so reruns do not collide
        int num = new Random().nextInt(1000);
        if (args == null || args.length == 0) {
            args = new String[] {
                    "hdfs://hdfs-cluster/user/hadoop/input",
                    "hdfs://hdfs-cluster/user/hadoop/output" + num
            };
        }

        int status = new OutputMultipleFile().run(args);
        System.exit(status);
    }
}
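
Run against a text input, this produces files such as primary-m-00000 (tokens shorter than five characters) and extended-m-00000 under the output directory, one sequence per map task. One caveat: because nothing goes through context.write, the default TextOutputFormat still creates empty part-m-00000 files next to them. If you want only the MultipleOutputs files, Hadoop's standard LazyOutputFormat wrapper defers file creation until the first actual write; a one-line addition to run() (sketch):

import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Register the real output format lazily so that empty
// part-m-00000 files are not created for this map-only job.
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);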

 
