Understanding OutputFormat in MapReduce


1 What OutputFormat Does

1 Validates the job's output specification; for file-based formats this means checking that the output path configured for the job does not already exist.

2 Provides the RecordWriter that writes the job's results to the output files.
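
These two responsibilities correspond directly to methods on the abstract org.apache.hadoop.mapreduce.OutputFormat class. Below is a simplified sketch of the new-API class for orientation (the real class also carries Javadoc and a third concern, output committing):

public abstract class OutputFormat<K, V> {

    // Responsibility 1: validate the output specification; FileOutputFormat,
    // for example, throws FileAlreadyExistsException if the output path exists.
    public abstract void checkOutputSpecs(JobContext context)
            throws IOException, InterruptedException;

    // Responsibility 2: hand back the RecordWriter that writes each
    // (key, value) pair to the output.
    public abstract RecordWriter<K, V> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException;

    // Commits or cleans up task and job output (temporary files, directories).
    public abstract OutputCommitter getOutputCommitter(TaskAttemptContext context)
            throws IOException, InterruptedException;
}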

 

2 OutputFormat Implementations

2.1 DBOutputFormat: writes the reduce output to a SQL table over JDBC.

2.2 FileOutputFormat: the base class for formats that write the reduce output to files.

2.2.1 MapFileOutputFormat: writes MapFiles (a sorted, indexed variant of SequenceFile).

2.2.2 SequenceFileOutputFormat: writes binary SequenceFiles.

2.2.3 TextOutputFormat: writes plain text, one key-value pair per line; this is the default implementation.

2.3 FilterOutputFormat: a convenience wrapper for delegating to another OutputFormat (I have not used it).

2.4 NullOutputFormat: discards all output, the /dev/null of output formats (I have not used it).
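
Switching between these implementations is a one-line driver setting. The fragment below is a minimal sketch assuming the new (org.apache.hadoop.mapreduce.lib.output) API; the helper class name OutputFormatSetup and the output path are illustrative only:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatSetup {
    static void configureOutput(Job job) {
        // TextOutputFormat is the default, so this call is usually implicit
        job.setOutputFormatClass(TextOutputFormat.class);

        // To write block-compressed binary SequenceFiles instead:
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        // All FileOutputFormat subclasses share the same output-path setting
        FileOutputFormat.setOutputPath(job, new Path("/user/hadoop/output")); // illustrative path
    }
}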


3 MultipleOutputs

In some scenarios we need to write MapReduce results out to multiple files; the MultipleOutputs class is made for this.

MultipleOutputs is used in four steps:

3.1 Instantiate MultipleOutputs in the Mapper's setup method.

3.2 In the map method, call write on the MultipleOutputs object, passing in the base file name you want for the record.

3.3 When processing finishes, close the MultipleOutputs instance in the cleanup method.

3.4 The generated files are named <your base file name>-m-00000 (or -r-00000 on the reduce side), one sequence per task.

The complete map-only example below walks through these steps:

import java.io.IOException;
import java.util.Random;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;

public class OutputMultipleFile extends Configured implements Tool {

    public static class OutputMultipleMapper extends Mapper<LongWritable, Text, Text, Text> {

        private Text key1 = new Text();
        private Text value1 = new Text();
        private MultipleOutputs<Text, Text> mos;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            // Step 3.1: instantiate MultipleOutputs in setup
            mos = new MultipleOutputs<Text, Text>(context);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (value == null) {
                return;
            }
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                String token = tokenizer.nextToken();
                key1.set(token);
                value1.set("=>" + key1);
                // Step 3.2: write through MultipleOutputs, passing the base file name
                mos.write(key1, value1, generateFileName(key1));
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
            // Step 3.3: close MultipleOutputs so all RecordWriters are flushed
            mos.close();
        }

        // Route tokens shorter than five characters to "primary",
        // everything else to "extended"
        private String generateFileName(Text key) {
            if (key == null) {
                return "default";
            }
            int len = key.toString().length();
            if (len < 5) {
                return "primary";
            }
            return "extended";
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        // Validate the argument count
        if (otherArgs.length < 2) {
            System.err.println("Usage: OutputMultipleFile <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        job.setJarByClass(OutputMultipleFile.class);

        // All arguments but the last are input paths; the last is the output path
        for (int i = 0; i < otherArgs.length - 1; i++) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));

        // The mapper emits Text/Text pairs, so both map output classes are Text
        job.setMapperClass(OutputMultipleMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Map-only job: no reduce phase
        job.setNumReduceTasks(0);

        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Randomize the default output directory so reruns do not collide
        int num = new Random().nextInt(1000);
        if (args == null || args.length == 0) {
            args = new String[] {
                    "hdfs://hdfs-cluster/user/hadoop/input",
                    "hdfs://hdfs-cluster/user/hadoop/output" + num
            };
        }

        int status = new OutputMultipleFile().run(args);
        System.exit(status);
    }
}
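
Run against a text input, this produces files such as primary-m-00000 (tokens shorter than five characters) and extended-m-00000 under the output directory, one sequence per map task. One caveat: because nothing goes through context.write, the default TextOutputFormat still creates empty part-m-00000 files next to them. If you want only the MultipleOutputs files, Hadoop's standard LazyOutputFormat wrapper defers file creation until the first actual write; a one-line addition to run() (sketch):

import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Register the real output format lazily so that empty
// part-m-00000 files are not created for this map-only job.
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);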

 
