Hadoop Multiple Outputs

The old API provides two classes: org.apache.hadoop.mapred.lib.MultipleOutputFormat and org.apache.hadoop.mapred.lib.MultipleOutputs.

The new API consolidates the functionality of both into a single class, org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.

What MultipleOutputs does

1. Write to multiple files or directories:

MultipleOutputs.write(KEYOUT key, VALUEOUT value, String baseOutputPath)

2. Write in multiple formats (see the sketch after this list):

MultipleOutputs.write(String namedOutput, K key, V value, String baseOutputPath)

In this case each named output must also be registered in the driver:

MultipleOutputs.addNamedOutput(Job job, String namedOutput, Class<? extends OutputFormat> outputFormatClass, Class<?> keyClass, Class<?> valueClass)
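For example, a minimal sketch of the named-output pattern (the names "txt" and "seq" are invented for illustration; mos stands for the MultipleOutputs instance created in the reducer's setup(), as in the full example below, and SequenceFileOutputFormat comes from org.apache.hadoop.mapreduce.lib.output):

// In the driver, register one named output per format:
MultipleOutputs.addNamedOutput(job, "txt", TextOutputFormat.class, Text.class, Text.class);
MultipleOutputs.addNamedOutput(job, "seq", SequenceFileOutputFormat.class, Text.class, Text.class);

// In the reducer, route each record to the named output it belongs to:
mos.write("txt", key, value);                // text files named txt-r-00000, ...
mos.write("seq", key, value, "seqdir/part"); // sequence files under seqdir/ in the job output directory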

Example

package example;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat; // old API; only used by the comparison class below
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Multiple-output example: reads pipe-delimited records from several
 * input paths and writes each key's records to its own file via MultipleOutputs.
 * @author lijl
 */


public class MultiOutputFileMR {

    static class MultiOutputFileMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Records are expected as key|value; skip lines that do not match.
            String[] str = value.toString().split("\\|");
            if (str.length < 2) {
                return;
            }
            context.write(new Text(str[0]), new Text(str[1]));
        }
    }
    static class MultiOutputFileReducer extends Reducer<Text, Text, Text, Text> {

        private MultipleOutputs<Text, Text> collector = null;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            collector = new MultipleOutputs<Text, Text>(context);
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            collector.close(); // required, or the extra output files may be left incomplete
        }

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text value : values) {
                // Variants tried during development:
                // collector.write(key, value, "/sina_yq/path7/a");
                // Named outputs (need the matching addNamedOutput calls in main):
                // collector.write("a1", key, value);
                // collector.write("a2", key, value);
                // collector.write("a3", key, value);
                // collector.write("a4", key, value, "/sina_yq/path7/");
                // One file per key: records land in /sina_yq/path7/<key>-r-00000
                collector.write(key, value, "/sina_yq/path7/" + key.toString());
            }
        }
    }
    // Old-API alternative for comparison: with org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
    // the per-key file name comes from overriding generateFileNameForKeyValue.
    // This class is not wired into the new-API job below.
    static class MultiOutPutTestFormat extends MultipleTextOutputFormat<Text, Text> {
        @Override
        protected String generateFileNameForKeyValue(Text key, Text value, String name) {
            return key.toString();
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "MultiPathFileInput");
        job.setJarByClass(MultiOutputFileMR.class);

        // Equivalent to separate addInputPath calls:
        // FileInputFormat.addInputPath(job, new Path("hdfs://RS5-112:9000/cs/path1"));
        // FileInputFormat.addInputPath(job, new Path("hdfs://RS5-112:9000/cs/path2"));
        FileInputFormat.addInputPaths(job, "hdfs://RS5-112:9000/sina_yq/path1,hdfs://RS5-112:9000/cs/path2");
        FileOutputFormat.setOutputPath(job, new Path("hdfs://RS5-112:9000/cs/path7"));

        // Register named outputs here if the reducer uses the named-output writes:
        // MultipleOutputs.addNamedOutput(job, "a1", TextOutputFormat.class, Text.class, Text.class);
        // MultipleOutputs.addNamedOutput(job, "a2", TextOutputFormat.class, Text.class, Text.class);
        // MultipleOutputs.addNamedOutput(job, "a3", TextOutputFormat.class, Text.class, Text.class);
        // MultipleOutputs.addNamedOutput(job, "a4", TextOutputFormat.class, Text.class, Text.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapperClass(MultiOutputFileMapper.class);
        job.setReducerClass(MultiOutputFileReducer.class);
        job.setNumReduceTasks(1);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
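A note that goes beyond the original post: files written through write(key, value, baseOutputPath) are named baseOutputPath-r-nnnnn, so the reducer above produces one /sina_yq/path7/<key>-r-00000 per key, while the job's default TextOutputFormat still creates an empty part-r-00000 under /cs/path7 because no record is written to it directly. If those empty part files are unwanted, org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat creates the default output file only when something is actually written; a one-line sketch to swap into main():

// Instead of job.setOutputFormatClass(TextOutputFormat.class):
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);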
