MapReduce 自定义 OutputFormat

来源:互联网 发布:华为荣耀4x手机套淘宝 编辑:程序博客网 时间:2024/05/29 13:58
  1. 写一个类继承 FileOutputFormat,泛型参数为最终输出的 key 和 value 的数据类型

    public class MyFileOutputFormat extends FileOutputFormat<Text, NullWritable> {
  2. 重写getRecordWriter(TaskAttemptContext context)方法

    MapTask 或 ReduceTask 在最终输出时,先调用 OutputFormat 的 getRecordWriter 方法拿到一个 RecordWriter,然后再调用 RecordWriter 的 write(k, v) 方法将数据写出。
    @Override    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {

public class LogEnhanceOutputFormat extends FileOutputFormat<Text, NullWritable> {    @Override    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {        FileSystem fs = FileSystem.get(context.getConfiguration());        Path enhancePath = new Path("D:/temp/en/log.dat");        Path tocrawlPath = new Path("D:/temp/crw/url.dat");        FSDataOutputStream enhancedOs = fs.create(enhancePath);        FSDataOutputStream tocrawlOs = fs.create(tocrawlPath);        return new EnhanceRecordWriter(enhancedOs, tocrawlOs);    }    /**     * 构造一个自己的recordwriter     *      * @author     *      */    static class EnhanceRecordWriter extends RecordWriter<Text, NullWritable> {        FSDataOutputStream enhancedOs = null;        FSDataOutputStream tocrawlOs = null;        public EnhanceRecordWriter(FSDataOutputStream enhancedOs, FSDataOutputStream tocrawlOs) {            super();            this.enhancedOs = enhancedOs;            this.tocrawlOs = tocrawlOs;        }        @Override        public void write(Text key, NullWritable value) throws IOException, InterruptedException {            String result = key.toString();            // 如果要写出的数据是待爬的url,则写入待爬清单文件 /logenhance/tocrawl/url.dat            if (result.contains("tocrawl")) {                tocrawlOs.write(result.getBytes());            } else {                // 如果要写出的数据是增强日志,则写入增强日志文件 /logenhance/enhancedlog/log.dat                enhancedOs.write(result.getBytes());            }        }        @Override        public void close(TaskAttemptContext context) throws IOException, InterruptedException {            if (tocrawlOs != null) {                tocrawlOs.close();            }            if (enhancedOs != null) {                enhancedOs.close();            }        }    }}
// 要控制不同的内容写往不同的目标路径,可以采用自定义outputformat的方法        job.setOutputFormatClass(LogEnhanceOutputFormat.class);        FileInputFormat.setInputPaths(job, new Path("D:/srcdata/webloginput/"));        // 尽管我们用的是自定义outputformat,但是它是继承制fileoutputformat        // 在fileoutputformat中,必须输出一个_success文件,所以在此还需要设置输出path        FileOutputFormat.setOutputPath(job, new Path("D:/temp/output/"));        // 不需要reducer        job.setNumReduceTasks(0);
原创粉丝点击