Hadoop Example 15, MultipleInputs in Practice 2: Processing Input Files with Multiple Custom File Formats


MultipleInputs lets a MapReduce job accept several input formats at the same time.

For example, if we have two file formats, we need two sets of classes: a record class, a RecordReader, and an InputFormat for each format.

MultipleInputs is configured with a different InputFormat per input path; each InputFormat creates a RecordReader, which reads the file and emits values of that format's record class. That is the relationship between the three types, and it mirrors the steps the map phase goes through before your Mapper receives a key/value pair.
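The driver at the end of this article wires this up with MultipleInputs.addInputPath. As a minimal sketch (the /input/a and /input/b paths are placeholders; the class names are the ones defined below), each input path is registered with the InputFormat and Mapper that understand its format:

// Each path gets its own InputFormat/Mapper pair; both mappers emit
// Text/Text pairs, so a single reducer can join the two files by key.
MultipleInputs.addInputPath(job, new Path("/input/a"), FirstInputFormat.class, MultiMapper1.class);
MultipleInputs.addInputPath(job, new Path("/input/b"), SecondInputFormat.class, MultiMapper2.class);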

1. Data preparation

File a (tab-separated: id, score)

1	80
2	90
3	100
4	50
5	73

File b (tab-separated: id, name, class number)

1	lili	3
2	xiaoming	3
3	feifei	3
4	zhangsan	3
5	lisi	3

2. Goal: implement custom InputFormats so that the job joins the two files by key and produces key/value output in the following format

1	FirstClass:80; SecondClass:lili,ClassNum:3;
2	SecondClass:xiaoming,ClassNum:3; FirstClass:90;
3	FirstClass:100; SecondClass:feifei,ClassNum:3;
4	SecondClass:zhangsan,ClassNum:3; FirstClass:50;
5	FirstClass:73; SecondClass:lisi,ClassNum:3;

Note that FirstClass and SecondClass may appear in either order for a given key: MapReduce groups values by key but does not guarantee the order of values within a group.

3. Implementation:

package cn.edu.bjut.multitwo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Record class for file a: one score value per line.
public class FirstClass implements Writable {

    private String value;

    public FirstClass() {}

    public FirstClass(String value) {
        this.value = value;
    }

    // write and readFields must use the same field order.
    public void write(DataOutput out) throws IOException {
        out.writeUTF(value);
    }

    public void readFields(DataInput in) throws IOException {
        this.value = in.readUTF();
    }

    @Override
    public String toString() {
        return "FirstClass:" + value;
    }

    public String getValue() {
        return value;
    }

    public void setValue(String value) {
        this.value = value;
    }
}
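Because write and readFields must handle the fields in exactly the same order, it can help to round-trip an instance through plain Java streams before running the job. The following is a minimal, hypothetical check, not part of the original code; the class name and the sample value "80" are made up for illustration:

package cn.edu.bjut.multitwo;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

// Hypothetical helper: round-trips a FirstClass through its own
// write/readFields to confirm the serialization is symmetric.
public class FirstClassRoundTrip {
    public static void main(String[] args) throws Exception {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        new FirstClass("80").write(new DataOutputStream(bytes));

        FirstClass copy = new FirstClass();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy); // expected output: FirstClass:80
    }
}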
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

// RecordReader for file a: delegates line reading to LineRecordReader,
// then parses each tab-separated line into (id, FirstClass).
public class FirstClassReader extends RecordReader<Text, FirstClass> {

    private LineRecordReader lineRecordReader = null;
    private Text key = null;
    private FirstClass firstClass = null;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        close();
        lineRecordReader = new LineRecordReader();
        lineRecordReader.initialize(split, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!lineRecordReader.nextKeyValue()) {
            key = null;
            firstClass = null;
            return false;
        }
        String line = lineRecordReader.getCurrentValue().toString().trim();
        String[] arr = line.split("\t");
        if (2 == arr.length) {
            key = new Text(arr[0]);
            firstClass = new FirstClass(arr[1]);
        }
        return true;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public FirstClass getCurrentValue() throws IOException, InterruptedException {
        return firstClass;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return lineRecordReader.getProgress();
    }

    @Override
    public void close() throws IOException {
        if (null != lineRecordReader) {
            lineRecordReader.close();
            lineRecordReader = null;
        }
        key = null;
        firstClass = null;
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class FirstInputFormat extends FileInputFormat<Text, FirstClass> {

    @Override
    public RecordReader<Text, FirstClass> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new FirstClassReader();
    }
}
package cn.edu.bjut.multitwo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Record class for file b: a name and a class number per line.
public class SecondClass implements Writable {

    private String value;
    private int classNum;

    public SecondClass() {}

    public SecondClass(String value, int classNum) {
        super();
        this.value = value;
        this.classNum = classNum;
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(value);
        out.writeInt(classNum);
    }

    public void readFields(DataInput in) throws IOException {
        this.value = in.readUTF();
        this.classNum = in.readInt();
    }

    @Override
    public String toString() {
        return "SecondClass:" + value + ",ClassNum:" + classNum;
    }

    public String getValue() {
        return value;
    }

    public void setValue(String value) {
        this.value = value;
    }

    public int getClassNum() {
        return classNum;
    }

    public void setClassNum(int classNum) {
        this.classNum = classNum;
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

// RecordReader for file b: parses each tab-separated line into (id, SecondClass).
public class SecondClassReader extends RecordReader<Text, SecondClass> {

    private LineRecordReader lineRecordReader = null;
    private Text key = null;
    private SecondClass secondClass = null;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        close();
        lineRecordReader = new LineRecordReader();
        lineRecordReader.initialize(split, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!lineRecordReader.nextKeyValue()) {
            key = null;
            secondClass = null;
            return false;
        }
        String line = lineRecordReader.getCurrentValue().toString().trim();
        String[] arr = line.split("\t");
        if (3 == arr.length) {
            key = new Text(arr[0]);
            secondClass = new SecondClass(arr[1], Integer.parseInt(arr[2]));
        }
        return true;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public SecondClass getCurrentValue() throws IOException, InterruptedException {
        return secondClass;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return lineRecordReader.getProgress();
    }

    @Override
    public void close() throws IOException {
        if (null != lineRecordReader) {
            lineRecordReader.close();
            lineRecordReader = null;
        }
        key = null;
        secondClass = null;
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SecondInputFormat extends FileInputFormat<Text, SecondClass> {

    @Override
    public RecordReader<Text, SecondClass> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new SecondClassReader();
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MultiMapper1 extends Mapper<Text, FirstClass, Text, Text> {

    @Override
    protected void map(Text key, FirstClass value, Context context)
            throws IOException, InterruptedException {
        context.write(key, new Text(value.toString()));
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MultiMapper2 extends Mapper<Text, SecondClass, Text, Text> {

    @Override
    protected void map(Text key, SecondClass value, Context context)
            throws IOException, InterruptedException {
        context.write(key, new Text(value.toString()));
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Joins the values from both inputs for the same key into one output line.
public class MultiReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        StringBuffer stringBuffer = new StringBuffer();
        for (Text text : values) {
            stringBuffer.append(text.toString()).append("; ");
        }
        context.write(key, new Text(stringBuffer.toString()));
    }
}
package cn.edu.bjut.multitwo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MainJob {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "multi");
        job.setJarByClass(MainJob.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MultiReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Bind each input path to its own InputFormat and Mapper.
        MultipleInputs.addInputPath(job, new Path(args[0]), FirstInputFormat.class, MultiMapper1.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), SecondInputFormat.class, MultiMapper2.class);

        // Delete the output directory if it already exists.
        Path outPath = new Path(args[2]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        job.waitForCompletion(true);
    }
}
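The driver expects three arguments: the path of file a, the path of file b, and the output directory (which it deletes first if it already exists). Note that the Job constructor used above is deprecated on Hadoop 2.x and later; if you are on such a version, a commonly preferred alternative is the factory method shown below (a small sketch, otherwise identical to MainJob):

// Job.getInstance is the supported factory method on newer Hadoop releases;
// the deprecated constructor above still works but may log warnings.
Job job = Job.getInstance(conf, "multi");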