Hadoop Example 15 - MultipleInputs in Practice 2: Processing Input Files in Multiple Custom Formats
MultipleInputs lets a MapReduce job consume more than one input format. If we have two file formats, for example, we need two sets of classes: a record class (a Writable), a RecordReader, and an InputFormat for each format.
MultipleInputs assigns a different InputFormat to each input path; each InputFormat creates a RecordReader, which reads the file and returns values of the corresponding record type. That is the relationship among the three classes, and it matches the steps of the map-side input pipeline.
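The wiring happens in the driver through MultipleInputs.addInputPath, which binds one input path to one InputFormat and one Mapper. These are the two relevant lines from the full driver in step 3, shown here only as a preview of the API:

    MultipleInputs.addInputPath(job, new Path(args[0]), FirstInputFormat.class, MultiMapper1.class);
    MultipleInputs.addInputPath(job, new Path(args[1]), SecondInputFormat.class, MultiMapper2.class);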
1. Data preparation

File a (one record per line, fields separated by a tab):

1	80
2	90
3	100
4	50
5	73

File b (one record per line, fields separated by a tab):

1	lili	3
2	xiaoming	3
3	feifei	3
4	zhangsan	3
5	lisi	3

2. Requirement: implement custom InputFormats that parse these files, so the job emits key/value output in the format below. For example, the line "1	80" in file a becomes key 1 with value FirstClass:80, and the line "1	lili	3" in file b becomes key 1 with value SecondClass:lili,ClassNum:3.

1	FirstClass:80; SecondClass:lili,ClassNum:3; 
2	SecondClass:xiaoming,ClassNum:3; FirstClass:90; 
3	FirstClass:100; SecondClass:feifei,ClassNum:3; 
4	SecondClass:zhangsan,ClassNum:3; FirstClass:50; 
5	FirstClass:73; SecondClass:lisi,ClassNum:3; 
3. Implementation:
package cn.edu.bjut.multitwo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class FirstClass implements Writable {

    private String value;

    public FirstClass() {}

    public FirstClass(String value) {
        this.value = value;
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(value);
    }

    public void readFields(DataInput in) throws IOException {
        this.value = in.readUTF();
    }

    @Override
    public String toString() {
        return "FirstClass:" + value;
    }

    public String getValue() {
        return value;
    }

    public void setValue(String value) {
        this.value = value;
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class FirstClassReader extends RecordReader<Text, FirstClass> {

    // Delegate raw line reading to Hadoop's LineRecordReader and
    // parse each line into a (Text, FirstClass) pair.
    private LineRecordReader lineRecordReader = null;
    private Text key = null;
    private FirstClass firstClass = null;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        close();
        lineRecordReader = new LineRecordReader();
        lineRecordReader.initialize(split, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // Advance line by line, skipping malformed lines that do not have
        // exactly two tab-separated fields (id and score).
        while (lineRecordReader.nextKeyValue()) {
            String line = lineRecordReader.getCurrentValue().toString().trim();
            String[] arr = line.split("\t");
            if (2 == arr.length) {
                key = new Text(arr[0]);
                firstClass = new FirstClass(arr[1]);
                return true;
            }
        }
        key = null;
        firstClass = null;
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public FirstClass getCurrentValue() throws IOException, InterruptedException {
        return firstClass;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return lineRecordReader.getProgress();
    }

    @Override
    public void close() throws IOException {
        if (null != lineRecordReader) {
            lineRecordReader.close();
            lineRecordReader = null;
        }
        key = null;
        firstClass = null;
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class FirstInputFormat extends FileInputFormat<Text, FirstClass> {

    @Override
    public RecordReader<Text, FirstClass> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new FirstClassReader();
    }
}
package cn.edu.bjut.multitwo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class SecondClass implements Writable {

    private String value;
    private int classNum;

    public SecondClass() {}

    public SecondClass(String value, int classNum) {
        this.value = value;
        this.classNum = classNum;
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(value);
        out.writeInt(classNum);
    }

    public void readFields(DataInput in) throws IOException {
        this.value = in.readUTF();
        this.classNum = in.readInt();
    }

    @Override
    public String toString() {
        return "SecondClass:" + value + ",ClassNum:" + classNum;
    }

    public String getValue() {
        return value;
    }

    public void setValue(String value) {
        this.value = value;
    }

    public int getClassNum() {
        return classNum;
    }

    public void setClassNum(int classNum) {
        this.classNum = classNum;
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class SecondClassReader extends RecordReader<Text, SecondClass> {

    // Delegate raw line reading to Hadoop's LineRecordReader and
    // parse each line into a (Text, SecondClass) pair.
    private LineRecordReader lineRecordReader = null;
    private Text key = null;
    private SecondClass secondClass = null;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        close();
        lineRecordReader = new LineRecordReader();
        lineRecordReader.initialize(split, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // Advance line by line, skipping malformed lines that do not have
        // exactly three tab-separated fields (id, name, and class number).
        while (lineRecordReader.nextKeyValue()) {
            String line = lineRecordReader.getCurrentValue().toString().trim();
            String[] arr = line.split("\t");
            if (3 == arr.length) {
                key = new Text(arr[0]);
                secondClass = new SecondClass(arr[1], Integer.parseInt(arr[2]));
                return true;
            }
        }
        key = null;
        secondClass = null;
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public SecondClass getCurrentValue() throws IOException, InterruptedException {
        return secondClass;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return lineRecordReader.getProgress();
    }

    @Override
    public void close() throws IOException {
        if (null != lineRecordReader) {
            lineRecordReader.close();
            lineRecordReader = null;
        }
        key = null;
        secondClass = null;
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SecondInputFormat extends FileInputFormat<Text, SecondClass> {

    @Override
    public RecordReader<Text, SecondClass> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new SecondClassReader();
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MultiMapper1 extends Mapper<Text, FirstClass, Text, Text> {

    @Override
    protected void map(Text key, FirstClass value, Context context)
            throws IOException, InterruptedException {
        context.write(key, new Text(value.toString()));
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MultiMapper2 extends Mapper<Text, SecondClass, Text, Text> {

    @Override
    protected void map(Text key, SecondClass value, Context context)
            throws IOException, InterruptedException {
        context.write(key, new Text(value.toString()));
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MultiReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        StringBuffer stringBuffer = new StringBuffer();
        for (Text text : values) {
            stringBuffer.append(text.toString()).append("; ");
        }
        context.write(key, new Text(stringBuffer.toString()));
    }
}
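Because both mappers emit the same key and value types (Text, Text), records from file a and file b that share an id arrive in the same reduce call, and the reducer simply concatenates them; this is what produces the joined lines shown in step 2. Note that MapReduce does not guarantee the order of values within a reduce call, which is why FirstClass and SecondClass may appear in either order in the output.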
package cn.edu.bjut.multitwo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MainJob {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "multi");
        job.setJarByClass(MainJob.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MultiReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Bind each input path to its own InputFormat and Mapper.
        MultipleInputs.addInputPath(job, new Path(args[0]), FirstInputFormat.class, MultiMapper1.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), SecondInputFormat.class, MultiMapper2.class);

        // Delete the output directory if it already exists, so the job can be rerun.
        Path outPath = new Path(args[2]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        job.waitForCompletion(true);
    }
}
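To run the job, package the classes into a jar (the name multi.jar below is only illustrative) and pass the two input paths in the same order as the addInputPath calls, followed by the output directory:

hadoop jar multi.jar cn.edu.bjut.multitwo.MainJob /input/a /input/b /output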