MapReduce Notes & Demos
Getting the name of the file being read during the map phase
```java
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Get the name of the file the current input split belongs to
String curFileName = ((FileSplit) context.getInputSplit()).getPath().getName();
// Get the full file path
String curFilePath = ((FileSplit) context.getInputSplit()).getPath().toString();
```
A small question of my own: should this call go in the map() method or in setup()?
Putting it in setup() avoids re-fetching the file name on every map() call, but if the file name is fetched in setup(), could it return the wrong file name when CombineFileInputFormat is used? Hoping someone can clarify.
Setting counters in the map/reduce phases
```java
context.getCounter("GroupName", "FieldName").increment(1);
```
Writing to multiple directories / custom file names from reduce
```java
package com.demo;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

/**
 * @author chichuduxing
 * @date 2017-03-31 13:58:33
 */
public class MyReduce extends Reducer<Text, Text, Text, Text> {

    private MultipleOutputs<Text, Text> _mos;

    private long _count = 0L;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        _mos = new MultipleOutputs<Text, Text>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Register the named outputs when defining the job:
        // MultipleOutputs.addNamedOutput(job, "result", TextOutputFormat.class, Text.class, Text.class);
        // MultipleOutputs.addNamedOutput(job, "count", TextOutputFormat.class, Text.class, LongWritable.class);
        _count = 0;
        for (Text value : values) {
            _count++;
            context.write(key, value); // default output; file name: part-r-00000
            _mos.write("result", key, value); // file name like: result-r-00000
            _mos.write("result", key, value, key + "/"); // file name like: -r-00000,
            // grouped by key into a directory named after the key
        }
        _mos.write("count", key, new LongWritable(_count)); // file name like: count-r-00000
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Always remember to close the MultipleOutputs instance
        _mos.close();
    }
}
```
Setting up dependencies between multiple MapReduce jobs
```java
package com.demo;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;

/**
 * @author chichuduxing
 * @date 2017-03-31 13:39:18
 */
public class MyMain {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Job job1 = Job.getInstance(conf, "JOB1");
        Job job2 = Job.getInstance(conf, "JOB2");
        // detailed job configuration omitted

        ControlledJob controlledJob1 = new ControlledJob(job1.getConfiguration());
        controlledJob1.setJob(job1);
        ControlledJob controlledJob2 = new ControlledJob(job2.getConfiguration());
        controlledJob2.setJob(job2);

        // set the dependency: job2 runs only after job1 succeeds
        controlledJob2.addDependingJob(controlledJob1);

        // master controller
        JobControl jc = new JobControl("JobControl");
        jc.addJob(controlledJob1);
        jc.addJob(controlledJob2);

        Thread jcThread = new Thread(jc);
        jcThread.start();

        while (true) {
            if (jc.allFinished()) {
                System.out.println(jc.getSuccessfulJobList());
                jc.stop();
                return;
            }
            if (jc.getFailedJobList().size() > 0) {
                System.out.println(jc.getFailedJobList());
                jc.stop();
                return;
            }
        }
    }
}
```
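The rule JobControl enforces boils down to: a job becomes runnable once every job it depends on has finished successfully. The cluster-free sketch below illustrates that scheduling rule with plain JDK classes (SimpleJob and SimpleJobControl are illustrative names, not Hadoop APIs):

```java
import java.util.ArrayList;
import java.util.List;

// Minimal stand-in for a ControlledJob: a name, its dependencies, a done flag.
class SimpleJob {
    final String name;
    final List<SimpleJob> deps = new ArrayList<>();
    boolean done = false;
    final Runnable work;

    SimpleJob(String name, Runnable work) {
        this.name = name;
        this.work = work;
    }

    // Runnable once all dependencies have completed and it has not run yet
    boolean ready() {
        for (SimpleJob d : deps) {
            if (!d.done) return false;
        }
        return !done;
    }
}

public class SimpleJobControl {
    // Polls the job list and runs whatever is ready, like JobControl's loop
    public static List<String> runAll(List<SimpleJob> jobs) {
        List<String> order = new ArrayList<>();
        boolean progress = true;
        while (progress) {
            progress = false;
            for (SimpleJob j : jobs) {
                if (j.ready()) {
                    j.work.run();
                    j.done = true;
                    order.add(j.name);
                    progress = true;
                }
            }
        }
        return order;
    }

    public static void main(String[] args) {
        SimpleJob job1 = new SimpleJob("JOB1", () -> {});
        SimpleJob job2 = new SimpleJob("JOB2", () -> {});
        job2.deps.add(job1); // job2 depends on job1

        List<SimpleJob> all = new ArrayList<>();
        all.add(job2); // added out of order on purpose
        all.add(job1);
        System.out.println(runAll(all)); // prints [JOB1, JOB2]
    }
}
```

Even though job2 is added first, it only runs after job1 completes, which is exactly what addDependingJob guarantees in the real API.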
Using a custom type as a MapReduce key/value
To use a custom type as a value, implementing the Writable interface is enough; to use it as a key, it must implement WritableComparable.
If you add a custom constructor to your Writable class, be sure to keep the default no-argument constructor (Hadoop instantiates the class via reflection).
If TextOutputFormat will serialize instances of your custom Writable type, make sure the type has a meaningful toString() implementation.
When reading input, Hadoop may reuse a single instance of the Writable class. When populating fields inside readFields(), do not rely on the object's existing state.
```java
package com.bean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * @author chichuduxing
 * @date 2017-03-31 15:20:39
 */
public class MyValue implements WritableComparable<MyValue> {

    public String name;
    public int counter;
    public long timestamp;

    // Hadoop requires the no-argument constructor
    public MyValue() {
    }

    public MyValue(String name, int counter, long timestamp) {
        this.name = name;
        this.counter = counter;
        this.timestamp = timestamp;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Overwrite every field; never rely on existing state
        name = in.readUTF();
        counter = in.readInt();
        timestamp = in.readLong();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Must write fields in the same order readFields() reads them
        out.writeUTF(name);
        out.writeInt(counter);
        out.writeLong(timestamp);
    }

    @Override
    public int compareTo(MyValue compareValue) {
        if (this == compareValue) {
            return 0;
        }
        if (null == this.name && null != compareValue.name) {
            return -1;
        } else if (this.name == compareValue.name || this.name.equals(compareValue.name)) {
            if (this.counter == compareValue.counter) {
                if (this.timestamp == compareValue.timestamp) {
                    return 0;
                } else {
                    return this.timestamp > compareValue.timestamp ? 1 : -1;
                }
            } else {
                return this.counter > compareValue.counter ? 1 : -1;
            }
        } else if (null == compareValue.name) {
            return 1;
        } else {
            return this.name.compareTo(compareValue.name);
        }
    }

    @Override
    public String toString() {
        // TextOutputFormat uses toString() for the serialized record
        return new StringBuilder().append(name).append('\t').append(counter).append('\t')
                .append(timestamp).append('\t').toString();
    }
}
```
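Since the Writable contract is just a round trip through DataOutput/DataInput, it can be exercised without a cluster. The sketch below mirrors MyValue's field order using only JDK streams, and simulates Hadoop reusing one instance by reading into an object that already holds stale values (PlainValue and RoundTripDemo are illustrative names, not part of any API):

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;

// Plain-JDK mirror of MyValue's serialization: write() and readFields()
// use the same field order, and every field is overwritten on read.
class PlainValue {
    String name;
    int counter;
    long timestamp;

    void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeInt(counter);
        out.writeLong(timestamp);
    }

    void readFields(DataInput in) throws IOException {
        name = in.readUTF();      // overwrite; never merge with old state
        counter = in.readInt();
        timestamp = in.readLong();
    }
}

public class RoundTripDemo {
    public static void main(String[] args) throws IOException {
        PlainValue original = new PlainValue();
        original.name = "demo";
        original.counter = 3;
        original.timestamp = 1490944839000L;

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Hadoop reuses one instance per task; simulate that by reading
        // into an object that already holds different values.
        PlainValue reused = new PlainValue();
        reused.name = "stale";
        reused.counter = 99;
        reused.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // All stale state is replaced by the deserialized fields
        System.out.println(reused.name + "\t" + reused.counter + "\t" + reused.timestamp);
    }
}
```

If readFields() only conditionally assigned some fields, the "stale" values would leak into later records, which is exactly the instance-reuse pitfall noted above.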