回顾Hadoop二次排序

来源:互联网 发布:肩膀酸痛知乎 编辑:程序博客网 时间:2024/04/29 15:29

MapReduce二次排序原理:

1.在map的时候会通过InputFormat的getSplits来把数据集分割成splits

2.InputFormat会提供RecordReader来读取每一条的Record,读取之后传送给map来接受处理。

3.在Map阶段最后会通过Partitioner对Mapper的计算结果进行分区。可以通过job的setPartitionerClass来自定义partition的实现。

4.在每个partition分区内可以调用Job的setSortComparatorClass来对分区内基于key比较函数进行排序。如果没有设置则会调用key的compareTo方法

5.在Reducer端会接收到所有Mapper端中属于自己的数据,其实我们可以通过Job的setSortComparatorClass来对当前Reducer收到的所有数据基于Key比较函数进行排序。由于Reducer端每个key对应的是Value list,因此需要使用Job的SetGroupingComparatorClass来设置分组函数的类。

6.最后调用Reduce对自己收到的所有数据进行最后的处理。



package com.dtspark;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Secondary-sort example job.
 *
 * <p>Input lines are tab-separated {@code first<TAB>second} (second must parse as an int).
 * Records are partitioned and grouped by {@code first} but sorted by the composite
 * (first, second) key, so each reducer group sees its values in secondary-sorted order.
 */
public class MutlpleSort {

    public static class MyMapper extends Mapper<LongWritable, Text, MyselfKey, IntWritable> {

        // Reused across map() calls to avoid one allocation per record (standard Hadoop idiom).
        private final MyselfKey mapKey = new MyselfKey();
        private final IntWritable intWritable = new IntWritable(0);

        /**
         * Emits (composite key, second field as int) for each tab-separated line.
         *
         * <p>BUG FIX: the original declared {@code map(Object key, ...)}, which OVERLOADS
         * rather than OVERRIDES {@code Mapper.map(LongWritable, ...)}; Hadoop would never
         * call it and would silently run the identity mapper. The key parameter must be
         * {@code LongWritable} to match KEYIN.
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            mapKey.setFirst(splits[0]);
            mapKey.setSecond(splits[1]);
            intWritable.set(Integer.parseInt(splits[1]));
            context.write(mapKey, intWritable);
        }
    }

    public static class MyReducer extends Reducer<MyselfKey, IntWritable, Text, Text> {

        /**
         * Joins all values of one group (records sharing the same {@code first} field,
         * per {@link MyselfGroupComparator}) into a comma-separated string. Values arrive
         * already ordered by the secondary field thanks to the sort comparator.
         */
        @Override
        public void reduce(MyselfKey key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder buffer = new StringBuilder();
            for (IntWritable vv : values) {
                buffer.append(vv.get()).append(',');
            }
            // Every group has at least one value, so dropping the trailing comma is safe.
            context.write(new Text(key.getFirst()),
                    new Text(buffer.substring(0, buffer.length() - 1)));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            // BUG FIX: usage text referred to an unrelated "EventCount" example.
            System.err.println("Usage: MutlpleSort <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "secondary sort");
        job.setJarByClass(MutlpleSort.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(MyselfKey.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setGroupingComparatorClass(MyselfGroupComparator.class);
        job.setPartitionerClass(MyselfSortPartition.class);
        job.setSortComparatorClass(MyselfSortingComparator.class);
        job.setOutputKeyClass(Text.class);
        // BUG FIX: MyReducer emits Text values, not IntWritable.
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

/**
 * Composite map-output key (first, second).
 *
 * <p>BUG FIX: the original declared {@code WritableComparable<IntPair>} — an undefined
 * type — and its {@code compareTo} always returned 0, which breaks sorting whenever no
 * explicit sort comparator is configured. The natural order is now (first, second),
 * consistent with {@link MyselfSortingComparator}.
 */
class MyselfKey implements WritableComparable<MyselfKey> {

    private String first;
    private String second;

    /** Required no-arg constructor for Writable deserialization. */
    public MyselfKey() {
    }

    public MyselfKey(String first, String second) {
        this.first = first;
        this.second = second;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.first);
        out.writeUTF(this.second);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.first = in.readUTF();
        this.second = in.readUTF();
    }

    /**
     * Orders by first field, then by second.
     * NOTE(review): the second field is compared as a String (lexicographic, so "10" < "2"),
     * matching the original comparators — confirm numeric order is not required.
     */
    @Override
    public int compareTo(MyselfKey o) {
        int cmp = this.first.compareTo(o.first);
        return cmp != 0 ? cmp : this.second.compareTo(o.second);
    }

    // equals/hashCode added so the key honors the compareTo-consistency contract.
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof MyselfKey)) {
            return false;
        }
        MyselfKey other = (MyselfKey) obj;
        return first.equals(other.first) && second.equals(other.second);
    }

    @Override
    public int hashCode() {
        return first.hashCode() * 31 + second.hashCode();
    }

    public String getFirst() {
        return first;
    }

    public void setFirst(String first) {
        this.first = first;
    }

    public String getSecond() {
        return second;
    }

    public void setSecond(String second) {
        this.second = second;
    }
}

/** Sort comparator: orders map output by first field, then second. */
class MyselfSortingComparator extends WritableComparator {

    public MyselfSortingComparator() {
        super(MyselfKey.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyselfKey key1 = (MyselfKey) a;
        MyselfKey key2 = (MyselfKey) b;
        if (!key1.getFirst().equals(key2.getFirst())) {
            return key1.getFirst().compareTo(key2.getFirst());
        }
        return key1.getSecond().compareTo(key2.getSecond());
    }
}

/** Grouping comparator: the reducer groups records by the first field only. */
class MyselfGroupComparator extends WritableComparator {

    public MyselfGroupComparator() {
        super(MyselfKey.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyselfKey key1 = (MyselfKey) a;
        MyselfKey key2 = (MyselfKey) b;
        return key1.getFirst().compareTo(key2.getFirst());
    }
}

/**
 * Partitions by the first field so all records of one group reach the same reducer.
 *
 * <p>BUG FIX: the original used {@code Math.abs(hash * 127) % numPartitions}; when the
 * multiplication overflows to {@code Integer.MIN_VALUE}, {@code Math.abs} returns it
 * unchanged and the modulo yields a negative (invalid) partition index. Masking the
 * sign bit guarantees a non-negative result.
 */
class MyselfSortPartition extends Partitioner<MyselfKey, IntWritable> {

    @Override
    public int getPartition(MyselfKey key, IntWritable value, int numPartitions) {
        return (key.getFirst().hashCode() * 127 & Integer.MAX_VALUE) % numPartitions;
    }
}



1 0