回顾Hadoop二次排序
来源:互联网 发布:肩膀酸痛知乎 编辑:程序博客网 时间:2024/04/29 15:29
MapReduce二次排序原理:
1.在map的时候会通过InputFormat的getSplits来把数据集分割成splits
2.InputFormat会提供RecordReader来读取每一条的Record,读取之后传送给map来接受处理。
3.在Map阶段最后会通过Partitioner对Mapper的计算结果进行分区。可以通过job的setPartitionerClass来自定义partition的实现。
4.在每个partition分区内可以调用Job的setSortComparatorClass来对分区内基于key比较函数进行排序。如果没有设置则会调用key的compareTo方法
5.在Reducer端会接收到所有Mapper端中属于自己的数据,其实我们可以通过Job的setSortComparatorClass来对当前Reducer收到的所有数据基于Key比较函数进行排序。由于Reducer端每个key对应的是Value list,因此需要使用Job的SetGroupingComparatorClass来设置分组函数的类。
6.最后调用Reduce对自己收到的所有数据进行最后的处理。
package com.dtspark;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Hadoop secondary-sort example: input lines are "first\tsecond" pairs.
 * A composite key (first, second) is emitted by the mapper; a custom
 * partitioner and grouping comparator make sure all records with the same
 * "first" land in one reduce group, while the sort comparator orders them
 * by (first, second) so each group's values arrive sorted.
 */
public class MutlpleSort {

    /** Parses each tab-separated line into a composite key and an int value. */
    public static class MyMapper extends Mapper<LongWritable, Text, MyselfKey, IntWritable> {
        // Reused across map() calls to avoid per-record allocation (standard Hadoop idiom).
        private final MyselfKey mapKey = new MyselfKey();
        private final IntWritable intWritable = new IntWritable(0);

        // BUG FIX: the original declared map(Object key, ...), which does NOT
        // override Mapper<LongWritable,...>.map — Hadoop would run the inherited
        // identity mapper and fail with an output type mismatch at runtime.
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            mapKey.setFirst(splits[0]);
            mapKey.setSecond(splits[1]);
            // NOTE(review): assumes the second column parses as an int — TODO confirm input format.
            intWritable.set(Integer.valueOf(splits[1]));
            context.write(mapKey, intWritable);
        }
    }

    /** Joins the (already sorted) values of one group into a comma-separated list. */
    public static class MyReducer extends Reducer<MyselfKey, IntWritable, Text, Text> {
        @Override
        public void reduce(MyselfKey key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // StringBuilder (not the synchronized StringBuffer) — no sharing here.
            StringBuilder buffer = new StringBuilder();
            for (IntWritable vv : values) {
                buffer.append(vv.get()).append(',');
            }
            // Drop the trailing comma; guard against an empty group instead of
            // throwing StringIndexOutOfBoundsException as the original would.
            if (buffer.length() > 0) {
                buffer.setLength(buffer.length() - 1);
            }
            context.write(new Text(key.getFirst()), new Text(buffer.toString()));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: EventCount <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "event count");
        job.setJarByClass(MutlpleSort.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(MyselfKey.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setGroupingComparatorClass(MyselfGroupComparator.class);
        job.setPartitionerClass(MyselfSortPartition.class);
        job.setSortComparatorClass(MyselfSortingComparator.class);
        job.setOutputKeyClass(Text.class);
        // BUG FIX: MyReducer emits Text values, not IntWritable.
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

/**
 * Composite map-output key holding the (first, second) string pair.
 * Natural order is by first, then second — consistent with
 * {@link MyselfSortingComparator}.
 */
// BUG FIX: was "implements WritableComparable<IntPair>" — IntPair is not
// defined anywhere in this file, so the original did not compile.
class MyselfKey implements WritableComparable<MyselfKey> {
    private String first;
    private String second;

    /** No-arg constructor required by Hadoop serialization. */
    public MyselfKey() {
    }

    public MyselfKey(String first, String second) {
        super();
        this.first = first;
        this.second = second;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.first);
        out.writeUTF(this.second);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.first = in.readUTF();
        this.second = in.readUTF();
    }

    // BUG FIX: the original returned 0 unconditionally, declaring every key
    // equal and breaking any code path that relies on the key's natural order.
    @Override
    public int compareTo(MyselfKey o) {
        int cmp = this.first.compareTo(o.first);
        return cmp != 0 ? cmp : this.second.compareTo(o.second);
    }

    public String getFirst() {
        return first;
    }

    public void setFirst(String first) {
        this.first = first;
    }

    public String getSecond() {
        return second;
    }

    public void setSecond(String second) {
        this.second = second;
    }
}

/** Shuffle sort comparator: orders keys by first, then by second. */
class MyselfSortingComparator extends WritableComparator {
    public MyselfSortingComparator() {
        // "true" asks WritableComparator to instantiate keys for deserialization.
        super(MyselfKey.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyselfKey key1 = (MyselfKey) a;
        MyselfKey key2 = (MyselfKey) b;
        if (!key1.getFirst().equals(key2.getFirst())) {
            return key1.getFirst().compareTo(key2.getFirst());
        } else {
            return key1.getSecond().compareTo(key2.getSecond());
        }
    }
}

/** Grouping comparator: keys with the same "first" fall into one reduce group. */
class MyselfGroupComparator extends WritableComparator {
    public MyselfGroupComparator() {
        super(MyselfKey.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyselfKey key1 = (MyselfKey) a;
        MyselfKey key2 = (MyselfKey) b;
        return key1.getFirst().compareTo(key2.getFirst());
    }
}

/** Partitions on "first" only, so one reducer sees all records of a group. */
class MyselfSortPartition extends Partitioner<MyselfKey, IntWritable> {
    @Override
    public int getPartition(MyselfKey key, IntWritable value, int numPartitions) {
        // BUG FIX: Math.abs(hash * 127) is negative when the product is
        // Integer.MIN_VALUE, yielding an illegal negative partition index.
        // Masking the sign bit guarantees a non-negative result.
        return ((key.getFirst().hashCode() * 127) & Integer.MAX_VALUE) % numPartitions;
    }
}
1 0
- 回顾Hadoop二次排序
- hadoop 二次排序
- Hadoop二次排序
- Hadoop二次排序
- Hadoop二次排序
- hadoop二次排序
- hadoop之二次排序
- hadoop二次排序一
- hadoop二次排序二
- hadoop二次排序三
- Hadoop二次排序
- hadoop的二次排序
- Hadoop二次排序<转>
- Hadoop二次排序
- Hadoop二次排序
- hadoop二次排序
- hadoop 二次排序
- hadoop二次排序<转>
- 数据结构之快速排序的Java实现
- 【记录】kernel打补丁,编译
- 权限系统 拾遗
- ffmpeg中的sws_scale算法性能测试
- LoadRunner常用知识点-----LoadRunner日志输出
- 回顾Hadoop二次排序
- 2038: [2009国家集训队]小Z的袜子(hose) (莫队算法)
- 初学linux设备驱动遇到的编译和操作问题解决
- 课堂笔记2016.8.5
- clone学习
- CSV操作(导出和导入)
- UVA12563 0-1背包变形
- JavaScript基本类型和引用类型
- 程序优化