MapReduce --- Implementing a Total Sort with a Custom Partitioner


Sorting is a core technique in MapReduce. It comes in three flavors: partial sort, total sort, and secondary sort.

Partial sort: this is what the default HashPartitioner gives you for free; no extra work is required, and the keys aggregated at each reducer arrive in sorted order.

Total sort: every key in the reduce output, across all reducers, is in sorted order. There are two ways to achieve it:

             Method 1: configure a single reducer (see the sketch after this list).

             Method 2: implement a total sort with a custom partitioner class.
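Method 1 needs no custom classes: with exactly one reducer, every key passes through a single sorted reduce phase, so the lone output file is globally sorted, at the cost of all reduce-side parallelism. A minimal sketch, as a one-line change in a driver like the App class shown in step 4:

// Method 1: send every key to a single reducer. Its single output
// file is globally sorted, but the reduce phase no longer scales.
job.setNumReduceTasks(1);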

The example below computes the maximum temperature for each year:

Note: the source file is a SequenceFile of <IntWritable, IntWritable> records, where the key is the year and the value is a temperature reading.
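The article does not show how temp.seq is produced. If you need test data, a throwaway generator along the following lines will do; the class name GenTempSeqFile, the output path, and the value ranges are assumptions for illustration only:

package hadoop.mr.sort.total.custom;

import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

/** Hypothetical test-data generator: writes random (year, temperature)
 *  pairs into a SequenceFile<IntWritable, IntWritable>. */
public class GenTempSeqFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("d:/java/mr/data/temp.seq"); // assumed path
        SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(path),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(IntWritable.class));
        Random rand = new Random();
        for (int i = 0; i < 10000; i++) {
            int year = 1900 + rand.nextInt(100); // spans all three partitions
            int temp = -30 + rand.nextInt(70);   // arbitrary temperature range
            writer.append(new IntWritable(year), new IntWritable(temp));
        }
        writer.close();
    }
}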

1. MaxTempMapper

package hadoop.mr.sort.total.custom;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * MaxTempMapper: an identity mapper. The input SequenceFile already holds
 * (year, temperature) pairs, so each record is forwarded unchanged and the
 * framework sorts the map output by the year key.
 */
public class MaxTempMapper extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void map(IntWritable key, IntWritable value, Context context)
            throws IOException, InterruptedException {
        context.write(key, value);
    }
}

2. MaxTempReducer

package hadoop.mr.sort.total.custom;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * MaxTempReducer: for each year, scans all temperature readings and emits
 * the maximum.
 */
public class MaxTempReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        for (IntWritable iw : values) {
            max = Math.max(max, iw.get());
        }
        context.write(key, new IntWritable(max));
    }
}
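Because taking a maximum is associative and commutative, MaxTempReducer could optionally double as a combiner, pre-aggregating map output locally to shrink the shuffle. This would be a one-line addition to the driver, not something the example requires:

// Optional: combine map output locally before the shuffle.
job.setCombinerClass(MaxTempReducer.class);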


3. YearPartitioner (a custom partitioner implementing the total sort)

package hadoop.mr.sort.total.custom;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * YearPartitioner: a custom partitioner implementing the total sort.
 * Years are split into three ordered ranges, one per reducer:
 * partition 0 for year < 1930, partition 1 for 1930..1960,
 * partition 2 for year > 1960.
 */
public class YearPartitioner extends Partitioner<IntWritable, IntWritable> {
    @Override
    public int getPartition(IntWritable key, IntWritable value, int numPartitions) {
        int year = key.get();
        if (year < 1930) {
            return 0;
        } else if (year > 1960) {
            return 2;
        }
        return 1;
    }
}
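This is where the total order comes from: the partition index increases with the key, so every year sent to partition 0 is smaller than every year in partition 1, which in turn is smaller than every year in partition 2. Since each reducer's output is itself sorted, reading part-r-00000, part-r-00001, and part-r-00002 in sequence yields a globally sorted result. One caveat: hand-picked boundaries such as 1930 and 1960 assume the years are spread roughly evenly; skewed data will overload some reducers.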

4. App

package hadoop.mr.sort.total.custom;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * App: the job driver.
 */
public class App {
    public static void main(String[] args) throws Exception {
        args = new String[]{"d:/java/mr/data/temp.seq", "d:/java/mr/out"};

        Configuration conf = new Configuration();

        // Remove the output directory if it already exists.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }

        Job job = Job.getInstance(conf);
        job.setJobName("maxTemp");
        job.setJarByClass(App.class);
        job.setMapperClass(MaxTempMapper.class);
        job.setReducerClass(MaxTempReducer.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Set the input format: the source is a SequenceFile.
        job.setInputFormatClass(SequenceFileInputFormat.class);

        // Use the custom partitioner so each reducer receives one year range.
        job.setPartitionerClass(YearPartitioner.class);

        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Must match the number of partitions produced by YearPartitioner.
        job.setNumReduceTasks(3);

        job.waitForCompletion(true);
    }
}
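When the job completes, d:/java/mr/out should contain three output files, part-r-00000 through part-r-00002, one per reducer. Each file holds (year, maximum temperature) pairs sorted by year within its range, so concatenating the three files in order gives the full, globally sorted result.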




