MapReduce---RandomSampler采样实现全排序

来源:互联网 发布:旋转门编程器plc 编辑:程序博客网 时间:2024/06/14 21:12

排序是MapReduce的核心技术,排序分为部分排序,全排序和二次排序。

部分排序:使用默认的HashPartitioner,无需额外操作;每个reduce输出内部的key是有序的,但不同reduce的输出之间不保证有序。

全排序:对reduce输出的所有的key实现排序

             方法1:只设置一个reduce

             方法2:自定义分区类实现全排序

            方法3 :使用采样        

下面以统计每年的最高气温为例进行演示:

注意:源文件是一个SequenceFile序列文件,键值类型为<IntWritable, IntWritable>

1、MaxTempMapper

package hadoop.mr.sort.total.totalorder;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.mapreduce.Mapper;import java.io.IOException;/** * MaxTempMapper */public class MaxTempMapper extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {protected void map(IntWritable key, IntWritable value, Context context) throws IOException, InterruptedException {context.write(key,value);}}
2、MaxTempReducer
package hadoop.mr.sort.total.totalorder;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.mapreduce.Reducer;import java.io.IOException;/** */public class MaxTempReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {int max = Integer.MIN_VALUE ;for(IntWritable iw : values){max = max > iw.get() ? max : iw.get() ;}context.write(key,new IntWritable(max));}}

3、App

package hadoop.mr.sort.total.totalorder;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.partition.InputSampler;import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;/** */public class App {public static void main(String[] args) throws Exception {args = new String[]{"d:/java/mr/data/temp.seq", "d:/java/mr/out"};Configuration conf = new Configuration();FileSystem fs = FileSystem.get(conf);if(fs.exists(new Path(args[1]))){fs.delete(new Path(args[1]),true);}Job job = Job.getInstance(conf);job.setJobName("maxTemp");job.setJarByClass(App.class);job.setMapperClass(MaxTempMapper.class);job.setReducerClass(MaxTempReducer.class);FileInputFormat.addInputPath(job,new Path(args[0]));FileOutputFormat.setOutputPath(job,new Path(args[1]));//设置combine输入格式job.setInputFormatClass(SequenceFileInputFormat.class);job.setPartitionerClass(TotalOrderPartitioner.class);job.setNumReduceTasks(3);job.setMapOutputKeyClass(IntWritable.class);job.setMapOutputValueClass(IntWritable.class);job.setOutputKeyClass(IntWritable.class);job.setOutputValueClass(IntWritable.class);TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),new Path("file:///d:/java/mr/par.seq"));//随机采样器InputSampler.RandomSampler<IntWritable,IntWritable> r = new InputSampler.RandomSampler<IntWritable, IntWritable>(1f,5,3);//创建分区文件InputSampler.writePartitionFile(job,r);job.waitForCompletion(true);}}


原创粉丝点击