Sorting Big Data with Hadoop MapReduce


1. MapReduce is naturally suited to sorting because of its shuffle phase. When the data volume is small, we can simply set the number of reduce tasks to 1 and get a fully sorted result. But when the data is large, a single reduce task either cannot cope with it or takes too long, so we have to rethink the sort and use multiple reduce tasks. (A sketch of the trivial single-reducer case follows below.)
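For the small-data case the driver really only needs one reduce task: the shuffle then delivers every key to that single reducer already sorted. Below is a minimal sketch of such a driver; the class name SingleReducerSortJob is hypothetical, and it assumes it lives in the same package as the SortJob listing further down so it can reuse that listing's SortMapper and SortReducer.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical single-reducer driver: with numReduceTasks = 1 the shuffle alone
// produces one globally sorted output file (ascending by key); no custom
// partitioner or comparator is required.
public class SingleReducerSortJob {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "single-reducer sort");
        job.setJarByClass(SingleReducerSortJob.class);
        job.setMapperClass(SortJob.SortMapper.class);   // reuses the mapper from the listing below
        job.setReducerClass(SortJob.SortReducer.class); // reuses the reducer from the listing below
        job.setNumReduceTasks(1);                       // one reducer => total order in part-r-00000
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}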

2. Suppose our data looks like this, one number per line, for example:

61
56
43
65
15
54
93
47
56
24
65
90
93
57
25
95
36

  The map reads the file line by line and emits each number directly as the key to the reduce. Since there is no real value, we can use NullWritable as a stand-in, so the map output looks roughly like this:

(2, null)  (3, null)  (10, null)

3. The following is the most important step. We need to decide how to partition based on the number of reduce tasks, i.e. write a custom partitioner so that the results fall into several intervals. For example, say values greater than 50 should form one interval and we have 3 reduce tasks in total; then the data ends up in three intervals: values greater than 50 go straight to partition 0, values between 25 and 50 go to partition 1, and values below 25 go to partition 2. Because the number of partitions equals the number of reduce tasks, each partition maps to a different reduce task; partitions are numbered from 0, so partition 0 is handled by the first reduce task, partition 1 by the second, and so on. Each reduce task also corresponds to one output file, so the first reduce task writes part-r-00000, the second writes part-r-00001, and so on; the reduce function itself only needs to write each key (with its NullWritable value) straight back out. In the end the largest values land in the first output file, and the overall order is simply the order of the file names. We also need a custom Comparator so that each reduce task's output is sorted in descending order. The code follows:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SortJob {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            System.out.println("Wrong number of arguments; at least two are required: <output path> <input path...>");
            System.exit(1);
        }
        String dataOutput = args[0];
        String[] inputs = new String[args.length - 1];
        System.arraycopy(args, 1, inputs, 0, inputs.length);

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "sort test");
        job.setJarByClass(SortJob.class);
        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReducer.class);
        job.setSortComparatorClass(SortComparator.class);
        job.setPartitionerClass(SortPartitoner.class);
        job.setNumReduceTasks(3);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(NullWritable.class);

        Path[] inputPathes = new Path[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputPathes[i] = new Path(inputs[i]);
        }
        Path outputPath = new Path(dataOutput);
        FileInputFormat.setInputPaths(job, inputPathes);
        FileOutputFormat.setOutputPath(job, outputPath);
        job.waitForCompletion(true);
    }

    static class SortMapper extends Mapper<LongWritable, Text, IntWritable, NullWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Each input line is one number; emit it as the key with a NullWritable value.
            IntWritable in = new IntWritable(Integer.parseInt(value.toString()));
            context.write(in, NullWritable.get());
        }
    }

    static class SortReducer extends Reducer<IntWritable, NullWritable, IntWritable, NullWritable> {
        @Override
        public void reduce(IntWritable key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // Write the key back out once per occurrence so duplicates are preserved.
            for (NullWritable value : values) {
                System.out.println(key.toString() + value.toString()); // debug output
                context.write(key, NullWritable.get());
            }
        }
    }

    static class SortPartitoner<K, V> extends Partitioner<K, V> {
        @Override
        public int getPartition(K key, V value, int numReduceTasks) {
            int maxValue = 100;
            int keySection = 0;
            // Only keys below maxValue need the interval logic, and only when there is
            // more than one reduce task; keys of maxValue or above go straight to partition 0.
            if (numReduceTasks > 1 && key.hashCode() < maxValue) {
                int sectionValue = maxValue / (numReduceTasks - 1);
                int count = 0;
                while ((key.hashCode() - sectionValue * count) > sectionValue) {
                    count++;
                }
                // Larger keys get smaller partition numbers, matching the descending sort order.
                keySection = numReduceTasks - 1 - count;
            }
            return keySection;
        }
    }

    static class SortComparator extends WritableComparator {
        protected SortComparator() {
            super(IntWritable.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            // Negate the default ascending comparison so keys are sorted in descending order.
            return -super.compare(a, b);
        }
    }
}
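To make the interval logic concrete (note that the code uses maxValue = 100 with evenly sized sections rather than the 25/50 thresholds used for illustration above), here is a small sketch that feeds a few sample keys through SortPartitoner with 3 reduce tasks and prints which partition each lands in. The class name PartitionTrace is hypothetical, and it assumes it sits in the same package as SortJob.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;

// Hypothetical trace of the partitioner: with maxValue = 100 and 3 reduce tasks,
// keys 0-50 land in partition 2, keys 51-99 in partition 1, and keys of 100 or
// more (which skip the interval logic) in partition 0.
public class PartitionTrace {
    public static void main(String[] args) {
        SortJob.SortPartitoner<IntWritable, NullWritable> partitioner =
                new SortJob.SortPartitoner<IntWritable, NullWritable>();
        int[] samples = {5, 30, 55, 80, 120};
        for (int v : samples) {
            int p = partitioner.getPartition(new IntWritable(v), NullWritable.get(), 3);
            System.out.println(v + " -> partition " + p); // prints 5 -> 2, 30 -> 2, 55 -> 1, 80 -> 1, 120 -> 0
        }
    }
}

Combined with SortComparator's descending order inside each reduce task, part-r-00000 ends up holding the largest values and part-r-00002 the smallest, so reading the output files in name order yields the complete descending sort.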

