Sorting Big Data with Hadoop MapReduce


1. MapReduce is naturally suited to sorting because of its shuffle phase. When the data volume is small, we can simply set the number of reduce tasks to 1 and get a fully sorted result. But when the data is large, a single reduce task either cannot cope with it or takes too long, so we have to rethink the sort and use multiple reduce tasks. (A sketch of the trivial single-reducer case follows below.)
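For the small-data case the driver really only needs one reduce task: the shuffle then delivers every key to that single reducer already sorted. Below is a minimal sketch of such a driver; the class name SingleReducerSortJob is hypothetical, and it assumes it lives in the same package as the SortJob listing further down so it can reuse that listing's SortMapper and SortReducer.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical single-reducer driver: with numReduceTasks = 1 the shuffle alone
// produces one globally sorted output file (ascending by key); no custom
// partitioner or comparator is required.
public class SingleReducerSortJob {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "single-reducer sort");
        job.setJarByClass(SingleReducerSortJob.class);
        job.setMapperClass(SortJob.SortMapper.class);   // reuses the mapper from the listing below
        job.setReducerClass(SortJob.SortReducer.class); // reuses the reducer from the listing below
        job.setNumReduceTasks(1);                       // one reducer => total order in part-r-00000
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}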

2. Suppose our data looks like this, one number per line, for example:

61
56
43
65
15
54
93
47
56
24
65
90
93
57
25
95
36

  The map reads the file line by line and emits each number directly as the key to the reduce. Since there is no real value, we can use NullWritable as a stand-in, so the map output looks roughly like this:

(2, null)  (3, null)  (10, null)

3. The following is the most important step. We need to decide how to partition based on the number of reduce tasks, i.e. write a custom partitioner so that the results fall into several intervals. For example, say values greater than 50 should form one interval and we have 3 reduce tasks in total; then the data ends up in three intervals: values greater than 50 go straight to partition 0, values between 25 and 50 go to partition 1, and values below 25 go to partition 2. Because the number of partitions equals the number of reduce tasks, each partition maps to a different reduce task; partitions are numbered from 0, so partition 0 is handled by the first reduce task, partition 1 by the second, and so on. Each reduce task also corresponds to one output file, so the first reduce task writes part-r-00000, the second writes part-r-00001, and so on; the reduce function itself only needs to write each key (with its NullWritable value) straight back out. In the end the largest values land in the first output file, and the overall order is simply the order of the file names. We also need a custom Comparator so that each reduce task's output is sorted in descending order. The code follows:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SortJob {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            System.out.println("Wrong number of arguments; at least two are required: <output path> <input path...>");
            System.exit(1);
        }
        String dataOutput = args[0];
        String[] inputs = new String[args.length - 1];
        System.arraycopy(args, 1, inputs, 0, inputs.length);

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "sort test");
        job.setJarByClass(SortJob.class);
        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReducer.class);
        job.setSortComparatorClass(SortComparator.class);
        job.setPartitionerClass(SortPartitoner.class);
        job.setNumReduceTasks(3);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(NullWritable.class);

        Path[] inputPathes = new Path[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputPathes[i] = new Path(inputs[i]);
        }
        Path outputPath = new Path(dataOutput);
        FileInputFormat.setInputPaths(job, inputPathes);
        FileOutputFormat.setOutputPath(job, outputPath);
        job.waitForCompletion(true);
    }

    static class SortMapper extends Mapper<LongWritable, Text, IntWritable, NullWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Each input line is one number; emit it as the key with a NullWritable value.
            IntWritable in = new IntWritable(Integer.parseInt(value.toString()));
            context.write(in, NullWritable.get());
        }
    }

    static class SortReducer extends Reducer<IntWritable, NullWritable, IntWritable, NullWritable> {
        @Override
        public void reduce(IntWritable key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // Write the key back out once per occurrence so duplicates are preserved.
            for (NullWritable value : values) {
                System.out.println(key.toString() + value.toString()); // debug output
                context.write(key, NullWritable.get());
            }
        }
    }

    static class SortPartitoner<K, V> extends Partitioner<K, V> {
        @Override
        public int getPartition(K key, V value, int numReduceTasks) {
            int maxValue = 100;
            int keySection = 0;
            // Only keys below maxValue need the interval logic, and only when there is
            // more than one reduce task; keys of maxValue or above go straight to partition 0.
            if (numReduceTasks > 1 && key.hashCode() < maxValue) {
                int sectionValue = maxValue / (numReduceTasks - 1);
                int count = 0;
                while ((key.hashCode() - sectionValue * count) > sectionValue) {
                    count++;
                }
                // Larger keys get smaller partition numbers, matching the descending sort order.
                keySection = numReduceTasks - 1 - count;
            }
            return keySection;
        }
    }

    static class SortComparator extends WritableComparator {
        protected SortComparator() {
            super(IntWritable.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            // Negate the default ascending comparison so keys are sorted in descending order.
            return -super.compare(a, b);
        }
    }
}
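To make the interval logic concrete (note that the code uses maxValue = 100 with evenly sized sections rather than the 25/50 thresholds used for illustration above), here is a small sketch that feeds a few sample keys through SortPartitoner with 3 reduce tasks and prints which partition each lands in. The class name PartitionTrace is hypothetical, and it assumes it sits in the same package as SortJob.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;

// Hypothetical trace of the partitioner: with maxValue = 100 and 3 reduce tasks,
// keys 0-50 land in partition 2, keys 51-99 in partition 1, and keys of 100 or
// more (which skip the interval logic) in partition 0.
public class PartitionTrace {
    public static void main(String[] args) {
        SortJob.SortPartitoner<IntWritable, NullWritable> partitioner =
                new SortJob.SortPartitoner<IntWritable, NullWritable>();
        int[] samples = {5, 30, 55, 80, 120};
        for (int v : samples) {
            int p = partitioner.getPartition(new IntWritable(v), NullWritable.get(), 3);
            System.out.println(v + " -> partition " + p); // prints 5 -> 2, 30 -> 2, 55 -> 1, 80 -> 1, 120 -> 0
        }
    }
}

Combined with SortComparator's descending order inside each reduce task, part-r-00000 ends up holding the largest values and part-r-00002 the smallest, so reading the output files in name order yields the complete descending sort.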

