WritableComparator in the MapReduce Programming Model


WritableComparator is a class used in the MapReduce programming model for comparing and sorting keys.

MapReduce compares keys in two places: the first is on the map side, when records are partitioned and sorted in the circular spill buffer;

the second is on the reduce side, when the fetched map outputs are grouped before each reduce call.

This post is about the latter, the reduce-side grouping.
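These two phases are controlled by two separate hooks on Job. As a quick sketch (the comparator class names here are placeholders, not classes from this post):

// Sort comparator: controls the order keys are sorted in, both during the
// map-side spill/merge and the reduce-side merge.
job.setSortComparatorClass(MySortComparator.class);

// Grouping comparator: controls which adjacent (already sorted) keys are
// handed to a single Reducer#reduce call.
job.setGroupingComparatorClass(MyGroupingComparator.class);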

//Define the comparator that controls which keys are grouped together for a single call to Reducer#reduce
job.setGroupingComparatorClass(MyComparator.class);

This is the configuration we set when defining the job; it tells the framework how to group keys.

The parameter of setGroupingComparatorClass must be a class that implements RawComparator,

and WritableComparator implements RawComparator,

so we can define our own MyComparator simply by extending WritableComparator. (The full listing at the end also includes MyComparator2, which implements RawComparator directly.)

public static class MyComparator extends WritableComparator {
    public MyComparator() {
        // createInstances = true is essential: it allocates the key instances
        // and the DataInputBuffer used to deserialize the raw key bytes.
        super(Text.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Text a1 = (Text) a;
        Text b1 = (Text) b;
        // Nonzero means "different group": two adjacent "hello" keys are split
        // into separate reduce calls; every other pair of adjacent keys
        // returns 0 and is grouped together.
        if (a1.toString().equals("hello") && b1.toString().equals("hello")) {
            return -1;
        } else {
            return 0;
        }
    }
}
About the code above, I have to call out a pitfall that cost me several hours; I finally found the hint on Stack Overflow.

The no-argument constructor must call the superclass constructor with super(Text.class, true); otherwise you get a NullPointerException, because the internal buffer is never initialized.
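A minimal sketch of the pitfall (the class names are mine; this assumes a Hadoop 2.x WritableComparator, which has a protected no-argument constructor):

// Broken: the implicit super() call leaves key1, key2 and buffer as null,
// so the framework's byte-level compare() throws a NullPointerException.
public static class BrokenComparator extends WritableComparator {
    public BrokenComparator() {
        // no super(Text.class, true) here -- buffer is never created
    }
}

// Fixed: createInstances = true makes the superclass allocate key1, key2
// and the DataInputBuffer used to deserialize the raw key bytes.
public static class FixedComparator extends WritableComparator {
    public FixedComparator() {
        super(Text.class, true);
    }
}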

Digging into the source confirms it:

protected WritableComparator(Class<? extends WritableComparable> keyClass,
                             Configuration conf,
                             boolean createInstances) {
  this.keyClass = keyClass;
  this.conf = (conf != null) ? conf : new Configuration();
  if (createInstances) {
    key1 = newKey();
    key2 = newKey();
    buffer = new DataInputBuffer();
  } else {
    key1 = key2 = null;
    buffer = null;
  }
}

The NullPointerException says buffer is null, and this constructor is the only place in the whole class where buffer gets initialized.
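Concretely, the NPE surfaces in WritableComparator's byte-level compare, which deserializes both keys through buffer before delegating to the object-level compare. From the Hadoop 2.x source (quoted from memory, lightly abridged; exact lines vary by version):

@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
  try {
    buffer.reset(b1, s1, l1);   // NPE here if buffer was never created
    key1.readFields(buffer);
    buffer.reset(b2, s2, l2);
    key2.readFields(buffer);
    buffer.reset(null, 0, 0);   // drop the reference to the byte array
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return compare(key1, key2);
}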


Finally, let's look at the result:

hdfs@yksp005206:/home/jumpserver$ hadoop fs -cat /test/wc/output/part-r-00000
hello value[] hello,
hello value[] hello,
world value[] hello,hellp,hive,kylin,spark,world,

From the output, the hello keys were indeed not grouped into a single reduce call, while all the other words were handed to the same reduce call.
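To see why the output looks like this, remember that the grouping comparator is only applied to adjacent keys in sorted order. Here is a small offline simulation of the framework's grouping decisions (GroupingTrace is a throwaway name of mine, not part of the job; it assumes the input contained three occurrences of hello, which is consistent with the output above):

import org.apache.hadoop.io.Text;

public class GroupingTrace {
    public static void main(String[] args) {
        String[] sorted = {"hello", "hello", "hello", "hellp", "hive", "kylin", "spark", "world"};
        WorldCount2.MyComparator cmp = new WorldCount2.MyComparator();
        for (int i = 1; i < sorted.length; i++) {
            Text prev = new Text(sorted[i - 1]);
            Text cur = new Text(sorted[i]);
            // Nonzero starts a new group (a new reduce call);
            // 0 keeps cur in the current group.
            String decision = cmp.compare(prev, cur) != 0 ? "new group" : "same group";
            System.out.println(prev + " vs " + cur + " -> " + decision);
        }
    }
}

This yields three groups: {hello}, {hello}, and {hello, hellp, hive, kylin, spark, world}, exactly matching the three output lines. The third line is keyed world rather than hello because Hadoop reuses the key object while the reducer iterates over the values, so by the time the output is written the key holds the last key of the group.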



For completeness, here is the full program:

package com.hit.ee;

/**
 * Created by zh on 2017/9/28.
 */

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.StringTokenizer;

public class WorldCount2 {

    // Emits each word as both key and value, so the reducer can show
    // which raw keys ended up in each group.
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, Text> {
        private Text word = new Text();

        public void map(Object key, Text value, Context context
        ) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, word);
            }
        }
    }

    // Concatenates all values of a group, making the grouping visible.
    public static class IntSumReducer
            extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values,
                           Context context
        ) throws IOException, InterruptedException {
            Text text = new Text();
            StringBuffer sb = new StringBuffer("value[] ");
            for (Text value : values) {
                sb.append(value).append(",");
            }
            text.set(sb.toString());
            context.write(key, text);
        }
    }

    // Alternative: implement RawComparator directly instead of extending
    // WritableComparator; here we manage our own DataInputBuffer.
    public static class MyComparator2 implements RawComparator<Text> {
        DataInputBuffer buffer = new DataInputBuffer();

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            try {
                Text a = new Text();
                buffer.reset(b1, s1, l1);
                a.readFields(buffer);
                Text b = new Text();
                buffer.reset(b2, s2, l2);
                b.readFields(buffer);
                return compare(a, b);
            } catch (IOException e) {
                e.printStackTrace();
            }
            return -1;
        }

        @Override
        public int compare(Text a1, Text b1) {
            if (a1.toString().equals("hello") && b1.toString().equals("hello")) {
                return -1;
            } else {
                return 0;
            }
        }
    }

    public static class MyComparator extends WritableComparator {
        public MyComparator() {
            super(Text.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            Text a1 = (Text) a;
            Text b1 = (Text) b;
            if (a1.toString().equals("hello") && b1.toString().equals("hello")) {
                return -1;
            } else {
                return 0;
            }
        }
    }

    public static class MyPartitioner extends Partitioner<Text, Text> {
        @Override
        public int getPartition(Text key, Text value, int numPartitions) {
            if (key.toString().equals("hello"))
                return 0;
            else
                return 1;
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem.get(conf).deleteOnExit(new Path(args[1]));
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WorldCount2.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(IntSumReducer.class);
//        job.setNumReduceTasks(2);
//        job.setPartitionerClass(MyPartitioner.class);
        //Define the comparator that controls how the keys are sorted before they are passed to the reducer
        //job.setSortComparatorClass(MyComparator.class);
        //Define the comparator that controls which keys are grouped together for a single call to Reducer#reduce
        job.setGroupingComparatorClass(MyComparator.class);
        job.setMapOutputValueClass(Text.class);
        job.setMapOutputKeyClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
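To reproduce the run above, package the class and submit it along these lines (the jar name and input path are placeholders; the output path matches the hadoop fs -cat command earlier):

hadoop jar wordcount2.jar com.hit.ee.WorldCount2 /test/wc/input /test/wc/output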