MapReduce SecondarySort

来源：互联网 | 发布：淘宝上活动技巧 | 编辑：程序博客网 | 时间：2024/05/16 05:04
package wjj;import java.io.DataInput;import java.io.DataOutput;import java.io.IOException;import java.util.StringTokenizer;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text;import org.apache.hadoop.io.WritableComparable;import org.apache.hadoop.io.WritableComparator;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Partitioner;import org.apache.hadoop.mapreduce.Reducer;public class SecondarySort {/* * 这里新定义的类型为IntPair封装了两个个int型， * 依次存放两次排序的value。Hadoop要求key的类型必须实现Writable和Comparable，前者为了支持序列化和反序列化，后者为了实现基于比较的排序。 * 需要注意的是compareTo()方法中先按first升序排列，后按second排列。 * 我们可以构造一个复合类IntPair，他有两个字段，先利用分区对第一字段排序，再利用分区内的比较对第二字段排序。       所有自定义的key应该实现接口WritableComparable，因为是可序列的并且可比较的 *  *  * */public  static  class IntPair implements WritableComparable<IntPair>{int first;int second;public void set(int left,int right){first=left;second=right;}public int getFirst() {return first;}public void setFirst(int first) {this.first = first;}public int getSecond() {return second;}public void setSecond(int second) {this.second = second;} //反序列化，从流中的二进制转换成IntPair  @Overridepublic void readFields(DataInput in) throws IOException {// TODO Auto-generated method stubfirst=in.readInt();second=in.readInt();} //序列化，将IntPair转化成使用流传送的二进制  @Overridepublic void write(DataOutput out) throws IOException {// TODO Auto-generated method stubout.writeInt(first);            out.writeInt(second);//这里有的地方写的是write()方法，但是我在实际的测试过程中会抛出异常}@Overridepublic int compareTo(IntPair o) {// TODO Auto-generated method stubif(first!=o.first)return first<o.first?-1:1;else 
if(second!=o.second)return second<o.second?-1:1; else return 0;}}/* * 定义key后还不能满足需求。因为默认的HashPartitioner会将相同的key分配给同一个reduce， * 而我们希望的是first相同的key分给同一个reduce处理，默认的Partitioner显然保证不了这一点。 * 这就需要我们自定义Partitioner，实现first相同的key分配给同一个reduce。 * 只考虑first，不考虑second，这样就满足了我们的需求。这是key的第一次比较。 * */public static class FirstPartitioner extends Partitioner<IntPair,IntWritable>{@Overridepublic int getPartition(IntPair key, IntWritable value, int num) {// TODO Auto-generated method stubreturn Math.abs(key.getFirst()*127)%num; }}/* * 而我们希望first相同的key中，只获取第一个的second即可， * 其他数据可以忽略。这就需要数据执行reduce前按照key的first字段 * 进行归并，即grouping。first相同的key归为一个group， * 将第一个key和所有的value传给reduce()方法。 * 然后reduce将key输出即可实现目的。 * 为了实现这样的grouping操纵，需要自定义归并比较器 * （ValueGroupingComparator），  * *//*//第一种方法，实现接口RawComparator     public static class GroupingComparator implements RawComparator<IntPair> {         @Override         public int compare(IntPair o1, IntPair o2) {             int l = o1.getFirst();             int r = o2.getFirst();             return l == r ? 0 : (l < r ? -1 : 1);         }         @Override         //一个字节一个字节的比，直到找到一个不相同的字节，然后比这个字节的大小作为两个字节流的大小比较结果。         public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2){             // TODO Auto-generated method stub              return WritableComparator.compareBytes(b1, s1, Integer.SIZE/8,                       b2, s2, Integer.SIZE/8);         }     }*/   public static class GroupingComparator extends WritableComparator    {        protected GroupingComparator()        {            super(IntPair.class, true);        }        @Override        //Compare two WritableComparables.        public int compare(WritableComparable w1, WritableComparable w2)        {            IntPair ip1 = (IntPair) w1;            IntPair ip2 = (IntPair) w2;            int l = ip1.getFirst();            int r = ip2.getFirst();            return l == r ? 0 : (l < r ? 
-1 : 1);        }    }public static class Map extends Mapper<LongWritable,Text,IntPair,IntWritable>{private final IntPair intkey=new IntPair();private final IntWritable intvalue=new IntWritable();public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException{String line=value.toString();StringTokenizer tokenizer=new StringTokenizer(line);int left=0;int right=0;if(tokenizer.hasMoreTokens()){left=Integer.parseInt(tokenizer.nextToken());if(tokenizer.hasMoreTokens()){right=Integer.parseInt(tokenizer.nextToken());}intkey.set(left, right);intvalue.set(right);context.write(intkey, intvalue);}}}public static class Reduce extends Reducer<IntPair,IntWritable,Text,IntWritable>{private final Text left=new Text();private static  final Text SEPARATOR=new Text("--------------------");public void reduce(IntPair key,Iterable<IntWritable>values,Context context) throws IOException, InterruptedException{context.write(SEPARATOR, null);left.set(Integer.toString(key.getFirst()));for(IntWritable val:values){context.write(left, val);}}}public static void main(String[] args) throws Exception {   Configuration conf=new Configuration();   @SuppressWarnings("deprecation")   // 实例化一道作业     Job job=new Job(conf,"secondarysort");   job.setJarByClass(SecondarySort.class);      //Mapper   job.setMapperClass(Map.class);   //Reducer   // 不再需要Combiner类型，因为Combiner的输出类型<Text, IntWritable>对Reduce的输入类型<IntPair, IntWritable>不适用         //job.setCombinerClass(Reduce.class);     job.setReducerClass(Reduce.class);      job.setPartitionerClass(FirstPartitioner.class);   job.setGroupingComparatorClass(GroupingComparator.class);      job.setMapOutputKeyClass(IntPair.class);   job.setOutputKeyClass(Text.class);      job.setOutputValueClass(IntWritable.class);      job.setInputFormatClass(TextInputFormat.class);   job.setOutputFormatClass(TextOutputFormat.class);      FileInputFormat.setInputPaths(job, new Path(args[0]));   FileOutputFormat.setOutputPath(job, new 
Path(args[1]));   System.exit(job.waitForCompletion(true)?0:1);}}