Hadoop Custom Grouping (GroupingComparator)


Custom grouping with MyGroup:


The main idea is to extend the WritableComparator class and override its compare method.


The method I override is this one from the Hadoop source:
/** Compare two WritableComparables.
 *
 * <p>The default implementation uses the natural ordering, calling {@link
 * Comparable#compareTo(Object)}. */
@SuppressWarnings("unchecked")
public int compare(WritableComparable a, WritableComparable b) {
  return a.compareTo(b);
}
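As an aside that is not from the original post: a grouping comparator does not have to choose between the default behavior and putting everything in one group. It can compare just part of the key. Below is a minimal sketch of my own (FirstLetterGroupComparator is a hypothetical name, and it assumes non-empty Text keys): keys that share a first letter land in the same reduce() call. This is safe because the full lexicographic sort order already places keys with the same first letter next to each other.

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical middle ground: group keys by their first character only.
public class FirstLetterGroupComparator extends WritableComparator {
    public FirstLetterGroupComparator() {
        // true: instantiate Text keys so the raw bytes can be deserialized
        super(Text.class, true);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // compare on the first character only; a tie means "same group"
        char c1 = a.toString().charAt(0);
        char c2 = b.toString().charAt(0);
        return Character.compare(c1, c2);
    }
}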


Input file data (tab-separated):
hadoop	a
spark	a
hive	a
hbase	a
tachyon	a
storm	a
redis	a


Code:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MyGroup {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: MyGroup <inputpath> <outputpath>");
            System.exit(1); // exit instead of continuing with bad arguments
        }
        Job job = Job.getInstance(conf, MyGroup.class.getSimpleName() + "1");
        job.setJarByClass(MyGroup.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(MyMapper1.class);
        // plug in the custom grouping comparator
        job.setGroupingComparatorClass(MyGroupComparator.class);
        job.setReducerClass(MyReducer1.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        job.waitForCompletion(true);
    }

    public static class MyMapper1 extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // each input line is "word<TAB>a"; the word becomes the map output key
            String[] spl = value.toString().split("\t");
            context.write(new Text(spl[0].trim()), new Text(spl[1].trim()));
        }
    }

    public static class MyReducer1 extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text k2, Iterable<Text> v2s, Context context)
                throws IOException, InterruptedException {
            long count = 0L;
            for (@SuppressWarnings("unused") Text v2 : v2s) {
                count++;
                // written once per value inside the group
                context.write(new Text("in--" + k2), new Text(Long.toString(count)));
            }
            // written once per group, after the values are exhausted
            context.write(new Text("out--" + k2), new Text(Long.toString(count)));
        }
    }

    public static class MyGroupComparator extends WritableComparator {
        public MyGroupComparator() {
            // true: instantiate Text keys so the raw bytes can be deserialized
            super(Text.class, true);
        }

        @SuppressWarnings("rawtypes")
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            // report every pair of keys as equal, so all records fall into one group
            return 0;
        }
    }
}
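One thing the post leaves implicit: the grouping comparator only compares adjacent keys in the already-sorted map output, so it is a separate hook from the sort comparator. A minimal sketch of my own (the ComparatorConfig class and configureComparators method are hypothetical names) showing the two independent hooks:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

class ComparatorConfig {
    // The sort comparator orders keys within a partition; the grouping
    // comparator only decides which adjacent sorted keys share one reduce() call.
    static void configureComparators(Job job) {
        job.setSortComparatorClass(Text.Comparator.class);               // full key order (default for Text)
        job.setGroupingComparatorClass(MyGroup.MyGroupComparator.class); // custom reduce grouping
    }
}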


The default grouping behavior corresponds to this comparator:
public static class MyGroupComparator extends WritableComparator {
    public MyGroupComparator() {
        super(Text.class, true);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Text p1 = (Text) a;
        Text p2 = (Text) b;
        return p1.compareTo(p2);
    }
}


Identical keys form one group. Since each key occurs exactly once in the input, every group holds a single value:
in--hadoop	1
out--hadoop	1
in--hbase	1
out--hbase	1
in--hive	1
out--hive	1
in--redis	1
out--redis	1
in--spark	1
out--spark	1
in--storm	1
out--storm	1
in--tachyon	1
out--tachyon	1


After customization (compare ignores the keys and always returns 0):
public static class MyGroupComparator extends WritableComparator {
    public MyGroupComparator() {
        super(Text.class, true);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // the comparison result is discarded; returning 0 treats all keys as equal
        return 0;
    }
}


All keys now fall into a single group, so reduce() is called exactly once and only one out-- line is emitted. Note that the key shown on the in-- lines changes from hadoop to tachyon inside that single call: Hadoop refills the reducer's key object in place as the values iterator advances, so the trailing out-- line carries the last key of the group:

in--hadoop	1
in--hbase	2
in--hive	3
in--redis	4
in--spark	5
in--storm	6
in--tachyon	7
out--tachyon	7
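If you want the out-- line to carry the first key of the group instead, copy the key before iterating. A minimal sketch of my own (CopyKeyReducer is a hypothetical name, not part of the original code):

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class CopyKeyReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text k2, Iterable<Text> v2s, Context context)
            throws IOException, InterruptedException {
        // copy the key before iterating: Hadoop refills k2 in place as the
        // values iterator advances, so without a copy it ends up as "tachyon"
        Text firstKey = new Text(k2);
        long count = 0;
        for (@SuppressWarnings("unused") Text v2 : v2s) {
            count++;
        }
        context.write(new Text("out--" + firstKey), new Text(Long.toString(count)));
    }
}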

