MapReduce II
Sorting
1. Partial sort: the default; keys are sorted within each reducer's output.
2. Total sort: (a) use a single reducer; (b) use a custom partitioner class, which may cause data skew; (c) use Hadoop's built-in total-order partitioner, which samples the input and writes the partition boundaries to a partition file (a SequenceFile).
3. Secondary sort: sort the values by folding each value into the key, forming a composite key.
Data skew
When a large share of the data flows to one or a few reducers, the remaining reducers sit idle. The number of reducers is set by the program.
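A common mitigation (not covered in the notes above) is key salting: append a rotating or random suffix to a hot key so its records spread over several reducers, then aggregate a second time on the unsalted key. A minimal plain-Java sketch of the idea; the class and key names are illustrative:

```java
import java.util.HashSet;
import java.util.Set;

public class SaltingSketch {
    /** Partition a (possibly salted) key the way Hadoop's hash partitioner would. */
    static int partition(String key, int numReducers) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReducers;
    }

    public static void main(String[] args) {
        int numReducers = 4;
        Set<Integer> unsalted = new HashSet<>();
        Set<Integer> salted = new HashSet<>();
        for (int i = 0; i < 1000; i++) {
            // Every record of the hot key lands on the same reducer...
            unsalted.add(partition("hotKey", numReducers));
            // ...but a rotating salt suffix spreads them over all reducers.
            salted.add(partition("hotKey#" + (i % numReducers), numReducers));
        }
        System.out.println("unsalted partitions: " + unsalted.size()); // 1
        System.out.println("salted partitions:   " + salted.size());   // 4
    }
}
```

A second MapReduce pass then strips the salt and merges the per-salt partial aggregates.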
Joins
[sql]
1. Cross join: select a.*, b.* from customers a cross join orders b;
2. Cartesian product: select a.*, b.* from customers a, orders b;
3. Inner join: select a.*, b.* from customers a inner join orders b on a.id = b.cid;
4. Left outer join: select a.*, b.* from customers a left outer join orders b on a.id = b.cid;
5. Right outer join: select a.*, b.* from customers a right outer join orders b on a.id = b.cid;
[hadoop]
1. Map-side join: large table + small table (the small table is loaded into memory).
2. Reduce-side join: no in-memory requirement, so it works even when both tables are large.
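The map-side join can be sketched without a cluster. In a real job the small table would be loaded into a HashMap in Mapper.setup() from the distributed cache, and map() would probe it once per big-table record. A plain-Java sketch under those assumptions; the table contents and names are made up:

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class MapSideJoinSketch {
    /** Inner-join the streamed big table against the in-memory small table. */
    static List<String> join(Map<Integer, String> customers, int[][] orders) {
        List<String> out = new ArrayList<>();
        for (int[] o : orders) {            // o[0] = order id, o[1] = cid (join key)
            String name = customers.get(o[1]);
            if (name != null) {             // inner-join semantics: drop non-matches
                out.add(o[0] + "," + name);
            }
        }
        return out;
    }

    public static void main(String[] args) {
        // Small table (customers) held in memory, as Mapper.setup() would do.
        Map<Integer, String> customers = new HashMap<>();
        customers.put(1, "tom");
        customers.put(2, "jane");
        // Big table (orders) streamed record by record, as map() would see it.
        int[][] orders = {{100, 1}, {101, 2}, {102, 1}};
        System.out.println(join(customers, orders)); // [100,tom, 101,jane, 102,tom]
    }
}
```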
Sorting
1. Partial sort
Nothing to do: the keys aggregated within each reducer are already sorted.
2. Total sort
Sort all keys across the reduce outputs.
2.1) Use a single reducer.
2.2) Use a custom partitioner class.
a) Create the class:
package com.hadoop.mr.sort.total;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Custom partitioner implementing a total order over years.
 */
public class YearPartitioner extends Partitioner<IntWritable, IntWritable> {
    public int getPartition(IntWritable key, IntWritable value, int numPartitions) {
        int year = key.get();
        if (year < 1930) {
            return 0;
        } else if (year > 1960) {
            return 2;
        }
        return 1;
    }
}
b) Register it in the app: job.setPartitionerClass(YearPartitioner.class);
2.3) Use sampling: draw a sample of the input, analyze the key distribution, and derive the partition boundaries. The sampling code must run after the job is fully configured, because the sampler reads the job's configuration.
public static void main(String[] args) throws Exception {
    args = new String[]{"d:/java/mr/data/temp.seq", "d:/java/mr/out"};
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(new Path(args[1]))) {
        fs.delete(new Path(args[1]), true);
    }
    Job job = Job.getInstance(conf);
    job.setJobName("maxTemp");
    job.setJarByClass(App.class);
    job.setMapperClass(MaxTempMapper.class);
    job.setReducerClass(MaxTempReducer.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // read sequence-file input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setNumReduceTasks(3);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
            new Path("file:///d:/java/mr/par.seq"));
    // random sampler: frequency 1.0, up to 5 samples, from at most 3 splits
    InputSampler.RandomSampler<IntWritable, IntWritable> r =
            new InputSampler.RandomSampler<IntWritable, IntWritable>(1f, 5, 3);
    // write the partition file from the sample
    InputSampler.writePartitionFile(job, r);
    job.waitForCompletion(true);
}
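The mechanics behind TotalOrderPartitioner can be sketched in plain Java: the sampled partition file amounts to a sorted list of split points that carve the key space into ordered ranges, one per reducer. The class, split points, and years below are illustrative, not output of the real sampler:

```java
public class TotalOrderSketch {
    /** Return the partition for key, given ascending split points. */
    static int partition(int key, int[] splitPoints) {
        // Linear scan for clarity; the real partitioner uses binary search (or a trie).
        int p = 0;
        while (p < splitPoints.length && key >= splitPoints[p]) {
            p++;
        }
        return p;
    }

    public static void main(String[] args) {
        // Two split points yield three ordered ranges for three reducers:
        // (-inf, 1944), [1944, 1971), [1971, +inf)
        int[] splits = {1944, 1971};
        for (int year : new int[]{1910, 1950, 1980}) {
            System.out.println(year + " -> partition " + partition(year, splits));
        }
    }
}
```

Because each reducer receives a contiguous, already-sorted key range, concatenating the reducer outputs in partition order gives a fully sorted result.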
3. Secondary sort
Secondary sort (auxiliary sort): sorts the values by folding each value into the key.
3.1) Define a composite key:
package com.hadoop.mr.sort.secondary;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Composite key: (year, temp).
 */
public class CombKey implements WritableComparable<CombKey> {
    public int year;
    public int temp;

    public int compareTo(CombKey o) {
        // Same year: order by temperature, descending.
        if (year == o.year) {
            return o.temp - temp;
        }
        // Otherwise: order by year, ascending.
        return year - o.year;
    }

    public void write(DataOutput out) throws IOException {
        out.writeInt(year);
        out.writeInt(temp);
    }

    public void readFields(DataInput in) throws IOException {
        this.year = in.readInt();
        this.temp = in.readInt();
    }
}
3.2) Custom partitioner class
Partition by the year field of CombKey:
public class YearPartitioner extends Partitioner<CombKey, NullWritable> {
    public int getPartition(CombKey key, NullWritable value, int numPartitions) {
        return key.year % numPartitions;
    }
}
3.3) Modify the Mapper:
package com.hadoop.mr.sort.secondary;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper: wraps each (year, temp) pair into the composite key.
 */
public class MaxTempMapper extends Mapper<IntWritable, IntWritable, CombKey, NullWritable> {
    protected void map(IntWritable key, IntWritable value, Context context)
            throws IOException, InterruptedException {
        CombKey keyOut = new CombKey();
        keyOut.year = key.get();
        keyOut.temp = value.get();
        context.write(keyOut, NullWritable.get());
    }
}

3.4) CombKeyComparator

package com.hadoop.mr.sort.secondary;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Custom sort comparator: delegates to CombKey.compareTo.
 */
public class CombKeyComparator extends WritableComparator {
    protected CombKeyComparator() {
        super(CombKey.class, true);
    }

    public int compare(WritableComparable k1, WritableComparable k2) {
        CombKey ck1 = (CombKey) k1;
        CombKey ck2 = (CombKey) k2;
        return ck1.compareTo(ck2);
    }
}
3.5) Year grouping comparator
package com.hadoop.mr.sort.secondary;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Grouping comparator: keys with the same year fall into one reduce group.
 */
public class YearGroupComparator extends WritableComparator {
    protected YearGroupComparator() {
        super(CombKey.class, true);
    }

    public int compare(WritableComparable k1, WritableComparable k2) {
        CombKey ck1 = (CombKey) k1;
        CombKey ck2 = (CombKey) k2;
        return ck1.year - ck2.year;
    }
}
3.6) Reducer class
package com.hadoop.mr.sort.secondary;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer: the first key of each year group carries that year's max temperature,
 * because keys within a group are sorted by temperature descending.
 */
public class MaxTempReducer extends Reducer<CombKey, NullWritable, IntWritable, IntWritable> {
    protected void reduce(CombKey key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        context.write(new IntWritable(key.year), new IntWritable(key.temp));
    }
}
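The combined effect of the sort comparator, grouping comparator, and reducer can be simulated in plain Java (the sample data is made up): sort records by (year ascending, temp descending), then treat consecutive records with the same year as one reduce group, whose first record carries the maximum temperature.

```java
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;

public class SecondarySortSketch {
    /** Simulate the secondary-sort pipeline: each rec is {year, temp}. */
    static Map<Integer, Integer> maxPerYear(int[][] recs) {
        int[][] copy = recs.clone();
        // Sort comparator: year ascending, temp descending (as in CombKey.compareTo).
        Arrays.sort(copy, (a, b) -> a[0] != b[0] ? a[0] - b[0] : b[1] - a[1]);
        // Grouping comparator: same-year records form one group; keep the
        // first (i.e. the maximum temperature), as the reducer does.
        Map<Integer, Integer> max = new LinkedHashMap<>();
        for (int[] r : copy) {
            max.putIfAbsent(r[0], r[1]);
        }
        return max;
    }

    public static void main(String[] args) {
        int[][] recs = {{1950, 10}, {1949, 22}, {1950, 35}, {1949, 111}, {1950, 0}};
        System.out.println(maxPerYear(recs)); // {1949=111, 1950=35}
    }
}
```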
3.7) App
package com.hadoop.mr.sort.secondary;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class App {
    public static void main(String[] args) throws Exception {
        args = new String[]{"d:/java/mr/data/temp.seq", "d:/java/mr/out"};
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        Job job = Job.getInstance(conf);
        job.setJobName("maxTemp");
        job.setJarByClass(App.class);
        job.setMapperClass(MaxTempMapper.class);
        job.setReducerClass(MaxTempReducer.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // read sequence-file input
        job.setInputFormatClass(SequenceFileInputFormat.class);
        // partition by year
        job.setPartitionerClass(YearPartitioner.class);
        job.setNumReduceTasks(3);
        job.setMapOutputKeyClass(CombKey.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        job.setSortComparatorClass(CombKeyComparator.class);
        job.setGroupingComparatorClass(YearGroupComparator.class);
        job.waitForCompletion(true);
    }
}