[MR] Common Sorting Patterns in MapReduce

1. Hadoop's default sort only orders records by key, in lexicographic (dictionary) order.
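Lexicographic ordering compares keys byte by byte as text, not numerically, so for example "10" sorts before "9". A minimal standalone check of this behavior (a separate demo class, not part of the job):

import org.apache.hadoop.io.Text;

// Small demo of Text's byte-wise (lexicographic) comparison, independent of the MapReduce job.
public class LexicographicDemo {
    public static void main(String[] args) {
        System.out.println(new Text("10").compareTo(new Text("9")) < 0);         // true: '1' < '9'
        System.out.println(new Text("Banana").compareTo(new Text("apple")) < 0); // true: 'B' < 'a'
    }
}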
With that in mind, straight to the code.
Map side

package Hadoop.MR.sort;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Text sort - map phase. Hadoop's default sort only orders records by key.
 * @author Young
 * created on 2017-6-30
 */
public class SortMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Emit the whole line as the key; the framework sorts keys lexicographically.
        String line = value.toString();
        context.write(new Text(line), NullWritable.get());
    }
}

Reduce side

package Hadoop.MR.sort;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Text sort - reduce phase. The keys arrive already sorted, so the reducer
 * simply writes them out.
 * @author Young
 * created on 2017-6-30
 */
public class SortReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    @Override
    protected void reduce(Text k2, Iterable<NullWritable> v2, Context context)
            throws IOException, InterruptedException {
        context.write(k2, NullWritable.get());
    }
}

Driver

package Hadoop.MR.sort;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the default-sort job.
 * @author Young
 * @version created 2017-06-30 09:31:50
 */
public class SortDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Job job = Job.getInstance(getConf(), "Sort");
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // With the default single reducer, the output file is globally sorted.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new SortDriver(), args);
        System.exit(exitCode);
    }
}

Before and after sorting (screenshot).

2. Custom sort: order by the first column and, when the first columns are equal, by the second column.
The custom bean

package Hadoop.MR.mysort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Composite key that implements WritableComparable and overrides compareTo
 * so records sort by the first column, then by the second column.
 * @author Young
 * @version created 2017-06-30 09:54:14
 */
public class SortBean implements WritableComparable<SortBean> {

    private long firstNum;
    private long secondNum;

    public SortBean() {
        // A no-arg constructor is required so Hadoop can instantiate the key via reflection.
    }

    public SortBean(long first, long second) {
        this.firstNum = first;
        this.secondNum = second;
    }

    public void readFields(DataInput in) throws IOException {
        this.firstNum = in.readLong();
        this.secondNum = in.readLong();
    }

    public void write(DataOutput out) throws IOException {
        out.writeLong(firstNum);
        out.writeLong(secondNum);
    }

    public int compareTo(SortBean o) {
        // Sort by the first column; break ties with the second column.
        // A positive result means this bean sorts after o.
        // Note: this never returns 0, so fully identical records are still treated
        // as distinct keys and duplicate lines are preserved in the output.
        if (this.firstNum == o.getFirstNum()) {
            return this.secondNum > o.getSecondNum() ? 1 : -1;
        } else {
            return this.firstNum > o.getFirstNum() ? 1 : -1;
        }
    }

    @Override
    public String toString() {
        return this.firstNum + " " + this.secondNum;
    }

    public long getFirstNum() {
        return firstNum;
    }

    public void setFirstNum(long firstNum) {
        this.firstNum = firstNum;
    }

    public long getSecondNum() {
        return secondNum;
    }

    public void setSecondNum(long secondNum) {
        this.secondNum = secondNum;
    }
}
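Because SortBean is an ordinary Comparable, the two-column ordering can be sanity-checked locally before submitting a job. A small sketch, assuming it lives in the same package as SortBean (the class name and sample values are illustrative only):

package Hadoop.MR.mysort;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

// Local check of SortBean.compareTo: sort a few beans in memory and print them.
public class SortBeanCheck {
    public static void main(String[] args) {
        List<SortBean> beans = Arrays.asList(
                new SortBean(3, 9),
                new SortBean(1, 5),
                new SortBean(1, 2));
        Collections.sort(beans);          // uses SortBean.compareTo
        for (SortBean b : beans) {
            System.out.println(b);        // expected order: "1 2", "1 5", "3 9"
        }
    }
}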

Map side

package Hadoop.MR.mysort;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Custom-sort map phase: parse the two tab-separated columns into a SortBean key.
 * @author Young
 * @version created 2017-06-30 10:20:12
 */
public class MySortMapper extends Mapper<LongWritable, Text, SortBean, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] num = line.split("\t");
        long firstNum = Long.parseLong(num[0]);
        long secondNum = Long.parseLong(num[1]);
        SortBean bean = new SortBean(firstNum, secondNum);
        context.write(bean, NullWritable.get());
    }
}

Reduce side

package Hadoop.MR.mysort;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Custom-sort reduce phase: keys arrive already ordered by SortBean.compareTo,
 * so the reducer just writes them out.
 * @author Young
 * @version created 2017-06-30 10:30:42
 */
public class MySortReducer extends Reducer<SortBean, NullWritable, SortBean, NullWritable> {

    @Override
    protected void reduce(SortBean k2, Iterable<NullWritable> v2, Context context)
            throws IOException, InterruptedException {
        context.write(k2, NullWritable.get());
    }
}

Driver

package Hadoop.MR.mysort;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the custom-sort job.
 * @author Young
 * @version created 2017-06-30 10:38:23
 */
public class MySortDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Job job = Job.getInstance(getConf(), "MySort");
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(MySortMapper.class);
        job.setReducerClass(MySortReducer.class);
        job.setOutputKeyClass(SortBean.class);
        job.setOutputValueClass(NullWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new MySortDriver(), args);
        System.exit(exitCode);
    }
}

Before and after sorting (screenshot).

3. Finding the maximum
Map side

package Hadoop.MR.max;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Max-value map phase: emit (account, income) pairs from tab-separated lines.
 * @author Young
 * @version created 2017-09-05 15:47:02
 */
public class MaxMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] lines = value.toString().split("\t");
        String account = lines[0];
        double income = Double.parseDouble(lines[1]);
        context.write(new Text(account), new DoubleWritable(income));
    }
}

Reduce side

package Hadoop.MR.max;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Max-value reduce phase: keep the largest income seen for each account.
 * @author Young
 * @version created 2017-09-05 16:15:29
 */
public class MaxReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> value, Context context)
            throws IOException, InterruptedException {
        // Start from negative infinity rather than Double.MIN_VALUE, which is the
        // smallest positive double and would give wrong results for negative values.
        double max = Double.NEGATIVE_INFINITY;
        for (DoubleWritable v : value) {
            max = Math.max(max, v.get());
        }
        context.write(key, new DoubleWritable(max));
    }
}

Driver

package Hadoop.MR.max;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the max-value job.
 * @author Young
 * @version created 2017-09-05 16:17:32
 */
public class MaxDriver extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new MaxDriver(), args);
        System.exit(exitCode);
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Job job = Job.getInstance(getConf(), "Max");
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(MaxMapper.class);
        job.setReducerClass(MaxReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
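Since the maximum is associative and commutative (the max of per-mapper maxima equals the overall max), the same reducer class could also be registered as a combiner to shrink the data shuffled across the network. This is a suggested addition to run(), not part of the original driver:

// Optional optimization inside run(): run MaxReducer map-side as a combiner.
// Valid here because max(max(a, b), c) == max(a, b, c).
job.setCombinerClass(MaxReducer.class);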

Input and output (screenshot).

4. Top N: the file contains several different keys, and keys are not unique; for each key, take the three smallest values.
Map side

package Hadoop.MR.topn;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Top-N map phase: emit (account, income) pairs, reusing the key/value objects.
 * @author Young
 * @version created 2017-09-10 15:26:00
 */
public class TopNMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

    private Text k = new Text();
    private DoubleWritable v = new DoubleWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] lines = value.toString().split("\t");
        String account = lines[0];
        double income = Double.parseDouble(lines[1]);
        k.set(account);
        v.set(income);
        context.write(k, v);
    }
}

Reduce side

package Hadoop.MR.topn;

import java.io.IOException;
import java.util.TreeSet;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Top-N reduce phase: keep the n smallest incomes for each account.
 * @author Young
 * @version created 2017-09-10 15:49:01
 */
public class TopNReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    private TreeSet<Double> incomeTreeSet = new TreeSet<Double>();
    private DoubleWritable v = new DoubleWritable();
    private int n = 3;

    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> value, Context context)
            throws IOException, InterruptedException {
        // Reset the set for every key, otherwise values from earlier keys leak in.
        incomeTreeSet.clear();
        for (DoubleWritable val : value) {
            incomeTreeSet.add(val.get());
            // Once the set grows past n, drop the largest element so the n smallest remain.
            // Note: TreeSet silently discards duplicate values for the same key.
            if (incomeTreeSet.size() > n) {
                incomeTreeSet.remove(incomeTreeSet.last());
            }
        }
        for (Double in : incomeTreeSet) {
            v.set(in);
            context.write(key, v);
        }
    }
}

Driver

package Hadoop.MR.topn;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the top-N job.
 * @author Young
 * @version created 2017-09-10 15:59:49
 */
public class TopNDriver extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new TopNDriver(), args);
        System.exit(exitCode);
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Job job = Job.getInstance(getConf(), "TopN");
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(TopNMapper.class);
        job.setReducerClass(TopNReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }
}

Result (screenshot).
This produces the result, but I later realized it can drag the reducer down: all the TreeSet sorting and element shuffling costs a lot of memory. Ideally the ordering would already be handled on the map side so the reducer could simply take the first n values per key, but I haven't worked that out. If you have a better algorithm, please share it.
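One common way to push the work toward the map side is in-mapper combining: each map task keeps a small per-account candidate set and only emits its local n smallest values in cleanup(), so the reducer merges at most n candidates per account from each map task instead of every record. A rough sketch of that idea, assuming the same tab-separated input as above (the class name, fixed n, and details are illustrative, not from the original code):

package Hadoop.MR.topn;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeSet;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Hypothetical in-mapper combining variant: keep the n smallest incomes per
 * account inside the map task and emit them only in cleanup().
 */
public class TopNCombiningMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

    private static final int N = 3;
    private final Map<String, TreeSet<Double>> smallestByAccount = new HashMap<String, TreeSet<Double>>();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");
        String account = fields[0];
        double income = Double.parseDouble(fields[1]);

        TreeSet<Double> smallest = smallestByAccount.get(account);
        if (smallest == null) {
            smallest = new TreeSet<Double>();
            smallestByAccount.put(account, smallest);
        }
        smallest.add(income);
        if (smallest.size() > N) {
            smallest.remove(smallest.last()); // drop the largest, keep the N smallest
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Emit at most N candidates per account seen by this map task; the existing
        // TopNReducer then picks the global N smallest per account.
        Text k = new Text();
        DoubleWritable v = new DoubleWritable();
        for (Map.Entry<String, TreeSet<Double>> entry : smallestByAccount.entrySet()) {
            k.set(entry.getKey());
            for (Double income : entry.getValue()) {
                v.set(income);
                context.write(k, v);
            }
        }
    }
}

The trade-off is that each map task now holds one small TreeSet per distinct account it sees, so this only helps when the number of distinct keys per map task is manageable.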
