【MR】MapReduce 常见的排序
来源:互联网 发布:浙江大学软件学院就业 编辑:程序博客网 时间:2024/05/21 11:35
【MR】MapReduce 常见的排序
一,Hadoop默认的排序算法,只会针对key值进行排序,按照字典顺序排序。
直接上代码
Map端
package Hadoop.MR.sort;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Text-sort mapper: emits every input line as the output key so that the
 * framework's default shuffle sort orders the lines lexicographically.
 * The value is {@link NullWritable} because only the key matters.
 */
public class SortMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // Reused output key: avoids allocating a new Text object per record.
    private final Text outKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // The whole line becomes the sort key; Hadoop sorts keys by default.
        outKey.set(value.toString());
        context.write(outKey, NullWritable.get());
    }
}
Reduce端
package Hadoop.MR.sort;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Text-sort reducer: keys arrive from the shuffle already sorted, so each
 * distinct line is written straight back out once per key group.
 */
public class SortReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    @Override
    protected void reduce(Text line, Iterable<NullWritable> ignored, Context context)
            throws IOException, InterruptedException {
        // Identity pass-through: the sort already happened in the shuffle.
        context.write(line, NullWritable.get());
    }
}
驱动程序
package Hadoop.MR.sort;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;/** * 排序驱动程序 * @author Young * @version 创建时间:2017年6月30日上午9:31:50 */public class SortDriver extends Configured implements Tool { public int run(String[] arg0) throws Exception { if (arg0.length != 2){ System.err.printf("Usage:%s[generic options]<input> <output>\n",getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; }// Configuration conf = new Configuration();// Job job = new Job(getConf(),"Max Temperture"); Job job = Job.getInstance(getConf(), "Sort"); job.setJarByClass(getClass()); FileInputFormat.addInputPath(job, new Path(arg0[0])); FileOutputFormat.setOutputPath(job, new Path(arg0[1])); job.setMapperClass(SortMapper.class); job.setReducerClass(SortReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); return job.waitForCompletion(true)?0:1; } public static void main(String[] args) throws Exception { // TODO Auto-generated method stub int exitCode = ToolRunner.run(new SortDriver(), args); System.exit(exitCode); }}
排序前后
二,自定义,先根据第一列排序,若相同则根据第二列排序,
自定义Bean,
package Hadoop.MR.mysort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Composite sort key holding two long fields. Ordering is by the first
 * number, then by the second number when the first numbers are equal.
 *
 * <p>Fixes over the original version:
 * <ul>
 *   <li>{@code compareTo} now returns 0 for equal beans. The original
 *       returned only 1 or -1, violating the {@link Comparable} contract —
 *       equal keys would never compare as equal, so they could never be
 *       grouped together in the reduce phase.</li>
 *   <li>{@code equals}/{@code hashCode} are implemented consistently with
 *       {@code compareTo}; without a value-based {@code hashCode}, the
 *       default hash partitioner may send equal keys to different reducers.</li>
 * </ul>
 */
public class SortBean implements WritableComparable<SortBean> {

    private long firstNum;
    private long secondNum;

    /** No-arg constructor required by Hadoop's Writable deserialization. */
    public SortBean() {
    }

    public SortBean(long first, long second) {
        this.firstNum = first;
        this.secondNum = second;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Must read fields in the same order write() emits them.
        this.firstNum = in.readLong();
        this.secondNum = in.readLong();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(firstNum);
        out.writeLong(secondNum);
    }

    @Override
    public int compareTo(SortBean o) {
        // Primary order: firstNum; tie-break: secondNum. Long.compare returns
        // 0 on equality, keeping compareTo consistent with equals().
        int cmp = Long.compare(this.firstNum, o.firstNum);
        return cmp != 0 ? cmp : Long.compare(this.secondNum, o.secondNum);
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof SortBean)) {
            return false;
        }
        SortBean other = (SortBean) obj;
        return firstNum == other.firstNum && secondNum == other.secondNum;
    }

    @Override
    public int hashCode() {
        // Value-based hash so HashPartitioner routes equal keys together.
        return 31 * Long.hashCode(firstNum) + Long.hashCode(secondNum);
    }

    @Override
    public String toString() {
        return this.firstNum + " " + this.secondNum;
    }

    public long getFirstNum() {
        return firstNum;
    }

    public void setFirstNum(long firstNum) {
        this.firstNum = firstNum;
    }

    public long getSecondNum() {
        return secondNum;
    }

    public void setSecondNum(long secondNum) {
        this.secondNum = secondNum;
    }
}
Map端
package Hadoop.MR.mysort;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Custom-sort mapper: parses two tab-separated longs from each line and
 * wraps them in a {@link SortBean} key, letting the bean's compareTo drive
 * the two-column ordering during the shuffle.
 */
public class MySortMapper extends Mapper<LongWritable, Text, SortBean, NullWritable> {

    @Override
    protected void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        // Each line is expected to hold two tab-separated numbers.
        String[] fields = line.toString().split("\t");
        long first = Long.parseLong(fields[0]);
        long second = Long.parseLong(fields[1]);
        context.write(new SortBean(first, second), NullWritable.get());
    }
}
Reduce端
package Hadoop.MR.mysort;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Custom-sort reducer: the {@link SortBean} keys are already ordered by the
 * shuffle, so each key group is emitted unchanged.
 */
public class MySortReducer extends Reducer<SortBean, NullWritable, SortBean, NullWritable> {

    @Override
    protected void reduce(SortBean bean, Iterable<NullWritable> ignored, Context context)
            throws IOException, InterruptedException {
        // Identity pass-through: sorting was done by SortBean.compareTo.
        context.write(bean, NullWritable.get());
    }
}
驱动程序
package Hadoop.MR.mysort;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;/** * 自定义排序驱动程序 * @author Young * @version 创建时间:2017年6月30日上午10:38:23 */public class MySortDriver extends Configured implements Tool { public int run(String[] arg0) throws Exception { if (arg0.length != 2){ System.err.printf("Usage:%s[generic options]<input> <output>\n",getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; }// Configuration conf = new Configuration();// Job job = new Job(getConf(),"Max Temperture"); Job job = Job.getInstance(getConf(), "MySort"); job.setJarByClass(getClass()); FileInputFormat.addInputPath(job, new Path(arg0[0])); FileOutputFormat.setOutputPath(job, new Path(arg0[1])); job.setMapperClass(MySortMapper.class); job.setReducerClass(MySortReducer.class); job.setOutputKeyClass(SortBean.class); job.setOutputValueClass(NullWritable.class); return job.waitForCompletion(true)?0:1; } public static void main(String[] args) throws Exception { // TODO Auto-generated method stub int exitCode = ToolRunner.run(new MySortDriver(), args); System.exit(exitCode); }}
排序前后
三,求最值
Map端
package Hadoop.MR.max;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Max-value mapper: splits each line into an account name and an income
 * figure, emitting (account, income) pairs for the reducer to aggregate.
 */
public class MaxMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

    @Override
    protected void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        // Line layout: <account>\t<income>
        String[] fields = line.toString().split("\t");
        context.write(new Text(fields[0]),
                new DoubleWritable(Double.parseDouble(fields[1])));
    }
}
Reduce端
package Hadoop.MR.max;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Max-value reducer: for each account key, scans the group's income values
 * and writes out the largest one.
 *
 * <p>Bug fix: the accumulator now starts at {@link Double#NEGATIVE_INFINITY}.
 * The original used {@code Double.MIN_VALUE}, which is the smallest
 * <em>positive</em> double (~4.9e-324), not the most negative value — a
 * group containing only negative incomes would wrongly report
 * {@code Double.MIN_VALUE} as its maximum.
 */
public class MaxReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> values, Context context)
            throws IOException, InterruptedException {
        // NEGATIVE_INFINITY is the correct identity for max over doubles.
        double max = Double.NEGATIVE_INFINITY;
        for (DoubleWritable v : values) {
            max = Math.max(max, v.get());
        }
        context.write(key, new DoubleWritable(max));
    }
}
驱动程序
package Hadoop.MR.max;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.DoubleWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;/** * 最大值驱动类 * @author Young * @version 创建时间:2017年9月5日下午4:17:32 */public class MaxDriver extends Configured implements Tool { public static void main(String[] args) throws Exception { // TODO Auto-generated method stub int exitCode = ToolRunner.run(new MaxDriver(), args); System.exit(exitCode); } public int run(String[] arg0) throws Exception { if (arg0.length != 2){ System.err.printf("Usage:%s[generic options]<input> <output>\n",getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; } Job job = Job.getInstance(getConf(), "Max"); job.setJarByClass(getClass()); FileInputFormat.addInputPath(job, new Path(arg0[0])); FileOutputFormat.setOutputPath(job, new Path(arg0[1])); job.setMapperClass(MaxMapper.class); job.setReducerClass(MaxReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); return job.waitForCompletion(true)?0:1; }}
排序前后
四,TopN,文件中包含不同的key,键值不唯一,取每个键对应的前三个最大值(代码中 TreeSet 超过 n 个元素时移除最小值,保留最大的 n 个)。
Map端
package Hadoop.MR.topn;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Top-N mapper: parses an account and an income from each tab-separated
 * line and emits (account, income). Output objects are reused across calls
 * to avoid per-record allocation.
 */
public class TopNMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

    // Reused output key/value — Hadoop serializes them on write().
    private final Text outKey = new Text();
    private final DoubleWritable outValue = new DoubleWritable();

    @Override
    protected void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        // Line layout: <account>\t<income>
        String[] fields = line.toString().split("\t");
        outKey.set(fields[0]);
        outValue.set(Double.parseDouble(fields[1]));
        context.write(outKey, outValue);
    }
}
Reduce端
package Hadoop.MR.topn;

import java.io.IOException;
import java.util.TreeSet;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Top-N reducer: for each key, keeps at most {@code n} of the largest
 * incomes in a {@link TreeSet} (whenever the set exceeds n elements, the
 * smallest is evicted) and emits the survivors in ascending order.
 *
 * <p>Bug fix: the TreeSet is now created locally inside {@code reduce}.
 * The original kept it as an instance field and never cleared it, so a
 * reducer processing several keys leaked values from earlier keys into
 * later keys' results.
 *
 * <p>NOTE(review): because a TreeSet stores distinct values, duplicate
 * incomes within one key collapse to a single entry — this matches the
 * original behavior; confirm it is intended.
 */
public class TopNReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    // Reused output value wrapper.
    private final DoubleWritable outValue = new DoubleWritable();

    // Number of values to keep per key.
    private final int n = 3;

    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> values, Context context)
            throws IOException, InterruptedException {
        // Fresh, per-key set: prevents state from one key group leaking
        // into the next (the bug in the field-based original).
        TreeSet<Double> topIncomes = new TreeSet<Double>();
        for (DoubleWritable val : values) {
            topIncomes.add(val.get());
            if (topIncomes.size() > n) {
                // Evict the smallest so the n largest survive.
                topIncomes.remove(topIncomes.first());
            }
        }
        // TreeSet iterates in ascending order.
        for (Double income : topIncomes) {
            outValue.set(income);
            context.write(key, outValue);
        }
    }
}
驱动程序
package Hadoop.MR.topn;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.DoubleWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;/** * 取前n名驱动程序 * @author Young * @version 创建时间:2017年9月10日下午3:59:49 */public class TopNDriver extends Configured implements Tool { public static void main(String[] args) throws Exception { // TODO Auto-generated method stub int exitCode = ToolRunner.run(new TopNDriver(), args); System.exit(exitCode); } public int run(String[] arg0) throws Exception { //TODO Auto-generated method stub if (arg0.length != 2){ System.err.printf("Usage:%s[generic options]<input> <output>\n",getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; } Job job = Job.getInstance(getConf(), "TopN"); job.setJarByClass(getClass()); FileInputFormat.addInputPath(job, new Path(arg0[0])); FileOutputFormat.setOutputPath(job, new Path(arg0[1])); job.setMapperClass(TopNMapper.class); job.setReducerClass(TopNReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); return job.waitForCompletion(true)?0:1; }}
结果
结果是有了,但是后来觉得这样会把Reduce给拖死,TreeSet的排序以及换位会消耗很多内存资源。想在Map就把排序做好,然后Reduce直接取前n就好了,不过没想到。有更好的算法的,请拍砖。
阅读全文
0 3
- 【MR】MapReduce 常见的排序
- MR-5.MapReduce排序
- MR-5.MapReduce常见Joins方法
- 【MR】经典的MapReduce(MapReduce 1) 运行机制
- MapReduce(十七): MR的访问控制
- Mapreduce(二):MR的执行过程分析
- MR(mapreduce)的工作原理图解
- MapReduce(MR)的文件拆分:FileInputFormat
- MR--WordCount的MapReduce程序注释
- MR--MaxTemperature的Mapreduce程序注释
- MR-1.MapReduce概述
- 【MR】MapReduce中的数据流
- MapReduce源码对写MR application帮助最大的部分
- 【MR】MapReduce中shuffle、partition、combiner的作用与关系
- MapReduce的自定义排序
- mapreduce的二次排序
- MapReduce的全排序
- MapReduce的二次排序
- 【Android学习】JNI(Java Native Interface,java本地接口)编程
- Python3 网络编程
- ES6
- AC-Apple
- mysql 查看数据库表、字段信息
- 【MR】MapReduce 常见的排序
- 怎么形象的理解三次握手与四次挥手
- log4j.properties配置详解与实例
- Cesium应用篇:3控件(6) FullScreen/ VR / Home
- java client 连接hbase报错,超时
- 弹幕给我们带来了什么
- 定时任务quartz与spring整合(springboot)
- WebView之简识
- java.lang.NoClassDefFoundError: org/springframework/boot/context/embedded/FilterRegistrationBean