MapReduce/Hadoop的TopN解决方案之键唯一的情况
来源:互联网 发布:按键精灵自动录入数据 编辑:程序博客网 时间:2024/06/06 01:57
TopN问题:上星期访问次数最多的10个URL是哪些?所有猫中体重最大的10只猫是哪些?
本文使用 MapReduce/Hadoop的TopN解决方案,假设所有输入键都是唯一的。也就是说,对于一个给定的输入集合{<K,V>},所有K都是唯一的。
例如对于下面的猫,cat1不会再出现第二次
输入:cat.txt
每行格式为“频次,名称1,名称2”:
12,cat1,cat1
13,cat2,cat2
14,cat3,cat3
15,cat4,cat4
10,cat5,cat5
100,cat100,cat100
200,cat200,cat200
300,cat300,cat300
1,cat001,cat001
67,cat67,cat67
22,cat22,cat22
23,cat23,cat23
1000,cat1000,cat1000
2000,cat2000,cat2000
输出(按频次降序排列的前 10 条):
2000	cat2000,cat2000
1000	cat1000,cat1000
300	cat300,cat300
200	cat200,cat200
100	cat100,cat100
67	cat67,cat67
23	cat23,cat23
22	cat22,cat22
15	cat4,cat4
14	cat3,cat3
一、用到的核心数据结构:Java中的SortedMap和TreeMap,其中SortedMap可以实现按key值排序。对于如下测试类
package topN_hadoop1;

import java.util.Map.Entry;
import java.util.SortedMap;
import java.util.TreeMap;

/**
 * Demonstrates the TreeMap behavior the Top-N solution relies on:
 * keys iterate in ascending order, firstKey()/lastKey() give the
 * smallest/largest key, and removing them shrinks the map from either end.
 */
public class Test {

    /** Runs the demo: populate, inspect first/last, then remove each end. */
    public static void main(String[] args) {
        SortedMap<Integer, String> top = new TreeMap<Integer, String>();
        top.put(1, "chenjie,1");
        top.put(10, "zhanghan,10");
        top.put(3, "renbo,3");

        dump(top);
        System.out.println("------------------------------------------------------");
        System.out.println("firstKey:" + top.firstKey());
        System.out.println("first:" + top.get(top.firstKey()));
        System.out.println("lastKey:" + top.lastKey());
        System.out.println("last:" + top.get(top.lastKey()));

        top.remove(top.firstKey());
        System.out.println("remove first ");
        System.out.println("------------------------------------------------------");
        dump(top);

        top.remove(top.lastKey());
        System.out.println("remove last ");
        System.out.println("------------------------------------------------------");
        dump(top);
    }

    // Print every entry; TreeMap guarantees ascending key order.
    private static void dump(SortedMap<Integer, String> map) {
        for (Entry<Integer, String> entry : map.entrySet()) {
            System.out.println(entry);
        }
    }
}
输出为:
1=chenjie,13=renbo,310=zhanghan,10------------------------------------------------------firstKey:1first:chenjie,1lastKey:10last:zhanghan,10remove first ------------------------------------------------------3=renbo,310=zhanghan,10remove last ------------------------------------------------------3=renbo,3
二、code
package topN_hadoop1;

import java.io.IOException;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Maintains a per-mapper top-N (largest frequencies) over input lines of the
 * form "frequency,name1,name2". Nothing is emitted per record; cleanup()
 * writes only this mapper's N survivors, so the single reducer merges at most
 * (numMappers * N) candidates.
 *
 * NOTE(review): entries are keyed by frequency, so two records with the same
 * frequency overwrite each other. This matches the article's stated
 * assumption that all input keys are unique — confirm frequencies are too.
 */
public class TopNMapper extends Mapper<LongWritable, Text, NullWritable, Text> {

    private int N = 10; // top-N size; overridden from the "N" job property in setup()

    // Sorted ascending by frequency: firstKey() is the smallest kept entry,
    // which is the one evicted when the map exceeds N entries.
    private final SortedMap<Integer, String> top = new TreeMap<Integer, String>();

    /** Reads the "N" job parameter once per task (default is top 10). */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        this.N = context.getConfiguration().getInt("N", 10);
    }

    /**
     * Parses one input line and folds it into the local top-N map.
     * Blank or malformed lines are skipped instead of throwing
     * NumberFormatException, which previously would kill the whole task.
     */
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        if (line.isEmpty()) {
            return; // skip blank lines: split("")[0] would be "" and fail to parse
        }
        String[] fields = line.split(",");
        int frequency;
        try {
            frequency = Integer.parseInt(fields[0].trim());
        } catch (NumberFormatException e) {
            return; // skip records whose first field is not a number
        }
        // Value format consumed by TopNReducer: "frequency,name1,name2,frequency"
        // (the reducer reads tokens[0..2]; the trailing copy is ignored).
        String compositeValue = line + "," + frequency;
        top.put(frequency, compositeValue);
        if (top.size() > N) {
            top.remove(top.firstKey()); // evict the current minimum
        }
    }

    /** Emits this mapper's surviving top-N records under a single null key. */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (String record : top.values()) {
            context.write(NullWritable.get(), new Text(record));
        }
    }
}
package topN_hadoop1;

import java.io.IOException;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.List;
import java.util.ArrayList;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Merges every mapper's partial top-N into the global top-N. All values
 * arrive under one NullWritable key (the job runs a single reducer), so one
 * reduce() call sees every candidate. Output is (frequency, "name1,name2")
 * in descending frequency order.
 */
public class TopNReducer extends Reducer<NullWritable, Text, IntWritable, Text> {

    private int N = 10; // top-N size; overridden from the "N" job property in setup()

    // Same structure as the mapper: ascending by frequency, evict firstKey().
    private final SortedMap<Integer, String> top = new TreeMap<Integer, String>();

    /** Reads the "N" job parameter once per task (default is top 10). */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        this.N = context.getConfiguration().getInt("N", 10);
    }

    /**
     * Folds each candidate into the top-N map, then emits the survivors with
     * the largest frequency first. The debug System.out.println of every
     * value was removed — it only polluted the task's stdout log.
     */
    @Override
    public void reduce(NullWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text value : values) {
            // Value format produced by TopNMapper: "frequency,name1,name2,frequency"
            String valueAsString = value.toString().trim();
            String[] tokens = valueAsString.split(",");
            String url = tokens[1] + "," + tokens[2]; // e.g. "cat2000,cat2000"
            int frequency = Integer.parseInt(tokens[0]); // e.g. 2000
            top.put(frequency, url);
            if (top.size() > N) {
                top.remove(top.firstKey()); // evict the current minimum
            }
        }
        // Emit the final top N, largest frequency first (reverse of TreeMap order).
        List<Integer> keys = new ArrayList<Integer>(top.keySet());
        for (int i = keys.size() - 1; i >= 0; i--) {
            Integer k = keys.get(i);
            context.write(new IntWritable(k), new Text(top.get(k)));
        }
    }
}
package topN_hadoop1;import org.apache.log4j.Logger;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;public class TopNDriver extends Configured implements Tool { private static Logger THE_LOGGER = Logger.getLogger(TopNDriver.class); public int run(String[] args) throws Exception { Job job = new Job(getConf()); int N = Integer.parseInt(args[0]); // top N job.getConfiguration().setInt("N", N); job.setJobName("TopNDriver"); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(TopNMapper.class); job.setReducerClass(TopNReducer.class); job.setNumReduceTasks(1); // map()'s output (K,V) job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(Text.class); // reduce()'s output (K,V) job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(Text.class); // args[1] = input directory // args[2] = output directory FileInputFormat.setInputPaths(job, new Path(args[1])); FileOutputFormat.setOutputPath(job, new Path(args[2])); boolean status = job.waitForCompletion(true); THE_LOGGER.info("run(): status="+status); return status ? 
0 : 1; } private static final String INPATH = "input/cat.txt";// 输入文件路径private static final String OUTPATH = "output/cat_out1";// 输出文件路径 public static void main(String[] args) throws Exception { args = new String[3]; args[0] = "10"; args[1] = INPATH; args[2] = OUTPATH; // Make sure there are exactly 3 parameters if (args.length != 3) { THE_LOGGER.warn("usage TopNDriver <N> <input> <output>"); System.exit(1); } THE_LOGGER.info("N="+args[0]); THE_LOGGER.info("inputDir="+args[1]); THE_LOGGER.info("outputDir="+args[2]); int returnStatus = ToolRunner.run(new TopNDriver(), args); System.exit(returnStatus); }}
四、扩展
1、Top5怎么办?传入另一个参数
2、不求前10个而求后10个(最小的10个)怎么办?将 if (top.size() > N) { top.remove(top.firstKey()); } 中的 top.firstKey() 改成 top.lastKey(),即每次淘汰当前最大的键,最终留下的就是最小的 N 条记录。
阅读全文
0 0
- MapReduce/Hadoop的TopN解决方案之键唯一的情况
- MapReduce/Hadoop的TopN解决方案之键不唯一的情况
- Spark的TopN解决方案(键唯一的情况、键不唯一的情况)
- hadoop topN mapreduce编程
- MapReduce之topN
- MapReduce之TopN
- 关于mapreduce的topN的问题
- MapReduce/Hadoop的二次排序解决方案
- MapReduce/Hadoop的左外连接解决方案
- 分布式情况下生成数据库唯一ID的解决方案
- 使用Hadoop和Spark实现TopN算法(1)——唯一键
- Hadoop之MapReduce任务的优化
- hadoop之MapReduce作业的生命周期
- hadoop之MapReduce编程的权限问题
- Hadoop之MapReduce的HelloWorld(七)
- hadoop之shuffle------>MapReduce的心脏i
- Hadoop-MapReduce之WordCount的实现
- MapReduce获取键的分布情况
- 高德地图的奇葩
- servlet全局变量和局部变量
- APP内支付的接入总结(支付宝&APP STORE)
- log4j从配置文件中读取路径
- Unit4
- MapReduce/Hadoop的TopN解决方案之键唯一的情况
- QT之qss教程-qss文件使用方法
- Python编程:从入门到实践的动手试一试答案(第十章)
- MySQL多线程复制
- Kinect Fusion Explorer D2D C++ Sample中函数简介
- rocketmq 启动
- iOS 开发之键盘类型UIKeyboardType
- callable和future和runnable
- session