topk在mapreduce下面的统计加排序的实现
来源:互联网 发布:三十后换工作 知乎 编辑:程序博客网 时间:2024/05/16 08:16
给定的原始数据集如下:
All of us have read thrilling stories in which the hero had only a limited and specified time to live. Sometimes it was as long as a year, sometimes as short as 24 hours. But always we were interested in discovering just how the doomed hero chose to spend his last days or his last hours. I speak, of course, of free men who have a choice, not condemned criminals whose sphere of activities is strictly delimited.
运行以后的统计结果如下
of 21 a 20 us 15 等等
一共写了2个mapreduce 第一个用来统计单词的总次数,第二个用来进行排序
首先自定义了一个class MyInt
package topk;

/**
 * Boxed-integer wrapper used as a sortable key when ranking word counts.
 * Natural ordering is ascending by the wrapped value.
 */
public class MyInt implements Comparable<MyInt> {

    private Integer value;

    public MyInt(Integer value) {
        this.value = value;
    }

    public Integer getValue() {
        return value;
    }

    public void setValue(Integer value) {
        this.value = value;
    }

    /** Ascending comparison by the wrapped value. */
    @Override
    public int compareTo(MyInt o) {
        return value.compareTo(o.getValue());
    }

    /**
     * equals made consistent with compareTo, as the Comparable contract
     * recommends (the original inherited identity equality from Object).
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof MyInt)) {
            return false;
        }
        return value.equals(((MyInt) obj).value);
    }

    @Override
    public int hashCode() {
        return value.hashCode();
    }
}
第一部分,第一个mapreduce
package topk;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Job 1 of the top-k pipeline: plain word count.
 * Emits one "word\tcount" line per distinct word; job 2 sorts by count.
 */
public class top {

    /** Tokenizes each input line and emits (word, 1) per token. */
    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        // Reused across calls to avoid allocating a new Text per token.
        private final Text outKey = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer st = new StringTokenizer(value.toString());
            while (st.hasMoreTokens()) {
                // Strip a few punctuation characters; all other characters are kept.
                String word = st.nextToken().replaceAll("/", "").replace("'", "").replace(".", "");
                outKey.set(word);
                context.write(outKey, ONE);
            }
        }
    }

    /** Sums the per-word partial counts. */
    public static class Reducer
            extends org.apache.hadoop.mapreduce.Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable total = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // BUG FIX: the original incremented a counter once per value instead of
            // summing the values. That only works while every value is exactly 1;
            // it silently produces wrong counts if a combiner is ever configured.
            int sum = 0;
            for (IntWritable partial : values) {
                sum += partial.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    /**
     * Configures and runs the word-count job.
     *
     * @param in  input directory/file of raw text
     * @param out output directory for "word\tcount" results (must not exist)
     * @return true if the job completed successfully
     */
    public static boolean run(String in, String out)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(top.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        return job.waitForCompletion(true);
    }
}
第二部分,第二个mapreduce
package topk;

import java.io.IOException;
import java.util.Comparator;
import java.util.Map.Entry;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Job 2 of the top-k pipeline: re-keys job 1's "word count" lines by count so
 * Hadoop's shuffle sorts them, writes the full sorted list to the main output,
 * and the top k entries to a MultipleOutputs side output named "topKMOS".
 */
public class topk {

    /** Parses a "word count" line and emits (count, word) so the shuffle sorts by count. */
    public static class Map extends Mapper<Object, Text, IntWritable, Text> {
        IntWritable outKey = new IntWritable();
        Text outValue = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer st = new StringTokenizer(value.toString());
            while (st.hasMoreTokens()) {
                String element = st.nextToken();
                if (Pattern.matches("\\d+", element)) { // the numeric token is the word's count
                    outKey.set(Integer.parseInt(element));
                } else {
                    outValue.set(element);
                }
            }
            context.write(outKey, outValue);
        }
    }

    public static class Reducer
            extends org.apache.hadoop.mapreduce.Reducer<IntWritable, Text, Text, IntWritable> {
        private static MultipleOutputs<Text, IntWritable> mos = null;
        /** Number of top entries to keep. */
        private static final int k = 10;
        // Reversed comparator: highest counts first, so lastKey() is the smallest.
        // NOTE(review): keyed by count alone, so words with EQUAL counts overwrite
        // each other here — at most one word per distinct count reaches the top-k
        // output. Preserved from the original design; confirm this is acceptable.
        private static TreeMap<MyInt, String> tm =
                new TreeMap<MyInt, String>(new Comparator<MyInt>() {
                    @Override
                    public int compare(MyInt o1, MyInt o2) {
                        return o2.compareTo(o1);
                    }
                });

        @Override
        protected void reduce(IntWritable key, java.lang.Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text text : values) {
                // Full sorted list goes to the job's main output.
                context.write(text, key);
                tm.put(new MyInt(key.get()), text.toString());
                // BUG FIX: this branch was empty in the original, so the map grew
                // without bound and the "top k" side output contained every word.
                // Evict the smallest count once the map exceeds k entries.
                if (tm.size() > k) {
                    tm.remove(tm.lastKey());
                }
            }
        }

        @Override
        protected void cleanup(
                org.apache.hadoop.mapreduce.Reducer<IntWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Target directory for the side output, set by run() on the job conf.
            String path = context.getConfiguration().get("topKout");
            mos = new MultipleOutputs<Text, IntWritable>(context);
            Set<Entry<MyInt, String>> set = tm.entrySet();
            for (Entry<MyInt, String> entry : set) {
                mos.write("topKMOS", new Text(entry.getValue()),
                        new IntWritable(entry.getKey().getValue()), path);
            }
            mos.close();
        }
    }

    /**
     * Configures and runs the sort / top-k job.
     *
     * @param in      output directory of job 1 ("word\tcount" lines)
     * @param out     directory for the fully sorted output
     * @param topKout directory for the top-k side output
     */
    public static void run(String in, String out, String topKout)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Where the top k words should be written; read back in cleanup().
        conf.set("topKout", topKout);
        // Replaces the deprecated `new Job(conf, ...)` constructor.
        Job job = Job.getInstance(conf, "Sort");
        job.setJarByClass(topk.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reducer.class);
        // Map output types
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // BUG FIX: the named output's value class must match what cleanup()
        // actually writes (IntWritable); the original declared Text.
        MultipleOutputs.addNamedOutput(
                job, "topKMOS", TextOutputFormat.class, Text.class, IntWritable.class);
        // Input and output directories
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        job.waitForCompletion(true);
    }
}
第三部分,写一个主函数来调用上面二个mapreduce
package topk;

import java.io.IOException;

/**
 * Driver that chains the two jobs: count words first, then sort and
 * extract the top k. The second job only runs if the first succeeds.
 */
public class topkmain {

    public static void main(String args[])
            throws ClassNotFoundException, IOException, InterruptedException {
        // Raw text whose words are counted and ranked.
        final String in = "C:/danci.txt";
        // Word-count output of the first job (input of the second).
        final String wordCout = "C:/outaa/wordCout";
        // Fully sorted word list produced by the second job.
        final String sort = "C:/outaa/sort";
        // Top-k side output of the second job.
        final String topK = "C:/outaa/shuchudejieguo";

        // Guard clause: skip sorting when the counting job did not complete.
        boolean counted = top.run(in, wordCout);
        if (!counted) {
            return;
        }
        topk.run(wordCout, sort, topK);
    }
}
阅读全文
0 0
- topk在mapreduce下面的统计加排序的实现
- MapReduce的TopK统计加排序
- MapReduce TopK统计加排序
- MapReduce实现TopK的示例
- MapReduce实现TopK
- mapreduce-在MongoDB中,用MapReduce实现两个有引用关系的集合的统计
- mapreduce实现简单的流量统计功能
- 基于快速排序的TOPK算法
- MapReduce-TopK
- Spark下的TopK实现(Java)
- 最小堆的实现--topk算法
- 统计数集中出现最多的N个数(topK)
- TopK问题——统计大家最爱玩的游戏
- 一个单词统计的实例,怎样通过MapReduce完成排序?
- 排序算法及其在MapReduce的应用
- 排序算法及其在MapReduce的应用
- 简单的单词个数统计的mapreduce 的代码实现
- mapreduce实现对key的排序
- 通过url抓获得到的html
- 22day 遇到的问题 \内存管理
- Python,set数据类型
- 国内如何打开SnapChat(阅后即焚)——针对安卓
- Retrofit2.0使用详解
- topk在mapreduce下面的统计加排序的实现
- ASM 异常 ORA-00020: maximum number of processes (100) exceeded
- 开机自动启动app,后台保持运行Service
- 华为OJ 字符串加解密
- JAVA小白启蒙篇:第一个SSM框架搭建示例
- hibernate入门
- WEB前端之学会用PS很重要
- python3.5处理异常与python2的不同
- 解析json数据将数据填入表格对应的单元格