Implementing Top-K word counting and sorting with MapReduce


The given raw data set is as follows:

All of us have read thrilling stories in which the hero had only a limited and specified time to live. Sometimes it was as long as a year, sometimes as short as 24 hours. But always we were interested in discovering just how the doomed hero chose to spend his last days or his last hours. I speak, of course, of free men who have a choice, not condemned criminals whose sphere of activities is strictly delimited.

After running, the word-count results look like this:

of 21, a 20, us 15, and so on.

Two MapReduce jobs are written in total: the first counts the total occurrences of each word, and the second sorts the results.
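Concretely, the data flows through the pipeline like this (the paths are the ones used by the driver in Part 3):

C:/danci.txt
  → job 1 (class top): tokenize and count each word → C:/outaa/wordCout, "word<TAB>count" lines
  → job 2 (class topk): swap to (count, word) so Hadoop sorts by count; the reducer keeps the K largest entries in a TreeMap
  → C:/outaa/sort (full sorted list) and C:/outaa/shuchudejieguo (top K only)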

First, a custom class MyInt is defined:

package topk;

public class MyInt implements Comparable<MyInt> {
    private Integer value;

    public MyInt(Integer value) {
        this.value = value;
    }

    public Integer getValue() {
        return value;
    }

    public void setValue(Integer value) {
        this.value = value;
    }

    @Override
    public int compareTo(MyInt o) {
        // Delegate to Integer's natural ascending order.
        return value.compareTo(o.getValue());
    }
}
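MyInt exists only so that word counts can serve as TreeMap keys under a custom ordering in the second job. Here is a minimal standalone sketch of that usage, reusing the sample counts from the output above (the MyIntDemo class name is mine, not from the original):

package topk;

import java.util.Comparator;
import java.util.TreeMap;

public class MyIntDemo {
    public static void main(String[] args) {
        // Same descending comparator the second job's reducer uses.
        TreeMap<MyInt, String> tm = new TreeMap<MyInt, String>(new Comparator<MyInt>() {
            @Override
            public int compare(MyInt o1, MyInt o2) {
                return o2.compareTo(o1); // reverse the natural ascending order
            }
        });
        tm.put(new MyInt(21), "of");
        tm.put(new MyInt(20), "a");
        tm.put(new MyInt(15), "us");
        // firstKey() is now the largest count, so iteration runs high-to-low.
        System.out.println(tm.firstKey().getValue()); // prints 21
    }
}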

Part 1: the first MapReduce job

package topk;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class top {

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        IntWritable count = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer st = new StringTokenizer(value.toString());
            while (st.hasMoreTokens()) {
                // Strip a few punctuation characters before emitting the word.
                String word = st.nextToken().replaceAll("/", "").replace("'", "").replace(".", "");
                context.write(new Text(word), count);
            }
        }
    }

    public static class Reducer
            extends org.apache.hadoop.mapreduce.Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            // Sum the values rather than merely counting them, so the job stays
            // correct even if a combiner is added later.
            for (IntWritable one : values) {
                count += one.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static boolean run(String in, String out)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(top.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        return job.waitForCompletion(true);
    }
}
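With the default TextOutputFormat, this job writes plain "word<TAB>count" lines, which is exactly the format the second job's mapper tokenizes. Given the sample counts above, C:/outaa/wordCout/part-r-00000 would contain lines such as:

of	21
a	20
us	15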

Part 2: the second MapReduce job
package topk;

import java.io.IOException;
import java.util.Comparator;
import java.util.Map.Entry;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class topk {

    public static class Map extends Mapper<Object, Text, IntWritable, Text> {
        IntWritable outKey = new IntWritable();
        Text outValue = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is "word<TAB>count"; swap them so the count
            // becomes the key and Hadoop sorts by it.
            StringTokenizer st = new StringTokenizer(value.toString());
            while (st.hasMoreTokens()) {
                String element = st.nextToken();
                if (Pattern.matches("\\d+", element)) { // the word's count
                    outKey.set(Integer.parseInt(element));
                } else {                                // the word itself
                    outValue.set(element);
                }
            }
            context.write(outKey, outValue);
        }
    }

    public static class Reducer
            extends org.apache.hadoop.mapreduce.Reducer<IntWritable, Text, Text, IntWritable> {
        private MultipleOutputs<Text, IntWritable> mos = null;
        private static final int k = 10;
        // Descending TreeMap holding the k highest counts seen so far.
        private TreeMap<MyInt, String> tm = new TreeMap<MyInt, String>(new Comparator<MyInt>() {
            @Override
            public int compare(MyInt o1, MyInt o2) {
                return o2.compareTo(o1);
            }
        });

        @Override
        protected void reduce(IntWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text text : values) {
                context.write(text, key);
                tm.put(new MyInt(key.get()), text.toString());
                if (tm.size() > k) {
                    // Evict the smallest count so the map never holds more than k entries.
                    tm.remove(tm.lastKey());
                }
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            String path = context.getConfiguration().get("topKout");
            mos = new MultipleOutputs<Text, IntWritable>(context);
            Set<Entry<MyInt, String>> set = tm.entrySet();
            for (Entry<MyInt, String> entry : set) {
                mos.write("topKMOS", new Text(entry.getValue()),
                        new IntWritable(entry.getKey().getValue()), path);
            }
            mos.close();
        }
    }

    public static void run(String in, String out, String topKout)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Directory where the top-K words will be written.
        conf.set("topKout", topKout);

        Job job = Job.getInstance(conf, "Sort");
        job.setJarByClass(topk.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reducer.class);
        // Map output types.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Reduce output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Register the named output used by MultipleOutputs for the top-K list.
        MultipleOutputs.addNamedOutput(job, "topKMOS", TextOutputFormat.class,
                Text.class, IntWritable.class);
        // Input and output directories.
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        job.waitForCompletion(true);
    }
}
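The heart of this reducer is the bounded TreeMap: insert every (count, word) pair and evict the smallest entry whenever the map grows past k. Here is a minimal standalone sketch of just that pattern (the TopKSketch class name is mine, and the counts 9 and 8 are invented for the demo):

package topk;

import java.util.Comparator;
import java.util.Map.Entry;
import java.util.TreeMap;

public class TopKSketch {
    public static void main(String[] args) {
        final int k = 3; // the real reducer uses k = 10
        TreeMap<MyInt, String> tm = new TreeMap<MyInt, String>(new Comparator<MyInt>() {
            @Override
            public int compare(MyInt o1, MyInt o2) {
                return o2.compareTo(o1); // largest count first
            }
        });
        int[] counts = {21, 20, 15, 9, 8};
        String[] words = {"of", "a", "us", "the", "in"};
        for (int i = 0; i < counts.length; i++) {
            tm.put(new MyInt(counts[i]), words[i]);
            if (tm.size() > k) {
                tm.remove(tm.lastKey()); // evict the current smallest count
            }
        }
        for (Entry<MyInt, String> e : tm.entrySet()) {
            System.out.println(e.getValue() + "\t" + e.getKey().getValue());
        }
        // prints only the k largest: of 21, a 20, us 15
    }
}

Note that because MyInt compares only by value, two words with the same count collide on the same TreeMap key and one overwrites the other; a production version would break ties, for example by comparing the words as well.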
Part 3: a main method that chains the two MapReduce jobs above

package topk;

import java.io.IOException;

public class topkmain {
    public static void main(String[] args)
            throws ClassNotFoundException, IOException, InterruptedException {
        // Text whose words will be counted and sorted.
        String in = "C:/danci.txt";
        // Output directory for the word-count job.
        String wordCout = "C:/outaa/wordCout";
        // Output directory for the full sorted result.
        String sort = "C:/outaa/sort";
        // Output directory for the top K entries.
        String topK = "C:/outaa/shuchudejieguo";
        // Start the sort job only once the word-count job has succeeded.
        if (top.run(in, wordCout)) {
            topk.run(wordCout, sort, topK);
        }
    }
}
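One practical caveat: Hadoop throws an exception if a job's output directory already exists, so the directories under C:/outaa must be removed between runs. A sketch of a helper that main could call first (OutputCleaner and deleteIfExists are hypothetical names, not part of the original post):

package topk;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OutputCleaner {
    // Recursively delete a previous run's output directory, if present.
    public static void deleteIfExists(Configuration conf, String dir) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path(dir);
        if (fs.exists(p)) {
            fs.delete(p, true); // true = recursive
        }
    }
}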


