MapReduce 编程:TOP N

来源:互联网 发布:mac地址修改 编辑:程序博客网 时间:2024/06/02 03:38

通过 TreeSet 取出 TOP N 的数据。下面的程序是有缺陷的:TreeSet 不保存重复元素,输入中相同的数值只会保留一份,所以当数据里有重复值时,得到的并不是真正的 TOP N。

另外,程序使用了 cleanup 方法。在 MapReduce 中,setup 和 cleanup 在每个 task 里各只执行一次;而 map 方法在读入 input 之后,默认按行调用,每读到一行就执行一次,循环直到数据读完。因此,一些初始化工作可以放到 setup 里去做,cleanup 则用来做收尾和清理。既然 cleanup 只执行一次,这里就在 cleanup 中去除多余的数据,只取我要的 TOP N,再发送给 reduce。如果只有一个 map task,这个程序要不要 reduce 其实无所谓;有多个 map task 时,则需要 reduce 把各个 map 的结果再合并一次。


import java.awt.image.AreaAveragingScaleFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.commons.net.nntp.NewsgroupInfo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SplitLineReader;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class TopTenOrder {


public static class TokenizerMapper extends Mapper<Object, Text, NullWritable, IntWritable> {

private TreeSet<Integer> top10 = new TreeSet<Integer>();


public void map(Object key, Text value, Context context) throws IOException, InterruptedException {

top10.add(Integer.parseInt(value.toString()));


}

public void cleanup(Context context) throws IOException, InterruptedException {
while(top10.size() > 10){
top10.remove(top10.first());
}

   Iterator iterator = top10.iterator();
   while(iterator.hasNext()){
   context.write(NullWritable.get(), new IntWritable(Integer.parseInt(iterator.next().toString())));
   }
}
}


public static class IntSumReducer extends Reducer<NullWritable, IntWritable, NullWritable, IntWritable> {


public void reduce(Text key, Iterable<IntWritable> value, Context context)
throws IOException, InterruptedException {
for(IntWritable val : value){
context.write(NullWritable.get(), val);
}


}
}


public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "TopTenOrder");
job.setJarByClass(TopTenOrder.class);
job.setMapperClass(TokenizerMapper.class);
// job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(IntWritable.class);
job.setNumReduceTasks(1);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
原创粉丝点击