The MapReduce/Hadoop TopN Solution When Keys Are Not Unique


I. The MapReduce/Hadoop TopN solution for unique keys (covered in the previous article)



II. The case of non-unique keys, i.e., the same key may appear multiple times in the input file

Solution: first convert the non-unique keys into unique keys by running a MapReduce job that merges (sums) all entries sharing the same key, then apply the unique-key TopN solution described in Part I to the aggregated output.
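For example, assuming the input consists of comma-separated "url,frequency" lines (the URLs and counts below are illustrative, not from the article), the aggregation job would turn

url1,5
url2,3
url1,2
url2,4

into

url1,7
url2,7

after which the unique-key TopN job from Part I can be run on this output.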

package topN_hadoop1;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Emits (url, frequency) pairs parsed from comma-separated input lines,
 * so that the reducer can sum the frequencies of identical keys.
 */
public class AggregateByKeyMapper extends
      Mapper<Object, Text, Text, IntWritable> {

   private Text K2 = new Text();
   private IntWritable V2 = new IntWritable();

   @Override
   public void map(Object key, Text value, Context context)
         throws IOException, InterruptedException {
      String valueAsString = value.toString().trim();
      String[] tokens = valueAsString.split(",");
      if (tokens.length != 2) {
         // skip malformed lines
         return;
      }
      String url = tokens[0];
      int frequency = Integer.parseInt(tokens[1]);
      K2.set(url);
      V2.set(frequency);
      context.write(K2, V2);
   }
}


package topN_hadoop1;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Sums the frequencies of each key, turning the non-unique keys
 * produced by the mapper into unique (url, totalFrequency) pairs.
 */
public class AggregateByKeyReducer extends
      Reducer<Text, IntWritable, Text, IntWritable> {

   @Override
   public void reduce(Text key, Iterable<IntWritable> values, Context context)
         throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable value : values) {
         sum += value.get();
      }
      context.write(key, new IntWritable(sum));
   }
}
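Because summation is both associative and commutative, this same reducer class can also be registered as a combiner (the driver below does exactly that via job.setCombinerClass), which pre-aggregates values on the map side and reduces the amount of intermediate data shuffled across the network.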


package topN_hadoop1;

import org.apache.log4j.Logger;

import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the "Aggregate By Key" job: reads text input, merges
 * duplicate keys, and writes the unique (key, sum) pairs as a
 * SequenceFile for the subsequent TopN job.
 */
public class AggregateByKeyDriver extends Configured implements Tool {

   private static Logger THE_LOGGER = Logger.getLogger(AggregateByKeyDriver.class);

   public int run(String[] args) throws Exception {
      Job job = new Job(getConf());
      // HadoopUtil is an external helper class (not shown in this article)
      // that ships the dependency jars under /lib/ to the distributed cache.
      HadoopUtil.addJarsToDistributedCache(job, "/lib/");
      job.setJobName("AggregateByKeyDriver");

      job.setInputFormatClass(TextInputFormat.class);
      job.setOutputFormatClass(SequenceFileOutputFormat.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(IntWritable.class);

      job.setMapperClass(AggregateByKeyMapper.class);
      job.setReducerClass(AggregateByKeyReducer.class);
      // summation is associative and commutative, so the reducer
      // can safely be reused as the combiner
      job.setCombinerClass(AggregateByKeyReducer.class);

      // args[0] = input directory
      // args[1] = output directory
      FileInputFormat.setInputPaths(job, new Path(args[0]));
      FileOutputFormat.setOutputPath(job, new Path(args[1]));

      boolean status = job.waitForCompletion(true);
      THE_LOGGER.info("run(): status=" + status);
      return status ? 0 : 1;
   }

   /**
    * The main driver for the "Aggregate By Key" program.
    * Invoke this method to submit the map/reduce job.
    * @throws Exception when there are communication problems with the job tracker.
    */
   public static void main(String[] args) throws Exception {
      // Make sure there are exactly 2 parameters
      if (args.length != 2) {
         THE_LOGGER.warn("usage AggregateByKeyDriver <input> <output>");
         System.exit(1);
      }
      THE_LOGGER.info("inputDir=" + args[0]);
      THE_LOGGER.info("outputDir=" + args[1]);
      int returnStatus = ToolRunner.run(new AggregateByKeyDriver(), args);
      System.exit(returnStatus);
   }
}
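A minimal sketch of how the two stages might be chained from the shell; the jar name, the paths, and the TopN driver class from Part I are placeholders, not names taken from this article:

hadoop jar topN_hadoop1.jar topN_hadoop1.AggregateByKeyDriver /input/urls /tmp/aggregated
hadoop jar topN_hadoop1.jar topN_hadoop1.TopNDriver /tmp/aggregated /output/topN

Note that this aggregation job writes its output as a SequenceFile (job.setOutputFormatClass(SequenceFileOutputFormat.class)), so the unique-key TopN job from Part I must be configured to read SequenceFile input rather than plain text.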



