A Hands-on WordCount Example with Hadoop 2.0 MapReduce

In Hadoop 2.0, a MapReduce program implements its own logic by extending the two base classes org.apache.hadoop.mapreduce.Mapper and org.apache.hadoop.mapreduce.Reducer. The key methods from their source code are shown below.

Mapper.java

public void run(Context context) throws IOException, InterruptedException {
  setup(context);     // Called once at the beginning of the task.
  while (context.nextKeyValue()) {
    map(context.getCurrentKey(), context.getCurrentValue(), context);
  }
  cleanup(context);   // Called once at the end of the task.
}

/**
 * Called once for each key/value pair in the input split. Most applications
 * should override this, but the default is the identity function.
 */
protected void map(KEYIN key, VALUEIN value,
                   Context context) throws IOException, InterruptedException {
  context.write((KEYOUT) key, (VALUEOUT) value);
}
Reducer.java
public void run(Context context) throws IOException, InterruptedException {
  setup(context);     // Called once at the beginning of the task.
  while (context.nextKey()) {
    reduce(context.getCurrentKey(), context.getValues(), context);
  }
  cleanup(context);   // Called once at the end of the task.
}

/**
 * This method is called once for each key. Most applications will define
 * their reduce class by overriding this method. The default implementation
 * is an identity function.
 */
protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context
                      ) throws IOException, InterruptedException {
  for (VALUEIN value : values) {
    context.write((KEYOUT) key, (VALUEOUT) value);
  }
}
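In both run() templates above, setup() and cleanup() frame the main loop, so they are the place for per-task initialization and teardown. As a minimal sketch of that lifecycle (the class name CaseAwareMapper and the configuration key wordcount.case.sensitive are purely illustrative assumptions, not part of Hadoop or of the original example):

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CaseAwareMapper extends Mapper<Object, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();
    private boolean caseSensitive;   // per-task state, initialized in setup()

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Called once before the first map(): read a (hypothetical) job parameter.
        caseSensitive = context.getConfiguration()
                .getBoolean("wordcount.case.sensitive", true);
    }

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = caseSensitive ? value.toString() : value.toString().toLowerCase();
        StringTokenizer tokens = new StringTokenizer(line);
        while (tokens.hasMoreTokens()) {
            word.set(tokens.nextToken());
            context.write(word, ONE);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Called once after the last map(): release per-task resources here if needed.
    }
}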

In both Mapper and Reducer, the run() method drives the task: it keeps fetching (key, value) pairs from the input and hands them to map() or reduce(), so in most applications only map() and reduce() need to be overridden. MapReduce also requires keys and values to be serializable: a value type only has to implement the Writable interface, while a key type must in addition be comparable, because keys are sorted during the shuffle, so it has to implement WritableComparable.
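To make that concrete, a custom key type would implement WritableComparable: write()/readFields() handle serialization, compareTo() gives the ordering used by the sort/shuffle phase. The sketch below is only an illustration; the class name WordPair is an assumption, not something used elsewhere in this article.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

// Illustrative composite key: serializable and comparable, so the
// framework can sort and group it between the map and reduce phases.
public class WordPair implements WritableComparable<WordPair> {
    private String first = "";
    private String second = "";

    public WordPair() {}                         // no-arg constructor required for deserialization

    public WordPair(String first, String second) {
        this.first = first;
        this.second = second;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(first);
        out.writeUTF(second);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        first = in.readUTF();
        second = in.readUTF();
    }

    @Override
    public int compareTo(WordPair other) {       // ordering used when keys are sorted
        int cmp = first.compareTo(other.first);
        return cmp != 0 ? cmp : second.compareTo(other.second);
    }

    @Override
    public int hashCode() {                      // used by the default HashPartitioner
        return first.hashCode() * 163 + second.hashCode();
    }

    @Override
    public boolean equals(Object o) {
        if (!(o instanceof WordPair)) return false;
        WordPair p = (WordPair) o;
        return first.equals(p.first) && second.equals(p.second);
    }

    @Override
    public String toString() {
        return first + "\t" + second;
    }
}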

Following the source above, here is my own MyWordCount.java:

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyWordCount {

    public static class WordCountMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        private static final IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split each input line into words and emit (word, 1) pairs.
            String line = value.toString();
            StringTokenizer words = new StringTokenizer(line);
            while (words.hasMoreTokens()) {
                word.set(words.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class WordCountReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable totalNum = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum all counts for the same word and emit (word, total).
            int sum = 0;
            Iterator<IntWritable> it = values.iterator();
            while (it.hasNext()) {
                sum += it.next().get();
            }
            totalNum.set(sum);
            context.write(key, totalNum);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "MyWordCount");
        job.setJarByClass(MyWordCount.class);        // class used to locate the job jar
        job.setMapperClass(WordCountMapper.class);   // mapper, reducer and combiner classes
        job.setReducerClass(WordCountReducer.class);
        job.setCombinerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);           // types of the output key/value pairs
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));    // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));  // output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
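To try it out (the jar name and HDFS paths below are only assumptions for illustration), compile the class against the Hadoop 2.x libraries, package it as mywordcount.jar, upload some text files to an input directory on HDFS, and submit the job with:

hadoop jar mywordcount.jar MyWordCount /user/<you>/input /user/<you>/output

The output directory must not exist beforehand, otherwise the job fails at submission. Two small notes on the driver: later 2.x releases deprecate the Job constructor in favour of Job.getInstance(conf, "MyWordCount"), and WordCountReducer can double as the combiner only because summing counts is associative and commutative, so pre-aggregating (word, partial sum) pairs on the map side does not change the final totals; a reducer that computed, say, an average could not be reused as a combiner unchanged.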