Hadoop Example Program: WordCount Explained


A Hadoop MapReduce program consists of three parts: a class extending Mapper, a class extending Reducer, and a main function that configures and submits the job.


The WordCount example counts how often each word appears in text. It takes text files as input and outputs the mapping from each word to its number of occurrences.
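For a concrete illustration, suppose the input directory holds two small files with the following (hypothetical) contents:

    file01: Hello World Bye World
    file02: Hello Hadoop Goodbye Hadoop

The job would then produce:

    Bye      1
    Goodbye  1
    Hadoop   2
    Hello    2
    World    2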


The code, with detailed comments, follows:

package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1); // the constant count 1 emitted with every word
    private Text word = new Text();                            // reusable Text object holding the current word

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString()); // split the input line into whitespace-separated tokens
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());   // store the next token in word
        context.write(word, one);    // emit the key-value pair (word, 1)
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {

    private IntWritable result = new IntWritable(); // reusable IntWritable holding the final count

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;                   // running total for this key, initially 0
      for (IntWritable val : values) {
        sum += val.get();            // accumulate every value associated with this key
      }
      result.set(sum);               // store the total in result
      context.write(key, result);    // emit the key-value pair (word, total count)
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "word count");       // create a new job named "word count"
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);   // set the mapper class
    job.setCombinerClass(IntSumReducer.class);   // set the combiner class (local pre-aggregation on the map side)
    job.setReducerClass(IntSumReducer.class);    // set the reducer class
    job.setOutputKeyClass(Text.class);           // type of the output keys
    job.setOutputValueClass(IntWritable.class);  // type of the output values
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));   // input path (first command-line argument)
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); // output path (second command-line argument)
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
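Between the map and reduce phases the framework sorts and groups all emitted pairs by key, so each reduce call receives one word together with every count emitted for it (or partial sums, if the combiner ran). To see the map-side logic in isolation, here is a minimal standalone sketch, plain Java with no Hadoop dependency; the class name and sample line are made up for illustration:

import java.util.StringTokenizer;

public class TokenizeDemo {
  public static void main(String[] args) {
    // Simulates one map() call: "value" plays the role of a single input line
    String value = "Hello World Bye World";
    StringTokenizer itr = new StringTokenizer(value); // splits on whitespace by default
    while (itr.hasMoreTokens()) {
      // The real mapper would call context.write(word, one) here
      System.out.println(itr.nextToken() + "\t1");
    }
  }
}

Once the real program is packaged into a jar, it is typically launched with something like hadoop jar wordcount.jar org.apache.hadoop.examples.WordCount <in> <out>, where <in> and <out> are HDFS paths and the jar name depends on how you build it.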

WordCount is to Hadoop what HelloWorld is to Java: the entry-level example that gives beginners a simple feel for how Hadoop works. There is still a long road ahead.
