WordCount Code Analysis

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;

    import java.io.IOException;
    import java.util.StringTokenizer;

    public class WordCount {

        // In the map phase we receive input <key, value> pairs (key is the byte offset
        // of the current line in the input file, value is the line's content), tokenize
        // the content, and emit each token as <word, 1> (emitting means writing to the Context).
        // The map input type is <Object, Text>;
        // the output type is <Text, IntWritable>.
        public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
            // one represents a single occurrence of a word
            private final static IntWritable one = new IntWritable(1);
            // word holds the token just split off
            private Text word = new Text();

            @Override
            public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
                // key (the byte offset) is not used below; Context can be thought of as a container
                StringTokenizer itr = new StringTokenizer(value.toString()); // tokenize the input line
                while (itr.hasMoreTokens()) { // split off each token and store it in word
                    word.set(itr.nextToken());
                    context.write(word, one);
                }
            }
        }

        // The Reducer processes all values that share the same key.
        // In the reduce phase, the reduce task receives data of the form <word, {1,1,1,1}>,
        // i.e. the occurrences of a particular word.
        // The reduce input type is <Text, IntWritable>;
        // the output type is <Text, IntWritable>.
        public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
            // result records the count for a word
            private IntWritable result = new IntWritable();

            @Override
            public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                int sum = 0;
                // sum the values of the received <key, value-list>
                for (IntWritable val : values) {
                    sum += val.get();
                }
                // store the count in result
                result.set(sum);
                // collect the result
                context.write(key, result);
            }
        }

        public static void main(String[] args) throws Exception {
            // e.g. args = hdfs://192.168.22.156:9000/input/file1 hdfs://192.168.22.156:9000/output/wordnum
            Configuration conf = new Configuration();
            String[] otherArgs = new GenericOptionsParser(conf, args)
                    .getRemainingArgs();
            if (otherArgs.length != 2) {
                System.err.println("Usage: wordcount <in> <out>");
                System.exit(2);
            }
            // configure the job name (Job.getInstance replaces the deprecated new Job(conf, name))
            Job job = Job.getInstance(conf, "word count");
            // configure the job's classes
            job.setJarByClass(WordCount.class);
            job.setMapperClass(TokenizerMapper.class);
            // the reducer doubles as a combiner: summing is associative and commutative,
            // so partial sums can safely be computed on the map side
            job.setCombinerClass(IntSumReducer.class);
            job.setReducerClass(IntSumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
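To make the <word, {1,1,1,1}> shape mentioned in the reducer comment concrete, here is a hand-worked trace of a single input line through the job (illustrative only, not actual program output):

    input line    : hello world hello
    map emits     : <hello, 1>  <world, 1>  <hello, 1>
    combine/shuffle groups by key : <hello, {1, 1}>  <world, {1}>
    reduce emits  : hello   2
                    world   1

Because IntSumReducer is also registered as the combiner, these per-key partial sums are already computed on the map side, which reduces the amount of data shuffled to the reducers.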
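To run the job, package the class into a jar and submit it through the hadoop client. This is a minimal sketch: the jar name wordcount.jar is only a placeholder, and it assumes a configured Hadoop client on the PATH.

    hadoop jar wordcount.jar WordCount \
        hdfs://192.168.22.156:9000/input/file1 \
        hdfs://192.168.22.156:9000/output/wordnum

    # inspect the result
    hadoop fs -cat hdfs://192.168.22.156:9000/output/wordnum/part-r-00000

Note that FileOutputFormat.setOutputPath requires the output directory to not exist yet; delete it first (hadoop fs -rm -r ...) before rerunning the job.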

