WordCount代码实现详解

来源:互联网 发布:java jna调用64位dll 编辑:程序博客网 时间:2024/06/10 22:55
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements.  See the NOTICE file * distributed with this work for additional information * regarding copyright ownership.  The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package org.apache.hadoop.examples;//导入必要的packageimport java.io.IOException;        //报错类                                       import java.util.StringTokenizer;  //StringTokenizer类,用于将空白字符作为分割符的类import org.apache.hadoop.conf.Configuration;//Hadoop中用于读取配置信息的类import org.apache.hadoop.fs.Path;           //有关文件系统输入输出数据的类import org.apache.hadoop.io.IntWritable;    //封装定义了IntWritable类import org.apache.hadoop.io.Text;           //封装定义了Text类import org.apache.hadoop.mapreduce.Job;     //封装定义了Job类import org.apache.hadoop.mapreduce.Mapper;  //封装定义了Mapper类import org.apache.hadoop.mapreduce.Reducer; //封装定义了Reducer类import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;   //文件输入要用到的类import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; //文件输出要用到的类import org.apache.hadoop.util.GenericOptionsParser;             //GenericOptionsParser类,用来解释常用hadoop命令,并根据需要为Configuration对象设置相应的值public class WordCount {  public static class TokenizerMapper        extends Mapper{         //自定义的TokenizerMapper类,继承自前面导入的Mapper类        private final static IntWritable one = new IntWritable(1);  //实例化了一个IntWritable类的one对象并赋值为常量1    private Text word = new Text();                             //实例化了一个Text类的对象word          public void map(Object key, Text value, Context context     //定义Map方法                    ) throws IOException, InterruptedException {//这里说一下context类,它是Mapper的一个内部类,它用来与MapReduce系统进行通信,如把map的结果传给reduce处理。简单的说顶级接口用它在map或是reduce任务中跟踪task的状态,MapContext就是记录了map执行的上下文,在mapper类中,这个context可以存储一些job conf的信息,同时context作为了map和reduce执行中各个函数的一个桥梁,我们可以在map函数中处理这个信息      StringTokenizer itr = new StringTokenizer(value.toString());//实例化了一个以空白字符为分隔符的StringTokenizer类的对象itr      while (itr.hasMoreTokens()) {//如果判断还有下一个分隔符(空格)        word.set(itr.nextToken()); //则输出并返回之间的字符串给word        context.write(word, one);  //context.write方法将(word,1)这样的二元组存入context中      }    }  }    public static class IntSumReducer                           //自定义的IntSumReducer类,继承自前面导入的Reducer类                                    extends Reducer {    private IntWritable result = new IntWritable();           //实例化了一个IntWritable类的result对象    public void reduce(Text key, Iterable values,Context context//定义Reduce方法,这里迭代器(Iterator)是一种设计模式,它是一个对象,它可以遍历并选择序列(IntWritable)中的对象,而开发人员不需要了解该序列的底层结构。                       ) throws IOException, InterruptedException {      int sum = 0;      for (IntWritable val : values) {        sum += val.get();//将该词的出现次数相加      }      result.set(sum);//将sum赋给result      context.write(key, result);//输出最终结果    }  }  public static void main(String[] args) throws Exception {    Configuration conf = new Configuration();  //运行MapReduce程序前都要初始化Configuration,该类主要是读取MapReduce系统配置信息,这些信息包括hdfs还有MapReduce,也就是安装hadoop时候的配置文件例如:core-site.xml、hdfs-site.xml和mapred-site.xml等等文件里的信息,有些童鞋不理解为啥要这么做,这个是没有深入思考MapReduce计算框架造成,我们程序员开发MapReduce时候只是在填空,在map函数和reduce函数里编写实际进行的业务逻辑,其它的工作都是交给MapReduce框架自己操作的,但是至少我们要告诉它怎么操作啊,比如hdfs在哪里,MapReduce的jobstracker在哪里,而这些信息就在conf包下的配置文件里。    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();    if (otherArgs.length < 2) {      System.err.println("Usage: wordcount  [...] ");      System.exit(2);    }//If的语句好理解,就是运行WordCount程序时候一定是两个参数,如果不是就会报错退出。至于第一句里的GenericOptionsParser类,它是用来解释常用hadoop命令,并根据需要为Configuration对象设置相应的值    Job job = Job.getInstance(conf, "word count");//用Job.getInstance方法设置作业名为word count    job.setJarByClass(WordCount.class);           //为job的输出数据设置Key类    job.setMapperClass(TokenizerMapper.class);    //设置Mapper类(Map阶段使用)    job.setCombinerClass(IntSumReducer.class);    //设置Combiner类(中间合并结果)    job.setReducerClass(IntSumReducer.class);     //设置Reducer类(Reduce阶段使用)    job.setOutputKeyClass(Text.class);            //为job的输出数据设置Key类,规定Reduce输出的Key类型为Text    job.setOutputValueClass(IntWritable.class);   //设置Reduce输出的Value类型为IntWritable        for (int i = 0; i < otherArgs.length - 1; ++i) { //设置输入输出路径            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));    }    FileOutputFormat.setOutputPath(job,      new Path(otherArgs[otherArgs.length - 1]));    System.exit(job.waitForCompletion(true) ? 0 : 1);//等待任务执行完毕退出  }}

1 0