hadoop 中的 “hello world” 代码 + 详解

来源:互联网 发布:多机位直播软件 编辑:程序博客网 时间:2024/05/16 13:47

package com.zhiyou.bd17.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
publicclass WordCount {
     
       // 定义map . Mapper的四个参数在这里 第一个参数LongWritable key 代表偏移量,第二个参数Text 代表一条记录, 第三个参数Text 代表输出的key,第四个参数IntWritable 代表value
     publicstatic class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable>{
          
          privateString[]infos;
          privateText oKey =new Text();
          privatefinal IntWritable oValue = new IntWritable(1);
          
          @Override
          protectedvoid map(LongWritable key, Textvalue, Mapper<LongWritable, Text, Text, IntWritable>.Contextcontext)
                   throwsIOException, InterruptedException {
              
              // 解析一行数据,转换成一个单词数组
              infos= value.toString().split("\\s");
              
              for(Stringi :infos){
                   
                   // 把单词形成一个kv对发送给 reducer (单词,1)
                   oKey.set(i);
                   context.write(oKey,oValue);
              }
          }
          
          // 定义reducer 。 reducer 的四个参数 第一个参数 Text 是Mapper传过来的key 第二个参数 IntWritable 是Mapper传过来的value,
          // 第三个参数Text 代表reducer 输出的key 第四个参数IntWritable 代表reducer 输出的value
          publicstatic class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
              privateint sum ;
              privateIntWritableoValue = new IntWritable(0);
              
              @Override
              protectedvoid reduce(Text key, Iterable<IntWritable>values,
                        Reducer<Text, IntWritable, Text, IntWritable>.Contextcontext)
                        throwsIOException, InterruptedException {
                   sum= 0;
                   for(IntWritablevalue : values){
                        
                        sum+= value.get();
                   }
                   // 输出kv(单词,单词的计数)
                   oValue.set(sum);
                   context.write(key,oValue);
              }
          }
          
          
          // 组装一个job 到mr引擎上执行
          publicstatic void main(String[] args)throws IOException, ClassNotFoundException, InterruptedException {
              
              // 构建一个configuration ,用来配置hdfs位置和mr的各项参数
              Configurationconfiguration = new Configuration();
              
              // 创建job 对象
              Jobjob = Job.getInstance(configuration);
              
              job.setJarByClass(WordCount.class);
              job.setJobName("第一个mr作业:wordCount");
              
              // 配置mr执行类
              job.setMapperClass(WordCountMap.class);
               job.setReducerClass(WordCountReducer.class);
              
              // 设置mr的输出类型。如果Mapper和reducer 的输出类型一致,可以将设置mapper的输出类型省略
//            job.setMapOutputKeyClass(Text.class);
//             job.setMapOutputValueClass(IntWritable.class);
              job.setOutputKeyClass(Text.class);
              job.setOutputValueClass(IntWritable.class);
              
              //设置数据源(等待被处理的数据)
              // path 可以指定一个文件或者一个文件夹,如果是文件夹就处理该文件夹下的所有子文件
              PathintputPath = new Path("/test/README.ext");
              
              // 可以多次调用该方法,给mrjob设置多个处理文件的路径
              FileInputFormat.addInputPath(job,intputPath);
              
              // 设置目标数据的存放位置,是一个目录,不是一个文件,而且当前hdfs上不能已有这个目录
              PathoutputPath = new Path("/bd17/output/wordcount");
               outputPath.getFileSystem(configuration).delete(outputPath,true);
              // 设置mrjob的最终输出结果位置,一个mrjob只能有一个输出目录
              FileOutputFormat.setOutputPath(job,outputPath);
              
              // 启动作业,分布式计算交给mr引擎. true:是否打印处理过程
              booleanresult = job.waitForCompletion(true);
              
              System.exit(result? 0 : 1);
          }
     }
}