hadoop 中的 “hello world” 代码 + 详解

来源:互联网 发布:多机位直播软件 编辑:程序博客网 时间:2024/05/16 13:47

package com.zhiyou.bd17.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
publicclass WordCount {
     
       // 定义map . Mapper的四个参数在这里 第一个参数LongWritable key 代表偏移量,第二个参数Text 代表一条记录, 第三个参数Text 代表输出的key,第四个参数IntWritable 代表value
     publicstatic class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable>{
          
          privateString[]infos;
          privateText oKey =new Text();
          privatefinal IntWritable oValue = new IntWritable(1);
          
          @Override
          protectedvoid map(LongWritable key, Textvalue, Mapper<LongWritable, Text, Text, IntWritable>.Contextcontext)
                   throwsIOException, InterruptedException {
              
              // 解析一行数据,转换成一个单词数组
              infos= value.toString().split("\\s");
              
              for(Stringi :infos){
                   
                   // 把单词形成一个kv对发送给 reducer (单词,1)
                   oKey.set(i);
                   context.write(oKey,oValue);
              }
          }
          
          // 定义reducer 。 reducer 的四个参数 第一个参数 Text 是Mapper传过来的key 第二个参数 IntWritable 是Mapper传过来的value,
          // 第三个参数Text 代表reducer 输出的key 第四个参数IntWritable 代表reducer 输出的value
          publicstatic class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
              privateint sum ;
              privateIntWritableoValue = new IntWritable(0);
              
              @Override
              protectedvoid reduce(Text key, Iterable<IntWritable>values,
                        Reducer<Text, IntWritable, Text, IntWritable>.Contextcontext)
                        throwsIOException, InterruptedException {
                   sum= 0;
                   for(IntWritablevalue : values){
                        
                        sum+= value.get();
                   }
                   // 输出kv(单词,单词的计数)
                   oValue.set(sum);
                   context.write(key,oValue);
              }
          }
          
          
          // 组装一个job 到mr引擎上执行
          publicstatic void main(String[] args)throws IOException, ClassNotFoundException, InterruptedException {
              
              // 构建一个configuration ,用来配置hdfs位置和mr的各项参数
              Configurationconfiguration = new Configuration();
              
              // 创建job 对象
              Jobjob = Job.getInstance(configuration);
              
              job.setJarByClass(WordCount.class);
              job.setJobName("第一个mr作业:wordCount");
              
              // 配置mr执行类
              job.setMapperClass(WordCountMap.class);
               job.setReducerClass(WordCountReducer.class);
              
              // 设置mr的输出类型。如果Mapper和reducer 的输出类型一致,可以将设置mapper的输出类型省略
//            job.setMapOutputKeyClass(Text.class);
//             job.setMapOutputValueClass(IntWritable.class);
              job.setOutputKeyClass(Text.class);
              job.setOutputValueClass(IntWritable.class);
              
              //设置数据源(等待被处理的数据)
              // path 可以指定一个文件或者一个文件夹,如果是文件夹就处理该文件夹下的所有子文件
              PathintputPath = new Path("/test/README.ext");
              
              // 可以多次调用该方法,给mrjob设置多个处理文件的路径
              FileInputFormat.addInputPath(job,intputPath);
              
              // 设置目标数据的存放位置,是一个目录,不是一个文件,而且当前hdfs上不能已有这个目录
              PathoutputPath = new Path("/bd17/output/wordcount");
               outputPath.getFileSystem(configuration).delete(outputPath,true);
              // 设置mrjob的最终输出结果位置,一个mrjob只能有一个输出目录
              FileOutputFormat.setOutputPath(job,outputPath);
              
              // 启动作业,分布式计算交给mr引擎. true:是否打印处理过程
              booleanresult = job.waitForCompletion(true);
              
              System.exit(result? 0 : 1);
          }
     }
}