WordCount代码解析
来源:互联网 发布:bae怎么域名备案 编辑:程序博客网 时间:2024/05/29 04:09
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import java.io.IOException; import java.util.StringTokenizer; @SuppressWarnings("deprecation") public class WordCount { //在map阶段接收输入的<key, value>(key是当前输入的行号,value是对应的内容), //然后对内容进行切词,每切下一个词就将其组织成<word,1>的形式输出(输出即写到context中) //设置map的输入类型为<Object, Text> //输出类型为<Text, IntWritable> public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> { //one表示单词出现1次 private final static IntWritable one = new IntWritable(1); //word存储切下的单词 private Text word = new Text(); @Override public void map(Object key, Text value, Context context) throws IOException, InterruptedException { //参数key表示的是行号,下面并没有用到key Context 理解为一个容器 StringTokenizer itr = new StringTokenizer(value.toString()); //对输入的行进行切词 while (itr.hasMoreTokens()) { //切下单词,存入word word.set(itr.nextToken()); context.write(word, one); } } } //Reducer是对相同key下的所有value进行处理. //在reduce阶段,TaskTracker会接收到<word,{1,1,1,1}>形式的数据,也就是特定单词出现次数的情况 //设置reduce的输入数据类型为<Text, IntWritable> //输出数据类型为<Text, IntWritable> public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> { //result记录单词的个数 private IntWritable result = new IntWritable(); @Override public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int sum = 0; //对获取的<key, value-list>计算value的和 for (IntWritable val : values) { sum += val.get(); } //将频数设置到result中 result.set(sum); //收集结果 context.write(key, result); } } public static void main(String[] args) throws Exception { //args=hdfs://192.168.22.156:9000/input/file1 hdfs://192.168.22.156:9000/output/wordnum Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args) .getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: wordcount <in> <out>"); System.exit(2); } //配置作业名 Job job = new Job(conf, "word count"); //配置作业各个类 job.setJarByClass(WordCount.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); } }
1 0
- WordCount代码解析
- wordcount 代码解析
- Hadoop的WordCount代码解析
- Spark--02WordCount代码解析
- WordCount代码
- storm入门简介及WordCount代码解析(一)
- wordCount的解析
- WordCount原理解析
- 彻底解析wordCount原理
- MapReduce WordCount源码解析
- mr2 wordcount 源码解析
- Hadoop入门WordCount代码
- hadoop wordcount 代码
- Hadoop WordCount代码
- MR代码实例-wordcount
- Trident WordCount代码示例
- wordcount和sort代码
- WordCount代码实现
- Zedboard---实验五点亮另一个数码管
- java基础代码-打印图形-递归
- oracle11 pl/sqldeveloper 使用
- C代码优化
- Linux入门(11)——Ubuntu16.04安装texlive2016并配置texmaker和sublime text3
- WordCount代码解析
- linux安装coreseek
- classpath环境变量配置
- 安卓应用百度地图API(1)-keytool
- java File类
- 悟空(wukong)搜索引擎源代码阅读(待续)
- 关于unity的绝对路径与相对路径的使用
- Linux入门(12)——解决双系统下Ubuntu16.04不能访问Windows分区
- OA系统文档