YARN（Hadoop）学习笔记（3）

来源：互联网 | 发布：linux查看80端口 | 编辑：程序博客网 | 时间：2024/06/10 03:17

<span style="font-size:18px;"><strong>//WordCount源码阅读笔记</strong></span>

<span style="font-size:18px;"><strong>
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.ReduceContext;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Annotated reading notes on the Hadoop WordCount example.
 *
 * <p>The job counts how many times each whitespace-separated token occurs in the input path and
 * writes {@code <token, count>} pairs to the output path.
 */
public class WordCount {

  /**
   * Mapper that splits every input value into tokens and emits {@code <token, 1>} for each one.
   *
   * <p>Background notes:
   * <ul>
   *   <li>The type parameters of {@code Mapper} are
   *       {@code <KEYIN, VALUEIN, KEYOUT, VALUEOUT>}.</li>
   *   <li>{@code IntWritable} is the Writable wrapper around a Java {@code int}. It implements
   *       {@code WritableComparable}, which in turn extends the {@code Writable} interface.</li>
   *   <li>{@code Writable} is the serialization format Hadoop designed for large-scale
   *       processing. Serialization encodes an object as a byte stream, so the object can be
   *       stored on disk, deserialized later, and sent over the network between nodes.</li>
   *   <li>{@code Text} is similar to {@code IntWritable}: it implements
   *       {@code WritableComparable} and also extends {@code BinaryComparable}. It is the
   *       Writable wrapper for UTF-8 encoded text.</li>
   *   <li>UTF-8 is one encoding of Unicode; UTF-16 and UTF-32 are others. Java represents
   *       characters using Unicode. An encoding is a mapping from characters to their binary
   *       representation.</li>
   *   <li>{@code Object} is defined in {@code java.lang}; it declares {@code equals} and
   *       {@code hashCode} (used to decide whether two objects are equal) and {@code toString}
   *       (used to render an object as a {@code String}).</li>
   *   <li>{@code TokenizerMapper} is declared by this example itself. MapReduce also ships
   *       ready-made mappers, e.g. {@code ChainMapper} (chained jobs) and {@code InverseMapper}
   *       (swaps key and value).</li>
   * </ul>
   */
  public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

    // IntWritable has several constructors, including IntWritable(int value) { set(value); }
    // and IntWritable() {}. It holds a single int field that the one-argument constructor
    // assigns, as done here. The constant 1 is reused for every emitted token.
    private static final IntWritable one = new IntWritable(1);

    // Reused output key, so no new Text is allocated per token.
    private final Text word = new Text();

    /**
     * Records every token of {@code value} as {@code <token, 1>} by calling
     * {@code context.write()}.
     *
     * <p>{@code TokenizerMapper} extends {@code Mapper}, so this overrides {@code Mapper.map}.
     * {@code Context} is an abstract class that implements
     * {@code MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT>}.
     *
     * @param key     input key; not used by this mapper
     * @param value   input text to tokenize
     * @param context sink for the emitted {@code <token, 1>} pairs
     * @throws IOException          declared by the overridden method
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      // value is a Text; its String form is handed to a StringTokenizer.
      // StringTokenizer is a subclass of Object that implements the Enumeration interface.
      // It is a utility class that breaks a String into tokens; one of its constructors is
      // StringTokenizer(String str), which splits on whitespace by default.
      // Its core methods are boolean hasMoreTokens(), boolean hasMoreElements() and
      // String nextToken(). hasMoreTokens() reports whether any tokens remain and returns the
      // same value as hasMoreElements(); nextToken() returns the next token.
      // Enumeration is an interface for iterating over a series of elements (it is unrelated
      // to the enum keyword).
      // StringTokenizer is simpler to use than StreamTokenizer: it does not distinguish among
      // identifiers, numbers and quoted strings, nor does it recognize and skip comments.
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        // context is the Context object; write(KEYOUT key, VALUEOUT value) is defined in
        // TaskInputOutputContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.
        context.write(word, one);
      }
    }
  }

  /**
   * Reducer that sums all counts received for a key and emits {@code <key, sum>}.
   *
   * <p>Its input and output types are both {@code <Text, IntWritable>}, which is what lets
   * {@code main} register the same class as both combiner and reducer.
   */
  public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output value holding the total for the current key.
    private final IntWritable result = new IntWritable();

    /**
     * Adds up every value for {@code key}. Each input has the shape
     * {@code <word, (1, 1, 1, 1)>}, so the value side is walked with an iterator and summed.
     *
     * <p>Background notes: {@code Context} is an abstract class that implements
     * {@code ReduceContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT>}. {@code Iterable<T>} hands out an
     * {@code Iterator} for traversing the elements of a container. Java containers fall into two
     * main families, {@code Map} (key/value pairs, e.g. {@code Hashtable}) and
     * {@code Collection} (e.g. {@code ArrayList}, {@code LinkedList}, {@code Stack},
     * {@code Queue}).
     *
     * @param key     the word being counted
     * @param values  the partial counts collected for {@code key}
     * @param context sink for the emitted {@code <key, sum>} pair
     * @throws IOException          declared by the overridden method
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      // result is the number of occurrences of key: if key is "hello" and it occurs 9 times,
      // result is 9.
      result.set(sum);
      context.write(key, result);
    }
  }

  /**
   * Configures the word-count job, submits it and exits with the job's status.
   *
   * @param args generic Hadoop options followed by exactly two arguments: input path and
   *             output path
   * @throws Exception if configuring or running the job fails
   */
  public static void main(String[] args) throws Exception {
    // Configuration is Hadoop's class for handling configuration; its set() and get() methods
    // write and read configuration properties.
    Configuration conf = new Configuration();

    // GenericOptionsParser is the Hadoop utility that parses the standard command-line options.
    // It lets an application specify the namenode, the jobtracker and additional configuration
    // resources; whatever it does not consume is returned by getRemainingArgs().
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }

    /*
     * Create the Job instance named "word count".
     * A Job lets the user configure the job, read its configuration, submit it, track its
     * progress and control its execution. JobContext is the read-only view (job ID, Mapper
     * class, Reducer class, ...); Job adds the setters, progress tracking, submission and
     * control methods on top of it.
     * Job.getInstance(...) is used because the Job(Configuration, String) constructor is
     * deprecated.
     */
    Job job = Job.getInstance(conf, "word count");

    /*
     * Static members are initialized when the class is loaded and initialized; instance members
     * are initialized when an object is created with new.
     * A Class object is the special Java object used for RTTI (run-time type information).
     * Compiling a class produces a .class file of the same name, from which the Class object
     * is created at load time. WordCount.class yields the Class object of WordCount; on an
     * instance, getClass() returns the same object.
     * Job.setJarByClass(Class<?> cls) locates the jar (here the *-examples-*.jar) by finding
     * where the given class was loaded from.
     */
    job.setJarByClass(WordCount.class);

    /*
     * Job.setMapperClass(Class<? extends Mapper> cls) sets the Mapper used by the job.
     */
    job.setMapperClass(TokenizerMapper.class);

    /*
     * When the maps produce a lot of data, network bandwidth becomes the bottleneck. One way to
     * shrink what is shipped to the reducers is a Combiner: it merges each map's output on the
     * map side first, reducing the amount of data transferred. A Combiner is often called a
     * "local Reduce"; its output is what the Reduce finally receives as input.
     * Job.setCombinerClass(Class<? extends Reducer> cls) registers a Reducer subclass as the
     * combiner.
     */
    job.setCombinerClass(IntSumReducer.class);

    /*
     * Job.setReducerClass(Class<? extends Reducer> cls) sets the Reducer used by the job,
     * here IntSumReducer.
     */
    job.setReducerClass(IntSumReducer.class);

    /*
     * Job.setOutputKeyClass(Class<?> theClass) sets the key class of the job output,
     * here Text.class.
     */
    job.setOutputKeyClass(Text.class);

    /*
     * Job.setOutputValueClass(Class<?> theClass) sets the value class of the job output,
     * here IntWritable.class.
     */
    job.setOutputValueClass(IntWritable.class);

    /*
     * FileInputFormat.addInputPath(Job job, Path path) adds a path to the list of inputs of the
     * map-reduce job. The path is a newly created Path(otherArgs[0]); job is the Job built
     * above. Path is defined in org.apache.hadoop.fs and has several constructors, such as
     * Path(String pathString) and Path(String parent, String child).
     */
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));

    /*
     * FileOutputFormat is the output-side counterpart of FileInputFormat;
     * setOutputPath(Job job, Path outputDir) is one of its public static methods and sets the
     * job's output directory to outputDir.
     */
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    /*
     * System.exit(int) terminates the JVM with the given status. The process exits with 0 when
     * waitForCompletion(true) returns true and with 1 otherwise; status 2 is used above for a
     * usage error.
     */
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
</strong></span>

0 0