MR WordCount类基本解析

来源：互联网发布：windows安全性弹窗编辑：程序博客网时间：2024/05/16 12:04

一个最简单的WordCount解析：

import java.io.IOException;import java.util.*;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.*;import org.apache.hadoop.mapredpublic class WordCount {public static class Map extends MapReduceBase implementsMapper<LongWritable, Text, Text, IntWritable> {//这个Mapper中前两个参数对应map方法中的input key/value//后两个参数对应于输出的OutputCollect类型private final static IntWritable one = new IntWritable(1);private Text word = new Text();public void map(LongWritable key, Text value,OutputCollector<Text, Writable> output, Reporter reporter)                throws IOException {            String line = value.toString();            StringTokenizer tokenizer = new StringTokenizer(line);            while (tokenizer.hasMoreTokens()) {                word.set(tokenizer.nextToken());                output.collect(word, one);            }        }    }

public static class Reduce extends MapReduceBase implements            Reducer<Text, IntWritable, Text, IntWritable> {        public void reduce(Text key, Iterator<IntWritable> values,                OutputCollector<Text, IntWritable> output, Reporter reporter)                throws IOException {            int sum = 0;            while (values.hasNext()) {                sum += values.next().get();            }

        output.collect(key, new IntWritable(sum));    }}

public static void main(String[] args) throws Exception {        //配置Job        JobConf conf = new JobConf(WordCount.class);        //设置Job名称        conf.setJobName("wordcount");        //设置输出键值类型        conf.setOutputKeyClass(Text.class);        conf.setOutputValueClass(IntWritable.class);        //指定Map/Reduce处理类        conf.setMapperClass(Map.class);        conf.setReducerClass(Reduce.class);        //指定TextInputFormat类为输入方式，该类将文件分解为行，并以换行或回车符作为行结尾        //并以文件中偏移量为键，以每行内容作为值        /*An InputFormat for plain text files.         Files are broken into lines.

    Either linefeed or carriage-return are used to signal end of line.     Keys are the position in the file, and values are the line of text..     */    conf.setInputFormat(TextInputFormat.class);    //指定该输出格式为TextOutputFormat,该类为写入一个文本文件    conf.setOutputFormat(TextOutputFormat.class);    //指定要解析的数据及输出文件路径    FileInputFormat.setInputPaths(conf, new Path(args[0]));    FileOutputFormat.setOutputPath(conf, new Path(args[1]));    //提交Job    JobClient.runJob(conf);}

}

几个重要接口：
1.InputFormat（可以指定）:指定输入源如何解析
将输入源分割为InputSplit（不可指定），使用RecordReader（不可指定）将每个InputSplit转化为输入记录
2.Mapper（用户定义）
3.Combiner（在map节点上，用户可以定义）将在Map之后进行，可以减少网络传输
4.Partitioner（在map节点上，用户可以定义）决定map输出的每个键值对交给哪个reduce任务处理
5.Sort：Map输出经过partition之后，会按键进行一次归并排序；Partition和Sort过程都属于shuffle过程
6.reduce （用户定义）

0 0