MR WordCount类基本解析

来源:互联网 发布:windows安全性弹窗 编辑:程序博客网 时间:2024/05/16 12:04

一个最简单的WordCount解析:

import java.io.IOException;import java.util.*;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.*;import org.apache.hadoop.mapredpublic class WordCount {public static class Map extends MapReduceBase implementsMapper<LongWritable, Text, Text, IntWritable> {//这个Mapper中前两个参数对应map方法中的input key/value//后两个参数对应于输出的OutputCollect类型private final static IntWritable one = new IntWritable(1);private Text word = new Text();public void map(LongWritable key, Text value,OutputCollector<Text, Writable> output, Reporter reporter)                throws IOException {            String line = value.toString();            StringTokenizer tokenizer = new StringTokenizer(line);            while (tokenizer.hasMoreTokens()) {                word.set(tokenizer.nextToken());                output.collect(word, one);            }        }    }
public static class Reduce extends MapReduceBase implements            Reducer<Text, IntWritable, Text, IntWritable> {        public void reduce(Text key, Iterator<IntWritable> values,                OutputCollector<Text, IntWritable> output, Reporter reporter)                throws IOException {            int sum = 0;            while (values.hasNext()) {                sum += values.next().get();            }
        output.collect(key, new IntWritable(sum));    }}
public static void main(String[] args) throws Exception {        //配置Job        JobConf conf = new JobConf(WordCount.class);        //设置Job名称        conf.setJobName("wordcount");        //设置输出键值类型        conf.setOutputKeyClass(Text.class);        conf.setOutputValueClass(IntWritable.class);        //指定Map/Reduce处理类        conf.setMapperClass(Map.class);        conf.setReducerClass(Reduce.class);        //指定TextInputFormat类为输入方式,该类将文件分解为行,并以换行或回车符作为行结尾        //并以文件中偏移量为键,以每行内容作为值        /*An InputFormat for plain text files.         Files are broken into lines. 
    Either linefeed or carriage-return are used to signal end of line.     Keys are the position in the file, and values are the line of text..     */    conf.setInputFormat(TextInputFormat.class);    //指定该输出格式为TextOutputFormat,该类为写入一个文本文件    conf.setOutputFormat(TextOutputFormat.class);    //指定要解析的数据及输出文件路径    FileInputFormat.setInputPaths(conf, new Path(args[0]));    FileOutputFormat.setOutputPath(conf, new Path(args[1]));    //提交Job    JobClient.runJob(conf);}

}

几个重要接口:
1.InputFormat(可以指定):指定输入源如何解析
将输入源分割为InputSplit(不可指定),使用RecordReader(不可指定)将每个InputSplit转化为输入记录
2.Mapper(用户定义)
3.Combiner(在map节点上,用户可以定义)将在Map之后进行,可以减少网络传输
4.Partitioner(在map节点上执行,用户可以定义)将确定map输出的每条结果交给哪个reduce任务处理
5.Sort:Map输出经过Partition分区之后,会有一次排序与归并过程;Partition和Sort过程合起来即是shuffle过程
6.reduce (用户定义)

0 0
原创粉丝点击