A Simple MapReduce Example: map, reduce, combiner, and partition End to End


Requirement: use MapReduce to count how often the characters 笑 (laugh), 喜 (joy), 哭 (cry), and 怒 (anger) appear in a UTF-8 TXT file of 《红楼梦》 (Dream of the Red Chamber), use a combiner to cut shuffle I/O, and use a partitioner to split the results across two output files.
Create the project with the MapReduce plugin so that the required Hadoop JARs are imported automatically.

Driver (main class):

package com.zhiyou100;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MyApp {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Separate key and value with ":" in the output files.
        conf.set("mapreduce.output.textoutputformat.separator", ":");
        Path inputPath = new Path("hdfs://master:9000/mark/hlm-utf8.txt");
        Path outputPath = new Path("hdfs://master:9000/result/hml02");
        FileSystem fs = FileSystem.newInstance(conf);
        // Delete the output directory if it already exists.
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        fs.close();

        // A Job object describes one MapReduce job.
        Job job = Job.getInstance(conf, "HLM");
        job.setJarByClass(MyApp.class);
        // Specify the input path.
        FileInputFormat.addInputPath(job, inputPath);
        // Specify the input format class (optional; TextInputFormat is the default).
        job.setInputFormatClass(TextInputFormat.class);
        // Specify the custom Mapper class.
        job.setMapperClass(MyMapper.class);
        // Map output <K, V> types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Partitioner (optional).
        job.setPartitionerClass(MyPartition.class);
        // Number of reduce tasks (optional; two here, to match the partitioner).
        job.setNumReduceTasks(2);
        // Specify the custom Reducer class.
        job.setReducerClass(MyReducer.class);
        job.setCombinerClass(MyCombiner.class);
        // Generic type parameters are erased at compile time, so the job's final
        // output <K, V> types must be set explicitly (these calls can be omitted
        // when <k3, v3> has the same types as <k2, v2>).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Specify the output path.
        FileOutputFormat.setOutputPath(job, outputPath);
        // Specify the output format class (optional; TextOutputFormat is the default).
        job.setOutputFormatClass(TextOutputFormat.class);
        // Submit the job to the cluster and poll until it finishes.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
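Before deploying to the cluster, the same pipeline can be sanity-checked in Hadoop's local mode. The following variant driver is a minimal sketch, not part of the original article: it assumes standard Hadoop 2.x property names, and the class name MyAppLocal and the local input/output paths are hypothetical.

package com.zhiyou100;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical local-mode driver for quick testing without a cluster.
public class MyAppLocal {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "local"); // run the job in-process
        conf.set("fs.defaultFS", "file:///");          // use the local file system
        conf.set("mapreduce.output.textoutputformat.separator", ":");
        Job job = Job.getInstance(conf, "HLM-local");
        job.setJarByClass(MyAppLocal.class);
        job.setMapperClass(MyMapper.class);
        job.setCombinerClass(MyCombiner.class);
        job.setPartitionerClass(MyPartition.class);
        job.setNumReduceTasks(2);
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Hypothetical local paths; adjust to wherever the text file lives.
        FileInputFormat.addInputPath(job, new Path("/tmp/hlm-utf8.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/tmp/hlm-result"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}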

mapper:

package com.zhiyou100;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper extends Mapper<Object, Text, Text, IntWritable> {

    static {
        System.out.println("my_-mapper");
    }

    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();
    // Keywords to count; one record is emitted per clause containing each.
    private static final String[] KEYWORDS = { "笑", "喜", "哭", "怒" };

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line into clauses on Chinese punctuation.
        StringTokenizer st = new StringTokenizer(value.toString(), "《 》 、 ! , 。 ? :;  “ ” ‘ ’ ");
        while (st.hasMoreTokens()) {
            String text = st.nextToken().trim();
            // Tally every clause scanned in a custom counter.
            context.getCounter("ZY", "statement").increment(1);
            for (String kw : KEYWORDS) {
                if (text.contains(kw)) {
                    // Emit <keyword, 1> so that combiner and reducer can both
                    // simply sum the values. (The original emitted the clause
                    // number instead, which is only correct if the combiner
                    // runs exactly once; Hadoop does not guarantee that.)
                    word.set(kw);
                    context.write(word, ONE);
                }
            }
        }
    }
}
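To see what that tokenization actually does, here is a standalone JDK-only sketch (the class name TokenDemo and the sample line are made up): it splits a line on the same punctuation-and-space delimiter set as MyMapper, yielding one clause per token, which is exactly the unit the mapper scans for keywords.

package com.zhiyou100;

import java.util.StringTokenizer;

public class TokenDemo {
    public static void main(String[] args) {
        // Made-up sample line; the delimiter set is identical to MyMapper's.
        String line = "宝玉笑道:“好妹妹,别哭了。”";
        StringTokenizer st = new StringTokenizer(line, "《 》 、 ! , 。 ? :;  “ ” ‘ ’ ");
        while (st.hasMoreTokens()) {
            System.out.println(st.nextToken().trim());
        }
        // Prints three clauses: 宝玉笑道 / 好妹妹 / 别哭了,
        // the first containing 笑 and the last containing 哭.
    }
}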

reducer:

package com.zhiyou100;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    static {
        System.out.println("my_-reducer");
    }

    private final IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}

combiner:

package com.zhiyou100;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

    static {
        System.out.println("my_-combiner");
    }

    private final IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Pre-aggregate on the map side to cut shuffle I/O. The values must be
        // summed (the original counted them with sum += 1), because Hadoop may
        // run the combiner zero, one, or several times; summing keeps the final
        // result correct in every case.
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
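Since the combiner now performs the same associative, commutative summation as the reducer, a common Hadoop idiom (an alternative to the separate class above, not what this article does) is to reuse the reducer as the combiner in the driver:

// In MyApp's main, instead of job.setCombinerClass(MyCombiner.class):
job.setCombinerClass(MyReducer.class);

The separate MyCombiner class is kept here because its static block makes the combiner's place in the execution chain visible in the logs.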

partitioner:

package com.zhiyou100;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class MyPartition extends Partitioner<Text, IntWritable> {

    static {
        System.out.println("my_-partition");
    }

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Route 笑 and 喜 to reducer 0 (part-r-00000) and 哭 and 怒 to
        // reducer 1 (part-r-00001); this is why the driver sets two reduce tasks.
        if (key.toString().contains("笑") || key.toString().contains("喜")) {
            return 0;
        } else {
            return 1;
        }
    }
}
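For contrast, when no partitioner is set, Hadoop falls back to HashPartitioner, which spreads keys over reducers by hash code. Its core logic in the Hadoop source is essentially:

// org.apache.hadoop.mapreduce.lib.partition.HashPartitioner
public int getPartition(K key, V value, int numReduceTasks) {
    // Mask off the sign bit so the modulo result is never negative.
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}

The custom class replaces this hash-based spread with an explicit rule, so related keys land in predictable output files.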

Output:
(screenshot of the two result files, one per partition)

The log lines printed by the static block defined in each class also show the order in which the job loads the Mapper, Combiner, Partitioner, and Reducer classes:
my_-mapper -> my_-combiner -> my_-partition -> my_-reducer
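Incidentally, the custom counter the mapper increments ("ZY" / "statement") can be read back in the driver after the job finishes. A minimal sketch of what the end of MyApp's main could look like (an addition for illustration, not part of the original code):

// Replace the final System.exit line of MyApp with this to print the counter.
if (job.waitForCompletion(true)) {
    long clauses = job.getCounters().findCounter("ZY", "statement").getValue();
    System.out.println("clauses scanned: " + clauses);
    System.exit(0);
}
System.exit(1);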
