自定义Combiner

来源：互联网发布：互联网广告算法编辑：程序博客网时间：2024/06/05 16:56
package com.ccse.hadoop.combiner;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Word-count job that demonstrates a custom Combiner (map-side reduction).
 *
 * <p>The same class, {@link MyReducer}, is registered both as the combiner and
 * as the reducer: summing partial counts is associative and commutative, so
 * applying it to map output before the shuffle does not change the final totals.
 *
 * @author woshiccna
 */
public class WordCountApp {

    private static final String INPUT_PATH = "hdfs://chaoren1:9000/mapinput";
    private static final String OUTPUT_PATH = "hdfs://chaoren1:9000/mapoutput";

    /**
     * Configures and runs the word-count job, blocking until it finishes.
     *
     * <p>Any existing output directory is deleted first. If the job reports
     * failure the JVM exits with status 1; on success this method returns normally.
     *
     * @param args ignored; input and output locations are the class constants
     * @throws IOException            on file-system or job-submission I/O errors
     * @throws URISyntaxException     if {@code OUTPUT_PATH} is not a valid URI
     * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
     * @throws InterruptedException   if the wait for job completion is interrupted
     */
    public static void main(String[] args) throws IOException, URISyntaxException,
            ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        // Recursively remove the output of a previous run so this run starts clean.
        final FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), conf);
        fileSystem.delete(new Path(OUTPUT_PATH), true);

        // NOTE(review): on Hadoop 2+ this constructor is deprecated in favour of
        // Job.getInstance(conf, name) -- confirm the target version before switching.
        final Job job = new Job(conf, WordCountApp.class.getSimpleName());
        job.setJarByClass(WordCountApp.class);

        FileInputFormat.setInputPaths(job, INPUT_PATH);
        job.setMapperClass(MyMapper.class);

        // Why use a Combiner? It shrinks the map-side output, so less data is
        // transferred during the shuffle and network overhead drops.
        // What are its limits? It is unsuitable for computations such as an
        // average: whenever the result depends on the total number of records,
        // pre-aggregating partial results gives the wrong answer.
        job.setCombinerClass(MyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

        // Surface job failure to the caller/shell instead of silently exiting 0.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }

    /**
     * Emits {@code (token, 1)} for every whitespace-delimited token of each
     * input line, and counts lines that contain "hello" in a custom counter.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        /** Constant count emitted for every token. */
        private static final LongWritable ONE = new LongWritable(1);

        /** Reused output key; avoids allocating a new {@code Text} per token. */
        private final Text word = new Text();

        /**
         * Tokenizes one input line and writes a count of 1 per token.
         *
         * @param key     input key supplied by the input format (not used here)
         * @param value   the text of the current line
         * @param context task context used for counters and output
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            final String line = value.toString();

            // The counter is fetched for every record (not only on a match),
            // exactly as before. It is incremented once per LINE that contains
            // "hello" (case-insensitive), not once per matching word.
            final Counter counter = context.getCounter("Sensitive", "hello");
            if (line.toLowerCase().contains("hello")) {
                counter.increment(1L);
            }

            final StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, ONE);
            }
        }
    }

    /**
     * Sums the counts for a key. Used as both the combiner and the reducer.
     */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        /**
         * Adds up all partial counts for {@code key} and writes the total.
         *
         * @param key     the word being counted
         * @param values  partial counts for the word
         * @param context task context used to write the result
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            long times = 0L;
            // Iterate with a single iterator. The previous code called
            // values.iterator() on every hasNext()/next(), which only terminates
            // when the Iterable returns the same iterator instance each time.
            for (LongWritable count : values) {
                times += count.get();
            }
            context.write(key, new LongWritable(times));
        }
    }
}
图：未设置Combiner之前的效果
图：设置了Combiner后的结果
0 0