MapReduce --- chained (chain) operations


Sample data:

hello world of tom1
hello world of tom1
hello world of tom2
hello world of tom3
hello world of tom3
hello world of tom4
hello world of tom4

Overall approach (a trace of one sample record follows the step list):

1. Mapper1 (splits each line into words)

2. Mapper2 (filters out the word "of")

3. Mapper2_2 (filters out words starting with "tom")

4. Mapper3 (the reducer's output is Mapper3's input; filters out words whose count is not greater than 1)
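
As a quick sanity check, this is how the first sample line flows through the chain (worked out by hand from the classes below):

    hello world of tom1
      Mapper1   -> (hello,1) (world,1) (of,1) (tom1,1)
      Mapper2   -> (hello,1) (world,1) (tom1,1)        ("of" dropped)
      Mapper2_2 -> (hello,1) (world,1)                 ("tom1" dropped)
      shuffle + Reducer1 -> per-word totals across all seven lines
      Mapper3   -> keeps only words whose total is greater than 1

With the sample data above, only hello and world survive the filters, each with a count of 7, so the final output should be hello 7 and world 7, spread across the three reducer output files.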


1. Mapper1 (splits each line into words)

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper1: splits each input line into words and emits (word, 1).
 */
public class Mapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        System.out.println("map1 : " + value.toString());
        String line = value.toString();
        String[] arr = line.split(" ");
        for (String w : arr) {
            context.write(new Text(w), new IntWritable(1));
        }
    }
}
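
Each chained mapper is an ordinary Mapper, so it can be tested in isolation before the chain is wired up. Below is a minimal sketch of such a test, assuming MRUnit and JUnit are on the classpath (the test class and method names are made up for illustration):

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.mrunit.mapreduce.MapDriver;
import org.junit.Test;

/**
 * Hypothetical MRUnit test: one input line should produce one (word, 1) pair per word.
 */
public class Mapper1Test {
    @Test
    public void splitsLineIntoWords() throws Exception {
        MapDriver<LongWritable, Text, Text, IntWritable> driver = MapDriver.newMapDriver(new Mapper1());
        driver.withInput(new LongWritable(0), new Text("hello world of tom1"));
        driver.withOutput(new Text("hello"), new IntWritable(1));
        driver.withOutput(new Text("world"), new IntWritable(1));
        driver.withOutput(new Text("of"), new IntWritable(1));
        driver.withOutput(new Text("tom1"), new IntWritable(1));
        driver.runTest();
    }
}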


2. Mapper2 (filters out the word "of")

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper2: drops the word "of"; everything else passes through unchanged.
 */
public class Mapper2 extends Mapper<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
        System.out.println("map2 : " + key.toString());
        String w = key.toString();
        if (!w.equals("of")) {
            context.write(key, value);
        }
    }
}

3. Mapper2_2 (filters out words starting with "tom")

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper2_2: drops words that start with "tom"; everything else passes through unchanged.
 */
public class Mapper2_2 extends Mapper<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
        System.out.println("map2_2 : " + key.toString());
        String w = key.toString();
        if (!w.startsWith("tom")) {
            context.write(key, value);
        }
    }
}
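
Note that Mapper2 and Mapper2_2 both read and emit (Text, IntWritable). With ChainMapper, each stage's output key/value types must match the next stage's input types, which is why only the first stage (Mapper1) reads the raw (LongWritable, Text) records.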

4. Reducer1 (sums the count for each word)

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer1: sums the counts for each word (a standard word-count reducer).
 */
public class Reducer1 extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable w : values) {
            count = count + w.get();
        }
        context.write(key, new IntWritable(count));
        System.out.println("redu : " + key.toString() + " = " + count);
    }
}
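
The reducer can be checked the same way; a sketch, again assuming MRUnit and JUnit (names are illustrative):

package hadoop.mr.chain;

import java.util.Arrays;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

/**
 * Hypothetical MRUnit test: three 1s for "hello" should sum to 3.
 */
public class Reducer1Test {
    @Test
    public void sumsCounts() throws Exception {
        ReduceDriver<Text, IntWritable, Text, IntWritable> driver = ReduceDriver.newReduceDriver(new Reducer1());
        driver.withInput(new Text("hello"), Arrays.asList(new IntWritable(1), new IntWritable(1), new IntWritable(1)));
        driver.withOutput(new Text("hello"), new IntWritable(3));
        driver.runTest();
    }
}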


5. Mapper3 (the reducer's output is Mapper3's input; filters out words whose count is not greater than 1)

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper3: runs after the reducer; drops words whose count is not greater than 1.
 */
public class Mapper3 extends Mapper<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
        System.out.println("map3 : " + key.toString());
        int cnt = value.get();
        if (cnt > 1) {
            context.write(key, value);
        }
    }
}


6. Combiner1 (map-side pre-aggregation)

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Combiner1: pre-aggregates counts on the map side; same logic as Reducer1.
 */
public class Combiner1 extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable w : values) {
            count = count + w.get();
        }
        context.write(key, new IntWritable(count));
        System.out.println("Combiner1 : " + key.toString() + " = " + count);
    }
}
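
Combiner1 repeats Reducer1's logic; because summing is commutative and associative, it is safe to pre-aggregate partial counts on the map side. Since the two classes are identical, the driver could arguably skip Combiner1 and reuse the reducer directly, for example:

    job.setCombinerClass(Reducer1.class);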

7. MyPartitioner (partitions by word hash)
package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * MyPartitioner: assigns each word to a reduce partition by its hash code.
 */
public class MyPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        System.out.println("par : " + text.toString());
        return (text.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
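
For comparison, this is essentially what Hadoop's default HashPartitioner does, minus the debug print:

    return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;

Masking with Integer.MAX_VALUE clears the sign bit, so a negative hashCode can never produce a negative partition index.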

8. App (job driver)

package hadoop.mr.chain;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * App: job driver that wires the chained mappers and the reducer into a single job.
 */
public class App {
    public static void main(String[] args) throws Exception {
        args = new String[]{"d:/java/mr/data/1.txt", "d:/java/mr/out"};

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // delete the output directory if it already exists
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }

        Job job = Job.getInstance(conf);
        job.setJobName("chain");
        job.setJarByClass(App.class);

        // add Mapper1 (splits lines into words)
        ChainMapper.addMapper(job, Mapper1.class, LongWritable.class, Text.class, Text.class, IntWritable.class, conf);
        // add Mapper2 (filters out "of")
        ChainMapper.addMapper(job, Mapper2.class, Text.class, IntWritable.class, Text.class, IntWritable.class, conf);
        // add Mapper2_2 (filters out words starting with "tom")
        ChainMapper.addMapper(job, Mapper2_2.class, Text.class, IntWritable.class, Text.class, IntWritable.class, conf);

        // set the reducer
        ChainReducer.setReducer(job, Reducer1.class, Text.class, IntWritable.class, Text.class, IntWritable.class, conf);
        // add a mapper that runs after the reducer, inside the same reduce task
        ChainReducer.addMapper(job, Mapper3.class, Text.class, IntWritable.class, Text.class, IntWritable.class, conf);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // partitioner, combiner, and number of reduce tasks
        job.setPartitionerClass(MyPartitioner.class);
        job.setCombinerClass(Combiner1.class);
        job.setNumReduceTasks(3);

        job.waitForCompletion(true);
    }
}
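
Everything above runs as a single MapReduce job: Mapper1, Mapper2 and Mapper2_2 execute back to back inside each map task, and Reducer1 followed by Mapper3 execute inside each reduce task, so there is only one shuffle and no intermediate job output between the chained stages. With setNumReduceTasks(3), the output directory should contain three part files whose combined content is the filtered word counts.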