MapReduce --- chained (chain) operations


Sample data:

hello world of tom1
hello world of tom1
hello world of tom2
hello world of tom3
hello world of tom3
hello world of tom4
hello world of tom4

Overall approach (a trace of one sample record follows the step list):

1. Mapper1 (splits each line into words)

2. Mapper2 (filters out the word "of")

3. Mapper2_2 (filters out words starting with "tom")

4. Mapper3 (the reducer's output is Mapper3's input; filters out words whose count is not greater than 1)
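
As a quick sanity check, this is how the first sample line flows through the chain (worked out by hand from the classes below):

    hello world of tom1
      Mapper1   -> (hello,1) (world,1) (of,1) (tom1,1)
      Mapper2   -> (hello,1) (world,1) (tom1,1)        ("of" dropped)
      Mapper2_2 -> (hello,1) (world,1)                 ("tom1" dropped)
      shuffle + Reducer1 -> per-word totals across all seven lines
      Mapper3   -> keeps only words whose total is greater than 1

With the sample data above, only hello and world survive the filters, each with a count of 7, so the final output should be hello 7 and world 7, spread across the three reducer output files.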


1. Mapper1 (splits each line into words)

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper1: splits each input line into words and emits (word, 1).
 */
public class Mapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        System.out.println("map1 : " + value.toString());
        String line = value.toString();
        String[] arr = line.split(" ");
        for (String w : arr) {
            context.write(new Text(w), new IntWritable(1));
        }
    }
}
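
Each chained mapper is an ordinary Mapper, so it can be tested in isolation before the chain is wired up. Below is a minimal sketch of such a test, assuming MRUnit and JUnit are on the classpath (the test class and method names are made up for illustration):

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.mrunit.mapreduce.MapDriver;
import org.junit.Test;

/**
 * Hypothetical MRUnit test: one input line should produce one (word, 1) pair per word.
 */
public class Mapper1Test {
    @Test
    public void splitsLineIntoWords() throws Exception {
        MapDriver<LongWritable, Text, Text, IntWritable> driver = MapDriver.newMapDriver(new Mapper1());
        driver.withInput(new LongWritable(0), new Text("hello world of tom1"));
        driver.withOutput(new Text("hello"), new IntWritable(1));
        driver.withOutput(new Text("world"), new IntWritable(1));
        driver.withOutput(new Text("of"), new IntWritable(1));
        driver.withOutput(new Text("tom1"), new IntWritable(1));
        driver.runTest();
    }
}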


2. Mapper2 (filters out the word "of")

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper2: drops the word "of"; everything else passes through unchanged.
 */
public class Mapper2 extends Mapper<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
        System.out.println("map2 : " + key.toString());
        String w = key.toString();
        if (!w.equals("of")) {
            context.write(key, value);
        }
    }
}

3. Mapper2_2 (filters out words starting with "tom")

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper2_2: drops words that start with "tom"; everything else passes through unchanged.
 */
public class Mapper2_2 extends Mapper<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
        System.out.println("map2_2 : " + key.toString());
        String w = key.toString();
        if (!w.startsWith("tom")) {
            context.write(key, value);
        }
    }
}
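
Note that Mapper2 and Mapper2_2 both read and emit (Text, IntWritable). With ChainMapper, each stage's output key/value types must match the next stage's input types, which is why only the first stage (Mapper1) reads the raw (LongWritable, Text) records.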

4. Reducer1 (sums the count for each word)

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer1: sums the counts for each word (a standard word-count reducer).
 */
public class Reducer1 extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable w : values) {
            count = count + w.get();
        }
        context.write(key, new IntWritable(count));
        System.out.println("redu : " + key.toString() + " = " + count);
    }
}
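
The reducer can be checked the same way; a sketch, again assuming MRUnit and JUnit (names are illustrative):

package hadoop.mr.chain;

import java.util.Arrays;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

/**
 * Hypothetical MRUnit test: three 1s for "hello" should sum to 3.
 */
public class Reducer1Test {
    @Test
    public void sumsCounts() throws Exception {
        ReduceDriver<Text, IntWritable, Text, IntWritable> driver = ReduceDriver.newReduceDriver(new Reducer1());
        driver.withInput(new Text("hello"), Arrays.asList(new IntWritable(1), new IntWritable(1), new IntWritable(1)));
        driver.withOutput(new Text("hello"), new IntWritable(3));
        driver.runTest();
    }
}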


5. Mapper3 (the reducer's output is Mapper3's input; filters out words whose count is not greater than 1)

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper3: runs after the reducer; drops words whose count is not greater than 1.
 */
public class Mapper3 extends Mapper<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
        System.out.println("map3 : " + key.toString());
        int cnt = value.get();
        if (cnt > 1) {
            context.write(key, value);
        }
    }
}


6. Combiner1 (map-side pre-aggregation)

package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Combiner1: pre-aggregates counts on the map side; same logic as Reducer1.
 */
public class Combiner1 extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable w : values) {
            count = count + w.get();
        }
        context.write(key, new IntWritable(count));
        System.out.println("Combiner1 : " + key.toString() + " = " + count);
    }
}
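
Combiner1 repeats Reducer1's logic; because summing is commutative and associative, it is safe to pre-aggregate partial counts on the map side. Since the two classes are identical, the driver could arguably skip Combiner1 and reuse the reducer directly, for example:

    job.setCombinerClass(Reducer1.class);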

7. MyPartitioner (partitions by word hash)
package hadoop.mr.chain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * MyPartitioner: assigns each word to a reduce partition by its hash code.
 */
public class MyPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        System.out.println("par : " + text.toString());
        return (text.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
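
For comparison, this is essentially what Hadoop's default HashPartitioner does, minus the debug print:

    return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;

Masking with Integer.MAX_VALUE clears the sign bit, so a negative hashCode can never produce a negative partition index.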

8. App (job driver)

package hadoop.mr.chain;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * App: job driver that wires the chained mappers and the reducer into a single job.
 */
public class App {
    public static void main(String[] args) throws Exception {
        args = new String[]{"d:/java/mr/data/1.txt", "d:/java/mr/out"};

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // delete the output directory if it already exists
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }

        Job job = Job.getInstance(conf);
        job.setJobName("chain");
        job.setJarByClass(App.class);

        // add Mapper1 (splits lines into words)
        ChainMapper.addMapper(job, Mapper1.class, LongWritable.class, Text.class, Text.class, IntWritable.class, conf);
        // add Mapper2 (filters out "of")
        ChainMapper.addMapper(job, Mapper2.class, Text.class, IntWritable.class, Text.class, IntWritable.class, conf);
        // add Mapper2_2 (filters out words starting with "tom")
        ChainMapper.addMapper(job, Mapper2_2.class, Text.class, IntWritable.class, Text.class, IntWritable.class, conf);

        // set the reducer
        ChainReducer.setReducer(job, Reducer1.class, Text.class, IntWritable.class, Text.class, IntWritable.class, conf);
        // add a mapper that runs after the reducer, inside the same reduce task
        ChainReducer.addMapper(job, Mapper3.class, Text.class, IntWritable.class, Text.class, IntWritable.class, conf);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // partitioner, combiner, and number of reduce tasks
        job.setPartitionerClass(MyPartitioner.class);
        job.setCombinerClass(Combiner1.class);
        job.setNumReduceTasks(3);

        job.waitForCompletion(true);
    }
}
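
Everything above runs as a single MapReduce job: Mapper1, Mapper2 and Mapper2_2 execute back to back inside each map task, and Reducer1 followed by Mapper3 execute inside each reduce task, so there is only one shuffle and no intermediate job output between the chained stages. With setNumReduceTasks(3), the output directory should contain three part files whose combined content is the filtered word counts.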