MapReduce Data Skew, Solution 2: Custom Partitioner + Two-Pass Job


Data skew: a disproportionate share of the records is routed to one or a few reducers, so those reducers are overloaded while the remaining ones finish quickly and sit idle.
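The root cause is the default partitioning: Hadoop's HashPartitioner sends every occurrence of the same key to the same reducer. A minimal sketch of that logic (using String.hashCode for brevity; the real partitioner calls hashCode() on the Text key):

// Sketch of the default HashPartitioner's assignment: the partition is a
// pure function of the key, so every (hotKey, 1) pair from every mapper
// lands on the same reducer.
String hotKey = "the";   // an extremely frequent word
int numReduceTasks = 3;
int partition = (hotKey.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
// 'partition' never changes for a given key, which is exactly the
// skew described above.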

Data skew solution 2: a custom partitioner plus a second job.

The approach is illustrated below with word count as the example. Job 1 spreads the (word, 1) pairs across reducers uniformly at random, so every reducer gets an equal share of the load and produces partial counts per word; job 2 then hash-partitions those partial counts so that each word's partials meet in a single reducer and are summed into the final total.

1. DataLeanMapper1 — the mapper for job 1: a standard word-count mapper that emits (word, 1) for every token.

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * DataLeanMapper1: standard word-count mapping, emits (word, 1) per token.
 */
public class DataLeanMapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * Called once per input line.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(" ");
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable(1);
        for (String word : arr) {
            keyOut.set(word);
            context.write(keyOut, valueOut);
        }
    }
}
2. DataLeanMapper2 — the mapper for job 2: reads the (word, partialCount) pairs produced by job 1 and re-emits them for final aggregation.
package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * DataLeanMapper2: passes through the (word, partialCount) pairs read from
 * job 1's output, converting the count from Text to IntWritable.
 */
public class DataLeanMapper2 extends Mapper<Text, Text, Text, IntWritable> {

    /**
     * Called once per key-value pair parsed by KeyValueTextInputFormat.
     */
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(key, new IntWritable(Integer.parseInt(value.toString())));
    }
}
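Why does this mapper receive (Text, Text) pairs? Job 1 writes its results with the default TextOutputFormat as word<TAB>partialCount lines, and KeyValueTextInputFormat (configured in the driver below) splits each line at the first tab. A standalone sketch of that split, with a made-up sample line:

// Standalone sketch (not part of the job): how KeyValueTextInputFormat's
// default tab separator turns a line of job 1's output into a key/value pair.
String line = "hello\t7";            // as written by TextOutputFormat in job 1
int tab = line.indexOf('\t');
String k = line.substring(0, tab);   // "hello" -> map input key (Text)
String v = line.substring(tab + 1);  // "7"     -> map input value (Text)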
3. DataLeanReducer1 — shared by both jobs: in job 1 it sums raw 1s into partial counts, in job 2 it sums the partial counts into final totals.

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * DataLeanReducer1: sums the values of each key; used by both jobs.
 */
public class DataLeanReducer1 extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
4. RandomPartitioner — assigns each record to a random reduce partition.

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.Random;

/**
 * Random partitioner: ignores the key and picks a reduce partition at
 * random, so the load is spread evenly however skewed the keys are.
 */
public class RandomPartitioner extends Partitioner<Text, IntWritable> {

    private final Random r = new Random();

    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        return r.nextInt(numPartitions);
    }
}
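Because getPartition ignores the key entirely, even a single hot key is spread near-uniformly over all reducers. A hypothetical local check (not in the original code) that one might run to confirm this:

// Spread 9,000 copies of one hot key across 3 partitions and inspect
// the balance of the resulting histogram.
Text hot = new Text("the");
IntWritable one = new IntWritable(1);
RandomPartitioner p = new RandomPartitioner();
int[] hist = new int[3];
for (int i = 0; i < 9000; i++) {
    hist[p.getPartition(hot, one, 3)]++;
}
System.out.println(java.util.Arrays.toString(hist)); // e.g. [3012, 2970, 3018]

The flip side is that occurrences of the same word now land on different reducers, so each word can appear in up to three of job 1's output partitions with partial counts — which is exactly why the second job is needed.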
5. App — the driver that chains the two jobs.


package hadoop.lean.partitioner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

/**
 * Fixing data skew this way takes two chained jobs
 * plus the custom partitioner.
 */
public class App {

    public static void main(String[] args) throws Exception {
        args = new String[]{"d:/java/mr/data/1.txt", "d:/java/mr/out1", "d:/java/mr/out2"};

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Remove stale output directories so neither job fails on startup.
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        if (fs.exists(new Path(args[2]))) {
            fs.delete(new Path(args[2]), true);
        }

        // First stage (job 1): word count with random partitioning -> partial counts.
        Job job = Job.getInstance(conf);
        job.setJobName("WordCount-1");
        job.setJarByClass(App.class);
        job.setMapperClass(DataLeanMapper1.class);
        job.setReducerClass(DataLeanReducer1.class);

        // Set the input path.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Set the output path.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Scatter records randomly to balance the reducers.
        job.setPartitionerClass(RandomPartitioner.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(3);

        // Second stage (job 2): aggregate the partial counts into final totals.
        if (job.waitForCompletion(true)) {
            job = Job.getInstance(conf);
            job.setJobName("WordCount-2");
            job.setJarByClass(App.class);
            job.setMapperClass(DataLeanMapper2.class);
            job.setReducerClass(DataLeanReducer1.class);

            // The first job's output is the second job's input.
            FileInputFormat.addInputPath(job, new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            // Job 1 wrote key<TAB>value lines; parse them back into pairs.
            job.setInputFormatClass(KeyValueTextInputFormat.class);

            // Hash partitioning this time, so equal words meet in one reducer.
            job.setPartitionerClass(HashPartitioner.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setNumReduceTasks(3);

            job.waitForCompletion(true);
        }
    }
}
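One small optional refinement, an assumption on my part and not in the original driver: registering the reducer as a combiner on job 1 pre-aggregates the (word, 1) pairs inside each map task, so the random partitioner scatters far fewer records. DataLeanReducer1 qualifies because its input and output types are both (Text, IntWritable). Added to the first job's configuration:

// Optional tweak (not in the original): combine per-mapper before the
// shuffle of job 1 to cut the intermediate data volume.
job.setCombinerClass(DataLeanReducer1.class);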