MapReduce Data Skew, Solution 2: Custom Partitioner + Two-Pass Job


Data skew: a disproportionate share of the records is routed to one or a few reducers, so those reducers are overloaded while the remaining ones finish quickly and sit idle.
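The root cause is the default partitioning: Hadoop's HashPartitioner sends every occurrence of the same key to the same reducer. A minimal sketch of that logic (using String.hashCode for brevity; the real partitioner calls hashCode() on the Text key):

// Sketch of the default HashPartitioner's assignment: the partition is a
// pure function of the key, so every (hotKey, 1) pair from every mapper
// lands on the same reducer.
String hotKey = "the";   // an extremely frequent word
int numReduceTasks = 3;
int partition = (hotKey.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
// 'partition' never changes for a given key, which is exactly the
// skew described above.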

Data skew solution 2: a custom partitioner plus a second job.

The approach is illustrated below with word count as the example. Job 1 spreads the (word, 1) pairs across reducers uniformly at random, so every reducer gets an equal share of the load and produces partial counts per word; job 2 then hash-partitions those partial counts so that each word's partials meet in a single reducer and are summed into the final total.

1. DataLeanMapper1 — the mapper for job 1: a standard word-count mapper that emits (word, 1) for every token.

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * DataLeanMapper1: standard word-count mapping, emits (word, 1) per token.
 */
public class DataLeanMapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * Called once per input line.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(" ");
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable(1);
        for (String word : arr) {
            keyOut.set(word);
            context.write(keyOut, valueOut);
        }
    }
}
2. DataLeanMapper2 — the mapper for job 2: reads the (word, partialCount) pairs produced by job 1 and re-emits them for final aggregation.
package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * DataLeanMapper2: passes through the (word, partialCount) pairs read from
 * job 1's output, converting the count from Text to IntWritable.
 */
public class DataLeanMapper2 extends Mapper<Text, Text, Text, IntWritable> {

    /**
     * Called once per key-value pair parsed by KeyValueTextInputFormat.
     */
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(key, new IntWritable(Integer.parseInt(value.toString())));
    }
}
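Why does this mapper receive (Text, Text) pairs? Job 1 writes its results with the default TextOutputFormat as word<TAB>partialCount lines, and KeyValueTextInputFormat (configured in the driver below) splits each line at the first tab. A standalone sketch of that split, with a made-up sample line:

// Standalone sketch (not part of the job): how KeyValueTextInputFormat's
// default tab separator turns a line of job 1's output into a key/value pair.
String line = "hello\t7";            // as written by TextOutputFormat in job 1
int tab = line.indexOf('\t');
String k = line.substring(0, tab);   // "hello" -> map input key (Text)
String v = line.substring(tab + 1);  // "7"     -> map input value (Text)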
3. DataLeanReducer1 — shared by both jobs: in job 1 it sums raw 1s into partial counts, in job 2 it sums the partial counts into final totals.

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * DataLeanReducer1: sums the values of each key; used by both jobs.
 */
public class DataLeanReducer1 extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
4. RandomPartitioner — assigns each record to a random reduce partition.

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.Random;

/**
 * Random partitioner: ignores the key and picks a reduce partition at
 * random, so the load is spread evenly however skewed the keys are.
 */
public class RandomPartitioner extends Partitioner<Text, IntWritable> {

    private final Random r = new Random();

    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        return r.nextInt(numPartitions);
    }
}
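Because getPartition ignores the key entirely, even a single hot key is spread near-uniformly over all reducers. A hypothetical local check (not in the original code) that one might run to confirm this:

// Spread 9,000 copies of one hot key across 3 partitions and inspect
// the balance of the resulting histogram.
Text hot = new Text("the");
IntWritable one = new IntWritable(1);
RandomPartitioner p = new RandomPartitioner();
int[] hist = new int[3];
for (int i = 0; i < 9000; i++) {
    hist[p.getPartition(hot, one, 3)]++;
}
System.out.println(java.util.Arrays.toString(hist)); // e.g. [3012, 2970, 3018]

The flip side is that occurrences of the same word now land on different reducers, so each word can appear in up to three of job 1's output partitions with partial counts — which is exactly why the second job is needed.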
5. App — the driver that chains the two jobs.


package hadoop.lean.partitioner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

/**
 * Fixing data skew this way takes two chained jobs
 * plus the custom partitioner.
 */
public class App {

    public static void main(String[] args) throws Exception {
        args = new String[]{"d:/java/mr/data/1.txt", "d:/java/mr/out1", "d:/java/mr/out2"};

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Remove stale output directories so neither job fails on startup.
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        if (fs.exists(new Path(args[2]))) {
            fs.delete(new Path(args[2]), true);
        }

        // First stage (job 1): word count with random partitioning -> partial counts.
        Job job = Job.getInstance(conf);
        job.setJobName("WordCount-1");
        job.setJarByClass(App.class);
        job.setMapperClass(DataLeanMapper1.class);
        job.setReducerClass(DataLeanReducer1.class);

        // Set the input path.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Set the output path.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Scatter records randomly to balance the reducers.
        job.setPartitionerClass(RandomPartitioner.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(3);

        // Second stage (job 2): aggregate the partial counts into final totals.
        if (job.waitForCompletion(true)) {
            job = Job.getInstance(conf);
            job.setJobName("WordCount-2");
            job.setJarByClass(App.class);
            job.setMapperClass(DataLeanMapper2.class);
            job.setReducerClass(DataLeanReducer1.class);

            // The first job's output is the second job's input.
            FileInputFormat.addInputPath(job, new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            // Job 1 wrote key<TAB>value lines; parse them back into pairs.
            job.setInputFormatClass(KeyValueTextInputFormat.class);

            // Hash partitioning this time, so equal words meet in one reducer.
            job.setPartitionerClass(HashPartitioner.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setNumReduceTasks(3);

            job.waitForCompletion(true);
        }
    }
}
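One small optional refinement, an assumption on my part and not in the original driver: registering the reducer as a combiner on job 1 pre-aggregates the (word, 1) pairs inside each map task, so the random partitioner scatters far fewer records. DataLeanReducer1 qualifies because its input and output types are both (Text, IntWritable). Added to the first job's configuration:

// Optional tweak (not in the original): combine per-mapper before the
// shuffle of job 1 to cut the intermediate data volume.
job.setCombinerClass(DataLeanReducer1.class);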