MapReduce Data Skew, Solution 1: Redesign the Key (Two-Pass Job)


Data skew: a large volume of records pours into one or a few reducers, leaving the remaining reducers mostly idle while the overloaded ones become the bottleneck.
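Why this happens: during the shuffle, every record with a given key is routed to exactly one reduce task. Below is a minimal sketch of that routing, using the same formula as Hadoop's default org.apache.hadoop.mapreduce.lib.partition.HashPartitioner (the SkewDemo class name is just for illustration):

import org.apache.hadoop.io.Text;

/**
 * Sketch: how the default partitioner picks a reducer for a key.
 * Every occurrence of the same key lands on the same partition,
 * so a hot key overloads one reducer while the others finish early.
 */
public class SkewDemo {

    static int partitionFor(Text key, int numReduceTasks) {
        // Same formula as Hadoop's default HashPartitioner
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }

    public static void main(String[] args) {
        // With 3 reducers, every "hello" record maps to the same partition
        System.out.println(partitionFor(new Text("hello"), 3));
    }
}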

Solution 1 for data skew: redesign the key and run a second job.
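In miniature, the fix is a salt/unsalt round trip: job 1 salts each word with a random suffix so a hot word scatters across all reducers, and job 2 strips the suffix and re-aggregates the partial counts. A standalone sketch of that round trip (plain strings, no Hadoop; the class name is illustrative):

public class SaltRoundTrip {

    public static void main(String[] args) {
        java.util.Random r = new java.util.Random();
        String word = "hello";

        // Job 1 side: append a random suffix so identical words scatter
        String salted = word + "_" + r.nextInt(100);      // e.g. "hello_37"

        // Job 2 side: cut at the last '_' to restore the original word
        String restored = salted.substring(0, salted.lastIndexOf('_'));

        System.out.println(salted + " -> " + restored);   // hello_37 -> hello
    }
}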

The approach is illustrated below using word count as an example:

1. DataLeanMapper1: redesign the key by appending a random-number suffix

package hadoop.lean.key;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.Random;

/**
 * DataLeanMapper1: the word-count mapper, with the key redesigned to
 * carry a random suffix so that hot words scatter across all reducers.
 */
public class DataLeanMapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {

    private Random r = new Random();

    /**
     * Called once per input line.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(" ");
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable(1);
        for (String word : arr) {
            // Redesign the key: append a random suffix in [0, 100)
            keyOut.set(word + "_" + r.nextInt(100));
            context.write(keyOut, valueOut);
        }
    }
}
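With a suffix range of 100, a hot word such as "hello" becomes up to 100 distinct keys (hello_0 through hello_99), which the partitioner spreads roughly evenly over the reducers. Job 1's output therefore holds partial counts per salted key: lines such as "hello_12" followed by a tab and a partial count (the numbers are hypothetical, since the suffix is random).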


2. DataLeanMapper2: split the salted key and strip the _ suffix

package hadoop.lean.key;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * DataLeanMapper2: strips the random "_NN" suffix added by job 1,
 * restoring the original word as the key.
 */
public class DataLeanMapper2 extends Mapper<Text, Text, Text, IntWritable> {

    /**
     * Called once per key-value line of job 1's output
     * (key = "word_NN", value = partial count).
     */
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        String word = key.toString();
        // Drop everything from the last '_' onward (the random suffix)
        int index = word.lastIndexOf("_");
        word = word.substring(0, index);
        // The value is the partial count emitted by job 1's reducer
        int count = Integer.parseInt(value.toString());
        context.write(new Text(word), new IntWritable(count));
    }
}
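This mapper receives Text/Text pairs because the driver sets KeyValueTextInputFormat for job 2: each line of job 1's output is split at the first tab, which matches TextOutputFormat's default key/value separator. If job 1 used a custom separator, the reader would need the matching setting (mapreduce.input.keyvaluelinerecordreader.key.value.separator).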

3. DataLeanReducer1: the counting reducer, shared by both jobs
package hadoop.lean.key;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * DataLeanReducer1: sums the counts for each key. Used by both jobs:
 * job 1 aggregates the salted keys, job 2 the restored words.
 */
public class DataLeanReducer1 extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count += iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
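Because the aggregation is a plain sum (associative and commutative), the same class could also be registered as a combiner in job 1 via job.setCombinerClass(DataLeanReducer1.class) to shrink shuffle traffic further; treat this as an optional tweak, not part of the original recipe.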
4. App: the driver; this data-skew fix requires two chained jobs

package hadoop.lean.key;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver: the data-skew fix requires two chained jobs.
 */
public class App {

    public static void main(String[] args) throws Exception {
        // Hard-coded local paths for testing; overrides command-line args
        args = new String[]{"d:/java/mr/data/1.txt", "d:/java/mr/out1", "d:/java/mr/out2"};

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Remove stale output directories from previous runs
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        // (fix) also remove job 2's output dir, or reruns fail on job 2
        if (fs.exists(new Path(args[2]))) {
            fs.delete(new Path(args[2]), true);
        }

        // Job 1: count the salted keys (word_NN)
        Job job = Job.getInstance(conf);
        job.setJobName("WordCount-1");
        job.setJarByClass(App.class);
        job.setMapperClass(DataLeanMapper1.class);
        job.setReducerClass(DataLeanReducer1.class);

        // Input and output paths
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Map and reduce output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(3);

        // Second stage (job 2) runs only if job 1 succeeds
        if (job.waitForCompletion(true)) {
            job = Job.getInstance(conf);
            job.setJobName("WordCount-2");
            job.setJarByClass(App.class);
            job.setMapperClass(DataLeanMapper2.class);
            job.setReducerClass(DataLeanReducer1.class);

            // Job 1's output is job 2's input
            FileInputFormat.addInputPath(job, new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            // Job 1 wrote key/value text, so read it back as key-value pairs
            job.setInputFormatClass(KeyValueTextInputFormat.class);

            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setNumReduceTasks(3);
            job.waitForCompletion(true);
        }
    }
}
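Two notes when adapting this driver: the hard-coded args line pins the run to local Windows paths, so on a cluster it should be removed and the three paths passed on the command line; and job 2 reuses DataLeanReducer1 unchanged, because once the suffix is stripped the second pass is an ordinary word-count aggregation.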


