Solving data skew problems that arise in real-world MapReduce applications


1.txt:

hello tom1
hello tom2
hello tom3
hello tom4
hello tom5
hello tom6
hello tom7
hello tom8
hello tom9
hello tom10

2.txt:

hello tom11
hello tom12
hello tom13
hello tom14
hello tom15
hello tom16
hello tom17
hello tom18
hello tom19
hello tom20

3.txt:

hello tom21
hello tom22
hello tom23
hello tom24
hello tom25
hello tom26
hello tom27
hello tom28
hello tom29
hello tom30

First, let's write a MapReduce job that produces data skew, as follows:

 

Mapper:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] arr = value.toString().split(" ");
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();
        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }
    }
}


Reducer:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}

App:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Created on 2017/3/16.
 */
public class WCSkewApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        Job job = Job.getInstance(conf);

        // set the job's properties
        job.setJobName("WCSkewApp");
        job.setJarByClass(WCSkewApp.class);
        job.setInputFormatClass(TextInputFormat.class);

        // input and output paths
        FileInputFormat.addInputPath(job, new Path("g:/comp/skew"));
        FileOutputFormat.setOutputPath(job, new Path("g:/comp/out"));

        // combiner class
        job.setCombinerClass(WCSkewReducer.class);

        // mapper and reducer classes
        job.setMapperClass(WCSkewMapper.class);
        job.setReducerClass(WCSkewReducer.class);

        // number of reduce tasks
        job.setNumReduceTasks(4);

        // key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    }
}

Running the above code produces output files under g:/comp/out/.

All 30 "hello" records end up on a single reduce task; this is the data skew. The fix is shown below.
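Why does every "hello" land on the same reducer? When no partitioner is configured, Hadoop uses the built-in org.apache.hadoop.mapreduce.lib.partition.HashPartitioner, which derives the partition purely from the key's hash, so identical keys always go to the same reduce task. Below is a simplified, non-generic sketch of that built-in logic, shown only for illustration (the class name is mine; nothing here needs to be written by hand):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Illustration of the default HashPartitioner behavior: identical keys
 * hash to the same value, so all 30 "hello" records are assigned to the
 * same partition and processed by a single reduce task.
 */
public class DefaultHashPartitionSketch extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // same key -> same hashCode -> same partition
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}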


Solution: random partitioning

Mapper (same as in the first job):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] arr = value.toString().split(" ");
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();
        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }
    }
}

Reducer (same as in the first job):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}


RandomPartitioner:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.Random;

/**
 * Created on 2017/3/18.
 */
public class RandomPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        // ignore the key entirely and pick a reduce task at random
        return new Random().nextInt(numPartitions);
    }
}
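One small caveat about the partitioner above: it allocates a new Random object for every record. A variant that creates the Random once and reuses it does the same job with less garbage; this is only an optional refinement, and the class name below is my own (the RandomPartitioner above works as written):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.Random;

/**
 * Same random-partitioning idea, but the Random instance is created once
 * per partitioner object and reused for every record.
 */
public class ReusableRandomPartitioner extends Partitioner<Text, IntWritable> {
    private final Random random = new Random();

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        return random.nextInt(numPartitions);
    }
}

Either class can be plugged in through job.setPartitionerClass(...) in the App below.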


App:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Created on 2017/3/16.
 */
public class WCSkewApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        Job job = Job.getInstance(conf);

        // set the job's properties
        job.setJobName("WCSkewApp");
        job.setJarByClass(WCSkewApp.class);
        job.setInputFormatClass(TextInputFormat.class);

        // input and output paths
        FileInputFormat.addInputPath(job, new Path("g:/comp/skew"));
        FileOutputFormat.setOutputPath(job, new Path("g:/comp/out"));

        // partitioner class
        job.setPartitionerClass(RandomPartitioner.class);
        // combiner class
        job.setCombinerClass(WCSkewReducer.class);

        // mapper and reducer classes
        job.setMapperClass(WCSkewMapper.class);
        job.setReducerClass(WCSkewReducer.class);

        // number of reduce tasks
        job.setNumReduceTasks(4);

        // key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    }
}


Random partitioning spreads the "hello" records across different reduce tasks for counting. But we are not done yet: the output at this point is still not what we want, because the partial counts for "hello" are scattered across several part- files, while the result we actually need is the total number of occurrences of each word. So we run one more MapReduce job to merge the partial counts, as follows:


Mapper:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created on 2017/3/18.
 */
public class WCSkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // each input line has the form "word<TAB>partialCount"
        String[] arr = value.toString().split("\t");
        context.write(new Text(arr[0]), new IntWritable(Integer.parseInt(arr[1])));
    }
}

Reducer:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}


App:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Second pass: merge the partial counts to solve the data skew problem.
 */
public class WCSkewApp {
    public static void main(String[] args) throws Exception {
        // load the configuration
        Configuration conf = new Configuration();
        // use the local file system
        conf.set("fs.defaultFS", "file:///");
        // create the job object
        Job job = Job.getInstance(conf);

        // set the job's properties
        job.setJobName("WCSkewApp");
        job.setJarByClass(WCSkewApp.class);

        // input format
        job.setInputFormatClass(TextInputFormat.class);

        // mapper and reducer classes
        job.setMapperClass(WCSkewMapper.class);
        job.setReducerClass(WCSkewReducer.class);

        // map/reduce output key and value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // input and output paths
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00000"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00001"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00002"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00003"));
        FileOutputFormat.setOutputPath(job, new Path("g:/comp/out8"));

        // number of reduce tasks
        job.setNumReduceTasks(4);

        job.waitForCompletion(true);
    }
}
After running this job, the count for "hello" is its grand total, which is the result we wanted; at this point the data skew problem is solved.
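As a sanity check against the sample input above (30 "hello" lines plus tom1 through tom30, each appearing once), concatenating the four part-r-* files under g:/comp/out8 should give exactly one total per word, along the lines of:

hello	30
tom1	1
tom2	1
...
tom30	1

The words are spread over the four output files by the second job's partitioning; the listing above is just the merged, sorted view.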






In the second job of the data-skew solution above, the input format in the App can also be set to KeyValueTextInputFormat, whose generic key/value types are <Text, Text> (it was TextInputFormat before). Note that the mapper's input key and input value are then both of type Text.
The code is as follows:
Mapper:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created on 2017/3/18.
 */
public class WCSkewMapper extends Mapper<Text, Text, Text, IntWritable> {
    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        // the key is the word, the value is its partial count as text
        context.write(key, new IntWritable(Integer.parseInt(value.toString())));
    }
}

Reducer:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}

App:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Second pass with KeyValueTextInputFormat: merge the partial counts to solve the data skew problem.
 */
public class WCSkewApp {
    public static void main(String[] args) throws Exception {
        // load the configuration
        Configuration conf = new Configuration();
        // use the local file system
        conf.set("fs.defaultFS", "file:///");
        // create the job object
        Job job = Job.getInstance(conf);

        // set the job's properties
        job.setJobName("WCSkewApp");
        job.setJarByClass(WCSkewApp.class);

        // input format
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        // mapper and reducer classes
        job.setMapperClass(WCSkewMapper.class);
        job.setReducerClass(WCSkewReducer.class);

        // map/reduce output key and value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // input and output paths
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00000"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00001"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00002"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00003"));
        FileOutputFormat.setOutputPath(job, new Path("g:/comp/out8"));

        // number of reduce tasks
        job.setNumReduceTasks(4);

        job.waitForCompletion(true);
    }
}
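KeyValueTextInputFormat splits every input line at the first separator character into a Text key and a Text value. The default separator is a tab, which matches what TextOutputFormat wrote in the first job ("word<TAB>count"), so no extra configuration is needed here. If the separator ever differs, it can be set on the Configuration before the Job is created. A minimal sketch of that single change at the top of the App's main (to my knowledge this is the standard Hadoop 2.x property name; "\t" is already the default and is shown only for completeness):

Configuration conf = new Configuration();
conf.set("fs.defaultFS", "file:///");
// KeyValueTextInputFormat splits each line at the first occurrence of this separator
conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");
Job job = Job.getInstance(conf);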


