MapReduce series (5): map-side join implementation


1. Overview

In part (4) we got a working join quickly, but in real production there is a serious problem: with large data volumes, if the final partitioning is done with something like the default hash partitioner, the data can become skewed, leaving some reduce tasks overloaded while others have almost nothing to do. Task allocation in the map phase, by contrast, is fairly even, so ideally all of the work would be done in the map phase. That leads us to caching: the smaller table could be cached in something like Redis, but Redis still involves a network round trip, and a purely local cache is better. Hadoop provides exactly this with its distributed cache (DistributedCache), which solves our caching problem.
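The mechanism boils down to two pieces, both visible in the full listing below: the driver registers the small file with Job.addCacheFile(), and each mapper reads that file by its base name from the task's local working directory in setup(). A minimal sketch of just those two pieces (the path, the job variable and the pdInfoMap field are taken from the example that follows):

// Driver side: ship pd.txt to every map task's local working directory.
job.addCacheFile(new URI("hdfs://mini01:9000/input/rjoin/pd.txt"));

// Mapper side: the cached file is available by its base name, so it can be
// read like an ordinary local file and loaded into an in-memory HashMap.
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream("pd.txt")))) {
        String line;
        while ((line = br.readLine()) != null) {
            String[] fields = line.split(",");
            pdInfoMap.put(fields[0], fields[1]); // product id -> product name
        }
    }
}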

2. Code implementation

MapSideJoin.java

package mapjoin;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

/**
 * Created by tianjun on 2017/3/19.
 */
public class MapSideJoin {

    static class MapSideJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        Map<String, String> pdInfoMap = new HashMap<>();
        Text k = new Text();

        // Load the cached product file (pd.txt) from the task's local working
        // directory into an in-memory map before any map() calls run.
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream("pd.txt")));
            String line = null;
            while (StringUtils.isNotEmpty(line = br.readLine())) {
                String[] fields = line.split(",");
                pdInfoMap.put(fields[0], fields[1]);
            }
            br.close();
        }

        // Join each order line with its product name via an in-memory lookup;
        // the joined record is emitted directly, so no reduce phase is needed.
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String orderline = value.toString();
            String[] fields = orderline.split(",");
            String pdName = pdInfoMap.get(fields[2]);
            k.set(orderline + ',' + pdName);
            context.write(k, NullWritable.get());
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        String os = System.getProperty("os.name").toLowerCase();
        if (os.contains("windows")) {
            System.setProperty("HADOOP_USER_NAME", "root");
        }

        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "mini01");
        conf.set("fs.defaultFS", "hdfs://mini01:9000/");
        // Local mode is the default:
//        conf.set("mapreduce.framework.name", "local");
//        conf.set("mapreduce.jobtracker.address", "local");
//        conf.set("fs.defaultFS", "file:///");

        Job wcjob = Job.getInstance(conf);
        wcjob.setJar("F:/myWorkPlace/java/dubbo/demo/dubbo-demo/mr-demo1/target/mr.demo-1.0-SNAPSHOT.jar");
        // Submitting from a local client with setJarByClass does not work here; setJar must be used instead.
//        wcjob.setJarByClass(Rjoin.class);

        wcjob.setMapperClass(MapSideJoinMapper.class);
//        wcjob.setReducerClass(RjoinReducer.class);

        // Output key/value types of our Mapper class
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(NullWritable.class);

        // No reduce phase is needed
        wcjob.setNumReduceTasks(0);

        // Output key/value types of our Reducer class (unused here)
//        wcjob.setOutputKeyClass(InfoBean1.class);
//        wcjob.setOutputValueClass(NullWritable.class);

        // If no InputFormat is set, TextInputFormat.class is used by default
//        wcjob.setInputFormatClass(CombineFileInputFormat.class);
//        CombineFileInputFormat.setMaxInputSplitSize(wcjob, 4194304);
//        CombineFileInputFormat.setMinInputSplitSize(wcjob, 2097152);

        // Delete the output directory if it already exists
        FileSystem fs = FileSystem.get(new URI("hdfs://mini01:9000"), new Configuration(), "root");
        Path path = new Path("hdfs://mini01:9000/wc/mapjoin");
        if (fs.exists(path)) {
            fs.delete(path, true);
        }

        // Where the input data lives
        FileInputFormat.setInputPaths(wcjob, new Path("hdfs://mini01:9000/input/rjoin/order.txt"));
        // Where the results should be written
        FileOutputFormat.setOutputPath(wcjob, new Path("hdfs://mini01:9000/wc/mapjoin"));

        // Distribute a file to the working directory of every map task node:
        /*wcjob.addArchiveToClassPath(archive);*/ // cache a jar onto the classpath of the task nodes
        /*wcjob.addCacheArchive(uri);*/           // cache an archive into the working directory of the task nodes
        /*wcjob.addCacheFile(uri);*/              // cache an ordinary file into the working directory of the task nodes
        wcjob.addCacheFile(new URI("hdfs://mini01:9000/input/rjoin/pd.txt"));

        boolean res = wcjob.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
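The input files are not shown in the post, but from the comma splitting in the code and the joined records printed below, they presumably look something like this (contents reconstructed for illustration, not taken from the original):

pd.txt (product id, product name):
P0001,xiaomi5
P0002,chuiziT1
P0003,meizu

order.txt (order id, date, product id, quantity):
1001,20150710,P0001,2
1002,20150710,P0001,3
1002,20150710,P0002,3
1001,20150710,P0001,2
1002,20150710,P0003,3
1003,20150710,P0002,3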

3. Results

[root@mini03 ~]# hdfs dfs -cat /wc/mapjoin/*
1001,20150710,P0001,2,xiaomi5
1002,20150710,P0001,3,xiaomi5
1002,20150710,P0002,3,chuiziT1
1001,20150710,P0001,2,xiaomi5
1002,20150710,P0003,3,meizu
1003,20150710,P0002,3,chuiziT1