MapReduce Join Operations


MapReduce jobs routinely have to process two or more data sources, which means joining them on some key. The usual rules for picking a join strategy are:
1) If the two data sources are roughly the same size, use the DataJoin classes to perform the join directly.
2) If the two data sources differ greatly in size, use the Distributed Cache mechanism to replicate the smaller source to every node, cache it there, and use it to filter the larger source. If even the smaller source is too big to cache, first distribute only its keys to every node via the Distributed Cache, filter the larger source against those keys, and then join the filtered data with the smaller source (a sketch of the basic cached-join pattern follows below).
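To make rule 2 concrete, here is a minimal sketch of a map-side join against a Distributed Cache file, written with the plain Hadoop 2.x API. The class name, argument order, tab-separated "key\tvalue" layout, and reading the cached file through the symlink in the task's working directory are all illustrative assumptions, not a definitive implementation:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/** Map-side join: the small source is cached on every node, the large source streams through the mapper. */
public class MapSideJoin {

    public static class JoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        private final Map<String, String> smallTable = new HashMap<String, String>();
        private final Text outputKey = new Text();

        @Override
        protected void setup(Context context) throws IOException {
            // Load the cached copy of the small source ("key \t value" per line) into memory.
            URI[] cacheFiles = context.getCacheFiles();
            if (cacheFiles == null) {
                return;
            }
            for (URI uri : cacheFiles) {
                // Cached files are symlinked into the task's working directory under their base name.
                BufferedReader reader = new BufferedReader(new FileReader(new Path(uri.getPath()).getName()));
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] terms = line.split("\t", 2);
                    if (terms.length == 2) {
                        smallTable.put(terms[0], terms[1]);
                    }
                }
                reader.close();
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] terms = value.toString().split("\t", 2);
            if (terms.length < 2) {
                return;
            }
            String joined = smallTable.get(terms[0]);
            if (joined != null) {
                // Only records of the large source whose key also exists in the small source survive.
                outputKey.set(terms[0] + "\t" + terms[1] + "\t" + joined);
                context.write(outputKey, NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "map-side join");
        job.setJarByClass(MapSideJoin.class);
        job.setMapperClass(JoinMapper.class);
        job.setNumReduceTasks(0);                              // map-only job, no shuffle at all
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.addCacheFile(new URI(args[0]));                    // small source (an HDFS file)
        FileInputFormat.addInputPath(job, new Path(args[1]));  // large source
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

This only works as long as the whole small source fits in each map task's heap, which is exactly the limit the rest of this post runs into.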

The problem I ran into this week was the second case: the "smaller" data source still would not fit into the cache (the job threw an OOM error).
The solution is to use two jobs:
Job 1 cuts the smaller data source down to a file containing only its keys.
Job 2 uses two mappers: the first uses the Distributed Cache to replicate the key-only file from Job 1 to every node, filters the large data source against it, and emits records keyed by each record's unique ID; the second mapper loads the smaller data source and emits its records keyed by the same unique ID. The Reducer then receives the records from both sources that share an ID and does whatever processing it needs; a stripped-down sketch of this tagging pattern is shown below.
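Here is a framework-free sketch of the tagging idea behind the second job. The class name TaggedJoin, the "A"/"B" tags, and the tab-separated layout are illustrative assumptions; the cache-based filtering of the large source is left out since the earlier sketch already shows how cached keys are loaded:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/** Reduce-side join: each mapper tags its records with the source they came from,
 *  so the reducer sees both sides of the same ID in one reduce() call. */
public class TaggedJoin {

    /** Mapper over the (pre-filtered) large source: emits (ID, "B" + payload). */
    public static class LargeSourceMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] terms = value.toString().split("\t", 2);
            if (terms.length == 2) {
                context.write(new Text(terms[0]), new Text("B\t" + terms[1]));
            }
        }
    }

    /** Mapper over the small source: emits (ID, "A" + payload). */
    public static class SmallSourceMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] terms = value.toString().split("\t", 2);
            if (terms.length == 2) {
                context.write(new Text(terms[0]), new Text("A\t" + terms[1]));
            }
        }
    }

    /** Reducer: collects the "A" and "B" payloads of one ID and combines them. */
    public static class JoinReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String sideA = null;
            String sideB = null;
            for (Text value : values) {
                String[] tagged = value.toString().split("\t", 2);
                if ("A".equals(tagged[0])) {
                    sideA = tagged[1];
                } else if ("B".equals(tagged[0])) {
                    sideB = tagged[1];
                }
            }
            if (sideA != null && sideB != null) {   // inner join: both sides must be present
                context.write(key, new Text(sideA + "\t" + sideB));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "reduce-side join");
        job.setJarByClass(TaggedJoin.class);
        // Each source gets its own mapper; both emit records keyed by the shared ID.
        MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, LargeSourceMapper.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, SmallSourceMapper.class);
        job.setReducerClass(JoinReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}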

Enough talk; here is the full, commented implementation:

/**
 * User basic-attribute data reverse-verification job.
 * Created by luweijie@xiaomi.com on 15-7-20.
 */
public class BasicDataVerifyJob {
    private static Logger logger = LoggerFactory.getLogger(BasicDataVerifyJob.class);
    private static final String SEPARATOR = "\t";

    /**
     * Job entry point.
     * @param args
     */
    public static void main (String args[]) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        new GenericOptionsParser(conf, args).getRemainingArgs();
        if (StringUtils.isBlank(conf.get("path")) || StringUtils.isBlank(conf.get("date")) ||
                StringUtils.isBlank(conf.get("output"))) {
            System.err.println("jvm args: -Dpath -Ddate -Doutput must be specified!");
            System.exit(2);
        }
        FileSystem fs = FileSystem.get(conf);
        String path = conf.get("path");
        if (!fs.exists(new Path(path))) {
            logger.error("input path not exists : " + path);
            System.exit(-1);
        }

        // Output directory: normalize the trailing slash and clear any previous run.
        String output = conf.get("output");
        Path outputPath = new Path(output);
        if (!output.endsWith("/")) {
            output += "/";
        }
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        // Job 1: generate the mappingID (key-only) file from the smaller data source.
        String mappingIDFileOutput = output + "mappingIDFile";
        MicloudMRJob job1 = MicloudMRJob.getInstance(conf, "Generate mappingID file: " + path + " --> " + mappingIDFileOutput);
        job1.setJarByClass(BasicDataVerifyJob.class);
        job1.setMapperClass(MappingIDGeneratorMapper.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(NullWritable.class);
        MultipleInputs.addInputPath(job1, new Path(path), TextInputFormat.class, MappingIDGeneratorMapper.class);
        job1.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job1, new Path(mappingIDFileOutput));

        // Job 2: once the mappingID file has been generated, filter the original data against it and verify.
        if (job1.waitForCompletion(true)) {
            PathManager pathManager = PathManager.create(conf.get("date"));
            MicloudHDFSInputConfig inputConfig = new MicloudHDFSInputConfig("UserPublicInformation",
                    FilterMapper.class, BasicDataVerifyJob.class,
                    pathManager, Text.class, DataField.class);
            String verifyFile = output + "verify";
            OutputConfig outputConfig = new OutputConfig();
            outputConfig.setOutputPath(verifyFile);
            outputConfig.setOutputFormatClass(TextOutputFormat.class);
            outputConfig.setOutKeyClass(Text.class);
            outputConfig.setOutValueClass(IntWritable.class);
            outputConfig.setReducerClass(VerifyReducer.class);
            MicloudMRJob job2 = MicloudDataContext.getInstance().createJob(conf, inputConfig, outputConfig,
                    "Verify Basic Data Job: " + pathManager.getOutputPath() + " --> " + verifyFile);
            MultipleInputs.addInputPath(job2, new Path(path), TextInputFormat.class, VerifyHDFSMapper.class);

            // Put the smaller data source (the key-only part files from Job 1) into the Distributed Cache.
            FileStatus[] fileStatuses = fs.listStatus(new Path(mappingIDFileOutput));
            URI[] uris = new URI[fileStatuses.length];
            for (int i = 0; i < fileStatuses.length; i++) {
                uris[i] = fileStatuses[i].getPath().toUri();
            }
            job2.setCacheFiles(uris);

            // Number of reducers.
            job2.setNumReduceTasks(50);

            // Job 3: aggregate the verification results.
            if (job2.waitForCompletion(true)) {
                String resultFile = output + "result";
                MicloudMRJob job3 = MicloudMRJob.getInstance(conf, "Statistics Verify Data Job: " + verifyFile + " --> " + resultFile);
                job3.setJarByClass(BasicDataVerifyJob.class);
                job3.setMapperClass(StatisticsMapper.class);
                job3.setMapOutputKeyClass(Text.class);
                job3.setMapOutputValueClass(IntWritable.class);
                MultipleInputs.addInputPath(job3, new Path(verifyFile), TextInputFormat.class, StatisticsMapper.class);
                job3.setOutputFormatClass(TextOutputFormat.class);
                FileOutputFormat.setOutputPath(job3, new Path(resultFile));
                job3.setCombinerClass(StatisticsCombiner.class);
                job3.setReducerClass(StatisticsReducer.class);
                job3.setNumReduceTasks(3);
                job3.waitForCompletion(true);
            }
        }
    }

    /**
     * Mapper that produces the mappingID (key-only) file.
     */
    public static class MappingIDGeneratorMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        private Text outputKey = new Text();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = new String(value.copyBytes());
            String terms[] = line.split(SEPARATOR);
            if (terms == null || terms.length < 2) {
                return;
            }
            outputKey.set(terms[0]);
            context.write(outputKey, NullWritable.get());
        }
    }

    /**
     * Character encoding used to (de)serialize StructuredData through a string value.
     */
    private static final String CHARSET = "ISO-8859-1";

    /**
     * Filter Mapper: keeps only records whose MappingID appears in the cached key set.
     */
    public static class FilterMapper extends MicloudHDFSMapper<Text, DataField> {
        private static Set<String> joinData = new HashSet<String>();
        private Text outputKey = new Text();

        @Override
        protected void setup (Context context) {
            // Load the distributed cache files into the in-memory joinData set on each map node.
            try {
                String line = null;
                Path[] cacheFiles = context.getLocalCacheFiles();
                if (cacheFiles != null && cacheFiles.length > 0) {
                    for (int i = 0; i < cacheFiles.length; i++) {
                        BufferedReader joinReader = new BufferedReader(new FileReader(cacheFiles[i].toString()));
                        while ((line = joinReader.readLine()) != null) {
                            joinData.add(line);
                        }
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(0);
            }
        }

        @Override
        public void persistMapOutput(Mapper.Context context, StructuredData data) throws IOException, InterruptedException {
            String mappingID = String.valueOf(data.getData("MappingID"));
            if (joinData.contains(mappingID)) {
                outputKey.set(mappingID);
                // Serialize the structured record into a string so it can travel as a DataField value, tagged "A".
                ByteArrayOutputStream bos = new ByteArrayOutputStream();
                ObjectOutputStream out = new ObjectOutputStream(bos);
                out.writeObject(data);
                DataField outputValue = new DataField("A", bos.toString(CHARSET));
                context.write(outputKey, outputValue);
                out.close();
            }
        }
    }

    /**
     * Mapper for the plain HDFS input file (the other data source), tagged "B".
     */
    public static class VerifyHDFSMapper extends Mapper<LongWritable, Text, Text, DataField> {
        private Text outputKey = new Text();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = new String(value.copyBytes());
            String terms[] = line.split(SEPARATOR);
            if (terms == null || terms.length < 2) {
                return;
            }
            outputKey.set(terms[0]);
            DataField outputValue = new DataField("B", terms[1]);
            context.write(outputKey, outputValue);
        }
    }

    /**
     * Verify Reducer: receives the "A" and "B" records that share the same ID and compares them.
     */
    public static class VerifyReducer extends Reducer<Text, DataField, Text, IntWritable> {
        private Text outputKey = new Text();
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<DataField> values, Context context) throws IOException, InterruptedException {
            if (values != null) {
                StructuredData structuredData = null;
                String stringData = null;
                for (DataField value : values) {
                    if ("A".equalsIgnoreCase(value.getKey())) {
                        ByteArrayInputStream bis = new ByteArrayInputStream(value.getValue().getBytes(CHARSET));
                        ObjectInputStream in = new ObjectInputStream(bis);
                        try {
                            structuredData = (StructuredData) in.readObject();
                        } catch (ClassNotFoundException e) {
                            e.printStackTrace();
                        }
                        in.close();
                    } else if ("B".equalsIgnoreCase(value.getKey())) {
                        stringData = value.getValue();
                    }
                }
                Map<String, Integer> resultMap = VerifyUtil.verifyBasicData(stringData, structuredData);
                if (resultMap != null) {
                    Iterator<Map.Entry<String, Integer>> it = resultMap.entrySet().iterator();
                    while (it.hasNext()) {
                        Map.Entry<String, Integer> entry = it.next();
                        outputKey.set(entry.getKey());
                        outputValue.set(entry.getValue());
                        context.write(outputKey, outputValue);
                    }
                }
            }
        }
    }

    /**
     * Statistics Mapper.
     */
    public static class StatisticsMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private Text outputKey = new Text();
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // copyBytes() returns only the valid bytes of the Text buffer.
            String line = new String(value.copyBytes());
            String[] terms = line.split(SEPARATOR);
            if (terms.length != 2) {
                return;
            }
            outputKey.set(terms[0]);
            outputValue.set(Integer.valueOf(terms[1]));
            context.write(outputKey, outputValue);
        }
    }

    /**
     * Statistics Combiner: merges values of the same key to cut down the data shuffled over the network.
     */
    public static class StatisticsCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }

    /**
     * Statistics Reducer.
     */
    public static class StatisticsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }
}
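For completeness: the driver only reads path, date, and output from the Configuration populated by GenericOptionsParser, so the whole chain is launched with generic -D options along these lines (the jar name, package, paths, and date format are placeholders):

hadoop jar basic-data-verify.jar com.example.BasicDataVerifyJob \
    -Dpath=/path/to/large/source \
    -Ddate=<date> \
    -Doutput=/path/to/output/dir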