MapReduce Join Operations
It is common for a MapReduce job to process two or more data sources, which then have to be joined on some key. The usual rules of thumb are:
1) If the two sources are roughly the same size, do a straightforward reduce-side join, for example with the contrib DataJoin classes (a hand-rolled sketch of the same tagging idea follows this list).
2) If one source is much smaller than the other, use the Distributed Cache to replicate the small source to every node and use it to filter the large source in the mappers. If even the small source cannot be cached, cache only its keys instead: replicate the key set to all nodes, filter the large source against it, and then join the filtered result with the small source.
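For case 1, if you would rather not pull in the contrib DataJoin classes, the same idea is easy to hand-roll. The sketch below is a minimal illustration under stated assumptions, not production code: all class names are hypothetical, each mapper tags its records with the source they came from, and the reducer cross-joins the two groups that arrive under each key.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class TaggedJoin {

    /** Tags each line of source A as "A\t<record>", keyed by the join field. */
    public static class SourceAMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map (LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] terms = value.toString().split("\t", 2);
            if (terms.length == 2) {
                context.write(new Text(terms[0]), new Text("A\t" + terms[1]));
            }
        }
    }

    /** Same for source B (in practice both are wired in via MultipleInputs). */
    public static class SourceBMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map (LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] terms = value.toString().split("\t", 2);
            if (terms.length == 2) {
                context.write(new Text(terms[0]), new Text("B\t" + terms[1]));
            }
        }
    }

    /** Buffers both sides, then emits their cross product for each key. */
    public static class JoinReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce (Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            List<String> sideA = new ArrayList<String>();
            List<String> sideB = new ArrayList<String>();
            for (Text value : values) {
                // Split off the source tag added by the mappers.
                String[] tagged = value.toString().split("\t", 2);
                (tagged[0].equals("A") ? sideA : sideB).add(tagged[1]);
            }
            for (String a : sideA) {
                for (String b : sideB) {
                    context.write(key, new Text(a + "\t" + b));
                }
            }
        }
    }
}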
The problem I ran into this week was the second case: the smaller data source could not be cached in memory (the tasks threw OOM errors).
The solution is to chain two jobs.
The first job strips the smaller data source down to a file containing only its keys.
In the second job, one mapper replicates that key-only file to all nodes via the Distributed Cache and uses it to filter the large source, emitting each surviving record under its unique ID; a second mapper loads the smaller source and emits its records under the same unique IDs. The reducer then receives both sources' records for each ID and performs whatever join work it needs. A minimal sketch of this wiring follows.
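Schematically, the second job might be wired up as below in vanilla Hadoop, reusing the hypothetical TaggedJoin classes from the sketch above; the paths and job name are also placeholders. In the real flow the large-source mapper would additionally drop every record whose key is absent from the cached key set, as FilterMapper does in the production code further down.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JoinDriver {
    public static void main (String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "join-by-unique-id");
        job.setJarByClass(JoinDriver.class);

        // Two mappers feed one shuffle: the large source (to be filtered against
        // the cached key file) and the small source, both keyed by the unique ID.
        MultipleInputs.addInputPath(job, new Path("/data/large"),
                TextInputFormat.class, TaggedJoin.SourceAMapper.class);
        MultipleInputs.addInputPath(job, new Path("/data/small"),
                TextInputFormat.class, TaggedJoin.SourceBMapper.class);

        // Ship the key-only output of the first job to every node.
        job.addCacheFile(new URI("/tmp/mappingIDFile/part-r-00000"));

        job.setReducerClass(TaggedJoin.JoinReducer.class); // joins both sources per ID
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/join-output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}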
Enough preamble; here is the full annotated production code (a third job at the end aggregates the verification results):
/**
 * Reverse-verification job for user basic-attribute data.
 * Created by luweijie@xiaomi.com on 15-7-20.
 */
public class BasicDataVerifyJob {

    private static Logger logger = LoggerFactory.getLogger(BasicDataVerifyJob.class);
    private static final String SEPARATOR = "\t";

    /**
     * Job entry point.
     */
    public static void main (String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        new GenericOptionsParser(conf, args).getRemainingArgs();
        if (StringUtils.isBlank(conf.get("path"))
                || StringUtils.isBlank(conf.get("date"))
                || StringUtils.isBlank(conf.get("output"))) {
            System.err.println("jvm args: -Dpath -Ddate -Doutput must be specified!");
            System.exit(2);
        }

        FileSystem fs = FileSystem.get(conf);
        String path = conf.get("path");
        if (!fs.exists(new Path(path))) {
            logger.error("input path not exists : " + path);
            System.exit(-1);
        }

        // Output directory: recreate it from scratch.
        String output = conf.get("output");
        Path outputPath = new Path(output);
        if (!output.endsWith("/")) {
            output += "/";
        }
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        // Job 1: produce the mappingID (key-only) file from the smaller source.
        String mappingIDFileOutput = output + "mappingIDFile";
        MicloudMRJob job1 = MicloudMRJob.getInstance(conf, "Generate mappingID file: " + path + " --> " + mappingIDFileOutput);
        job1.setJarByClass(BasicDataVerifyJob.class);
        job1.setMapperClass(MappingIDGeneratorMapper.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(NullWritable.class);
        MultipleInputs.addInputPath(job1, new Path(path), TextInputFormat.class, MappingIDGeneratorMapper.class);
        job1.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job1, new Path(mappingIDFileOutput));

        // Job 2: once the mappingID file has been generated, filter the original
        // data against it and run the comparison/verification.
        if (job1.waitForCompletion(true)) {
            PathManager pathManager = PathManager.create(conf.get("date"));
            MicloudHDFSInputConfig inputConfig = new MicloudHDFSInputConfig("UserPublicInformation", FilterMapper.class, BasicDataVerifyJob.class, pathManager, Text.class, DataField.class);
            String verifyFile = output + "verify";
            OutputConfig outputConfig = new OutputConfig();
            outputConfig.setOutputPath(verifyFile);
            outputConfig.setOutputFormatClass(TextOutputFormat.class);
            outputConfig.setOutKeyClass(Text.class);
            outputConfig.setOutValueClass(IntWritable.class);
            outputConfig.setReducerClass(VerifyReducer.class);
            MicloudMRJob job2 = MicloudDataContext.getInstance().createJob(conf, inputConfig, outputConfig, "Verify Basic Data Job: " + pathManager.getOutputPath() + " --> " + verifyFile);
            MultipleInputs.addInputPath(job2, new Path(path), TextInputFormat.class, VerifyHDFSMapper.class);

            // Ship the key file of the smaller data source to all nodes via the
            // distributed cache.
            FileStatus[] fileStatuses = fs.listStatus(new Path(mappingIDFileOutput));
            URI[] uris = new URI[fileStatuses.length];
            for (int i = 0; i < fileStatuses.length; i++) {
                uris[i] = fileStatuses[i].getPath().toUri();
            }
            job2.setCacheFiles(uris);

            // Reducer count.
            job2.setNumReduceTasks(50);

            // Job 3: aggregate the verification results.
            if (job2.waitForCompletion(true)) {
                String resultFile = output + "result";
                MicloudMRJob job3 = MicloudMRJob.getInstance(conf, "Statistics Verify Data Job: " + verifyFile + " --> " + resultFile);
                job3.setJarByClass(BasicDataVerifyJob.class);
                job3.setMapperClass(StatisticsMapper.class);
                job3.setMapOutputKeyClass(Text.class);
                job3.setMapOutputValueClass(IntWritable.class);
                MultipleInputs.addInputPath(job3, new Path(verifyFile), TextInputFormat.class, StatisticsMapper.class);
                job3.setOutputFormatClass(TextOutputFormat.class);
                FileOutputFormat.setOutputPath(job3, new Path(resultFile));
                job3.setCombinerClass(StatisticsCombiner.class);
                job3.setReducerClass(StatisticsReducer.class);
                job3.setNumReduceTasks(3);
                job3.waitForCompletion(true);
            }
        }
    }

    /**
     * Mapper that produces the mappingID file.
     */
    public static class MappingIDGeneratorMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        private Text outputKey = new Text();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = new String(value.copyBytes());
            String[] terms = line.split(SEPARATOR);
            if (terms.length < 2) {
                return;
            }
            outputKey.set(terms[0]);
            context.write(outputKey, NullWritable.get());
        }
    }

    /**
     * Character encoding used for the object-serialization round trip.
     */
    private static final String CHARSET = "ISO-8859-1";

    /**
     * Filter mapper: keeps only the records whose MappingID is in the cached key set.
     */
    public static class FilterMapper extends MicloudHDFSMapper<Text, DataField> {
        private static Set<String> joinData = new HashSet<String>();
        private Text outputKey = new Text();

        @Override
        protected void setup (Context context) {
            // Load the distributed cache files into joinData, the in-memory key
            // set local to each map task.
            try {
                String line = null;
                Path[] cacheFiles = context.getLocalCacheFiles();
                if (cacheFiles != null && cacheFiles.length > 0) {
                    for (int i = 0; i < cacheFiles.length; i++) {
                        BufferedReader joinReader = new BufferedReader(new FileReader(cacheFiles[i].toString()));
                        try {
                            while ((line = joinReader.readLine()) != null) {
                                joinData.add(line);
                            }
                        } finally {
                            joinReader.close();
                        }
                    }
                }
            } catch (IOException e) {
                // Fail the task loudly if the cache files cannot be read.
                throw new RuntimeException("failed to load distributed cache files", e);
            }
        }

        @Override
        public void persistMapOutput (Mapper.Context context, StructuredData data) throws IOException, InterruptedException {
            String mappingID = String.valueOf(data.getData("MappingID"));
            if (joinData.contains(mappingID)) {
                outputKey.set(mappingID);
                // Serialize the record and tag it "A" so the reducer can tell
                // which source each value came from.
                ByteArrayOutputStream bos = new ByteArrayOutputStream();
                ObjectOutputStream out = new ObjectOutputStream(bos);
                out.writeObject(data);
                DataField outputValue = new DataField("A", bos.toString(CHARSET));
                context.write(outputKey, outputValue);
                out.close();
            }
        }
    }

    /**
     * Mapper for the HDFS input file; tags its records "B".
     */
    public static class VerifyHDFSMapper extends Mapper<LongWritable, Text, Text, DataField> {
        private Text outputKey = new Text();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = new String(value.copyBytes());
            String[] terms = line.split(SEPARATOR);
            if (terms.length < 2) {
                return;
            }
            outputKey.set(terms[0]);
            DataField outputValue = new DataField("B", terms[1]);
            context.write(outputKey, outputValue);
        }
    }

    /**
     * Verify reducer: receives the "A" and "B" values that share an ID and compares them.
     */
    public static class VerifyReducer extends Reducer<Text, DataField, Text, IntWritable> {
        private Text outputKey = new Text();
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<DataField> values, Context context) throws IOException, InterruptedException {
            StructuredData structuredData = null;
            String stringData = null;
            for (DataField value : values) {
                if ("A".equalsIgnoreCase(value.getKey())) {
                    // Deserialize the record produced by FilterMapper.
                    ByteArrayInputStream bis = new ByteArrayInputStream(value.getValue().getBytes(CHARSET));
                    ObjectInputStream in = new ObjectInputStream(bis);
                    try {
                        structuredData = (StructuredData) in.readObject();
                    } catch (ClassNotFoundException e) {
                        e.printStackTrace();
                    }
                    in.close();
                } else if ("B".equalsIgnoreCase(value.getKey())) {
                    stringData = value.getValue();
                }
            }
            Map<String, Integer> resultMap = VerifyUtil.verifyBasicData(stringData, structuredData);
            if (resultMap != null) {
                for (Map.Entry<String, Integer> entry : resultMap.entrySet()) {
                    outputKey.set(entry.getKey());
                    outputValue.set(entry.getValue());
                    context.write(outputKey, outputValue);
                }
            }
        }
    }

    /**
     * Statistics mapper.
     */
    public static class StatisticsMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private Text outputKey = new Text();
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // copyBytes() returns exactly getLength() bytes; Text.getBytes() may
            // carry stale trailing bytes from the reused buffer.
            String line = new String(value.copyBytes());
            String[] terms = line.split(SEPARATOR);
            if (terms.length != 2) {
                return;
            }
            outputKey.set(terms[0]);
            outputValue.set(Integer.valueOf(terms[1]));
            context.write(outputKey, outputValue);
        }
    }

    /**
     * Statistics combiner: merges values for the same key locally to cut shuffle traffic.
     */
    public static class StatisticsCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }

    /**
     * Statistics reducer.
     */
    public static class StatisticsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce (Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }
}
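For completeness, launching the driver would look something like the line below; the jar name and all paths are hypothetical, and the three properties are the ones main() requires (GenericOptionsParser folds the -D options into the Configuration):

hadoop jar basic-data-verify.jar BasicDataVerifyJob -Dpath=/data/small-source -Ddate=20150720 -Doutput=/user/foo/verify-output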