MapReduce-Join操作-初体验

来源：互联网发布：网络电视套餐编辑：程序博客网时间：2024/05/29 09:30

这一篇博客说说mapreduce的join问题。根据参与join的各个文件的数据量大小，可以选择以下几种方式：
1.repartition join -- reduce-side join 适用于合并两个或更多的大型数据集
2.replication join -- map-side join 适用于数据集小于缓存容量的情形
3.semi join -- 另一种map-side join 适用于数据集太大而不能导入内存的情形，但是经过一些过滤措施可以将其减小到适合于内存处理的大小
这里先不详细说明这几种join的区别，后面会挨个做一个实例，然后再分别说明。下面先来做reduce-side join，它也是使用最广泛的一种join，并且支持多路合并。下面就是今天的需求：
用户数据：
uid,name,phoneid
1,tom,40
2,jack,20
3,seven,30
4,lee,10
5,smith,20
6,张三,10
7,李四,30
8,王五,20

手机数据：
goodid,name
10,苹果
20,三星
30,LG
40,华为

输出结果：
张三 苹果
lee 苹果
王五 三星
smith 三星
jack 三星
李四 LG
seven LG
tom 华为

定制Writable可序列化对象：（实现hadoop的序列化，写法同WritableComparable，只是没有比较的功能，不用实现compareTo（）方法）

如果要了解如何定制WritableComparable可以参考《MapReduce-自定义Key-二次排序》

import java.io.DataInput;import java.io.DataOutput;import java.io.IOException;import org.apache.hadoop.io.Writable;public class User implements Writable {private String uno = "";private String name = "";private String pname = "";private String pno = "";private int flag = 0;public User() {}public User(User u) {super();this.uno = u.uno;this.name = u.name;this.pname = u.pname;this.pno = u.pno;this.flag = u.flag;}public User(String uno, String name, String pname, String pno, int flag) {super();this.uno = uno;this.name = name;this.pname = pname;this.pno = pno;this.flag = flag;}@Overridepublic void readFields(DataInput input) throws IOException {this.uno = input.readUTF();this.name = input.readUTF();this.pname = input.readUTF();this.pno = input.readUTF();this.flag = input.readInt();}@Overridepublic void write(DataOutput output) throws IOException {output.writeUTF(uno);output.writeUTF(name);output.writeUTF(pname);output.writeUTF(pno);output.writeInt(flag);}public String getUno() {return uno;}public void setUno(String uno) {this.uno = uno;}public String getName() {return name;}public void setName(String name) {this.name = name;}public String getPname() {return pname;}public void setPname(String pname) {this.pname = pname;}public String getPno() {return pno;}public void setPno(String pno) {this.pno = pno;}public int getFlag() {return flag;}public void setFlag(int flag) {this.flag = flag;}@Overridepublic String toString() {return name + " " + pname;}}

map阶段：

import java.io.IOException;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Mapper;public class JoinMapper extends Mapper<LongWritable, Text, IntWritable, User> {@Overrideprotected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {String line = value.toString();/** * 判断是否为空行 */if(line.trim().length() <= 0) {return;}String[] arr = line.split(",");/** * 如果是用户数据则设置flag=0 */if (arr.length == 3) {User u = new User();u.setUno(arr[0]);u.setName(arr[1]);u.setFlag(0);context.write(new IntWritable(Integer.parseInt(arr[2].trim())), u);} else if (arr.length == 2) {/** * 如果是手机数据则把flag=1 */User u = new User();u.setPname(arr[1]);u.setPno(arr[0]);u.setFlag(1);/** * 都把要join的字段作为key,这样就可以让其到reduce函数处理时在同一个 * 迭代器中，这样就可以在reduce函数中做join的操作 */context.write(new IntWritable(Integer.parseInt(arr[0].trim())), u);}}}

reduce阶段：

import java.io.IOException;import java.util.ArrayList;import java.util.List;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Reducer;public class JoinReducer extends Reducer<IntWritable, User, NullWritable, Text> {@Overrideprotected void reduce(IntWritable key, Iterable<User> values, Context context)throws IOException, InterruptedException {User phone = null;List<User> users = new ArrayList<User>();/** * 遍历迭代器，找出其中的手机的相关信息并存入指定对象 * 这里只是简单的体验一下join操作的基本本方式，而且这 * 种写法是肯定不能用于线上的，后面总结部分会做详细的 * 分析，而且在后面的博客中会一步步的分享可行的方案 */for(User e: values) {if(e.getFlag() == 1) {phone = new User(e);} else if (e.getFlag() == 0) {users.add(new User(e));}}/** * 遍历user集合，把手机信息添加到user对象中并输出达到我们的实验目的 * 这里就是join操作发生的地方 */for( User e: users) {e.setPno(phone.getPno());e.setPname(phone.getPname());context.write(NullWritable.get(), new Text(e.toString()));}}}

启动函数：

import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;public class JobMain {public static void main(String[] args) throws Exception{Configuration configuration = new Configuration();Job job = new Job(configuration, "join-job");job.setJarByClass(JobMain.class);job.setMapperClass(JoinMapper.class);job.setMapOutputKeyClass(IntWritable.class);job.setMapOutputValueClass(User.class);job.setReducerClass(JoinReducer.class);job.setOutputKeyClass(NullWritable.class);job.setOutputValueClass(Text.class);FileInputFormat.addInputPath(job, new Path(args[0]));Path outputDir = new Path(args[1]);FileSystem fs  = FileSystem.get(configuration);if(fs.exists(outputDir)) {fs.delete(outputDir, true);}FileOutputFormat.setOutputPath(job, outputDir);System.exit(job.waitForCompletion(true)?0:1);}}

运行结果：

总结：

注释中已经说明，这种Join的mapreduce写法并不好，只是用于体验join的流程：它效率低、资源消耗大，而且不能适用于所有的业务。效率低是因为在reduce端遍历了两次集合；资源消耗大是因为重新创建了List来存放迭代器中几乎所有的数据；不能适用于所有的业务是因为正式环境中一个reduce的迭代器里的数据量往往十分巨大，而List的最大容量为Integer.MAX_VALUE，所以在数据量巨大的时候会造成List越界的错误（实际上往往在达到该上限之前就会先耗尽内存）。所以后面会分享《hadoop硬实战》和《hadoop in action》中的解决方法，一步步地优化join方案。

0 0