Joins in MapReduce


1. Map-Side Join

In a map-side join, the data is merged before it reaches the map function.


Basic idea:

(1) Of the two files to be joined, one stays in HDFS as the job input; the other is added to every map task's local cache with DistributedCache.addCacheFile();

(2) In the map function, read the cached file and perform the join;

(3) Output the joined records from the map; since this example runs with zero reducers, they go straight to the job output;

(4) DistributedCache.addCacheFile() must be called before the job is submitted;


DistributedCache

DistributedCache is a file-distribution utility designed to make application development easier. It automatically distributes read-only external files to every node and caches them locally, so that tasks can load them at run time.


Usage steps:

1. Upload the file to HDFS (a text file, a compressed archive, a jar, etc.);

2. Call the appropriate API to register the file with the job;

3. Inside the task (typically in setup()), read the local copy with ordinary file I/O APIs;

Common APIs: DistributedCache.addCacheFile(), DistributedCache.addCacheArchive(), ...
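To make steps 2 and 3 concrete, here is a minimal fragment showing where each call lives. It simply mirrors the full example below and is not a standalone program; the /input2/DEP path is the one used later in this article. (Newer Hadoop releases deprecate DistributedCache in favour of Job.addCacheFile() and context.getCacheFiles().)

// Driver side, before the job is submitted: register the file to be cached
Configuration conf = new Configuration();
DistributedCache.addCacheFile(new Path("/input2/DEP").toUri(), conf);

// Task side, e.g. in Mapper.setup(): the cached file is now a local path on each node
Path[] cached = DistributedCache.getLocalCacheFiles(context.getConfiguration());
BufferedReader reader = new BufferedReader(new FileReader(cached[0].toString()));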

(This example shares its EMP/DEP data with the reduce-side join described below; only the two additional source files are listed here. The two sections ended up in reverse order.)

Source code MapSideMapper.java:

package com.join;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MapSideMapper extends Mapper<LongWritable, Text, NullWritable, EMP_DEP> {

    // DepNo -> DepName, loaded from the cached DEP file in setup()
    private Map<Integer, String> joinData = new HashMap<Integer, String>();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each EMP line: Name Sex Age DepNo
        String[] values = value.toString().split("\\s+");
        EMP_DEP emp_dep = new EMP_DEP();
        emp_dep.setName(values[0]);
        emp_dep.setSex(values[1]);
        emp_dep.setAge(Integer.parseInt(values[2]));
        int depNo = Integer.parseInt(values[3]);
        // The join: look up the department name for this DepNo
        String depName = joinData.get(depNo);
        emp_dep.setDepNo(depNo);
        emp_dep.setDepName(depName);
        context.write(NullWritable.get(), emp_dep);
    }

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read the cached DEP file and build the in-memory lookup table
        Path[] path = DistributedCache.getLocalCacheFiles(context.getConfiguration());
        BufferedReader reader = new BufferedReader(new FileReader(path[0].toString()));
        String str = null;
        while ((str = reader.readLine()) != null) {
            String[] s = str.split("\\s+");
            joinData.put(Integer.parseInt(s[0]), s[1]);
        }
        reader.close();
    }
}


Source code TestMapSideJoin.java:

package com.join;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class TestMapSideJoin {

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 3) {
            System.err.println("Usage: TestMapSideJoin <emp_in> <dep_file> <out>");
            System.exit(2);
        }

        // The DEP file must be registered with the DistributedCache before job submission
        DistributedCache.addCacheFile(new Path(otherArgs[1]).toUri(), conf);

        Job job = new Job(conf, "Map side Join");
        job.setJarByClass(TestMapSideJoin.class);
        job.setMapperClass(MapSideMapper.class);

        // Map-only job: the join happens entirely in the mapper
        job.setNumReduceTasks(0);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(EMP_DEP.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(EMP_DEP.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Put EMP under /input1 and DEP under /input2, then run the job:
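A plausible invocation (the jar name and output path are assumptions; the three arguments are the EMP input directory, the DEP file to cache, and the output directory):

hadoop jar TestJoin.jar com.join.TestMapSideJoin /input1 /input2/DEP /output_mapjoin

Because the job runs with zero reduce tasks, each mapper writes its joined records straight to the output files, one line per EMP row in the format produced by EMP_DEP.toString(): name sex age DepName.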





2. Reduce-Side Join

Reduce-side joins are more common than map-side joins because the input data does not need any particular structure; they are less efficient, though, because all of the data has to go through the shuffle.

Basic idea:

(1) The map reads all of the input files and tags each output record with the file (table) it came from;

(2) In the reduce function, the records arriving for each key are buffered according to their tag;

(3) The join for that key is then computed from the buffered records and the result is written out directly;


Example of a reduce-side join:

Table EMP (create a file named EMP; do not include the header row in the file):

Name    Sex      Age    DepNo
zhang   male     20     1
li      female   25     2
wang    female   30     3
zhou    male     35     2


Table DEP (create a file named DEP; do not include the header row in the file):

DepNo   DepName
1       Sales
2       Dev
3       Mgt
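To see how the join works on this data: the mapper keys every record by DepNo, so for key 2, for example, the reducer receives the DEP record (2, Dev) together with the EMP records for li and zhou, picks up the department name from the DEP record, and should emit "li female 25 Dev" and "zhou male 35 Dev".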


Create a new project TestJoin with a package com.join.

Source code EMP_DEP.java:

package com.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class EMP_DEP implements WritableComparable {

    private String name = "";
    private String sex = "";
    private int age = 0;
    private int DepNo = 0;
    private String DepName = "";
    // Tag recording which table ("EMP" or "DEP") the record came from
    private String table = "";

    public EMP_DEP() {}

    // Copy constructor: needed because Hadoop reuses the value object in the reduce iterator
    public EMP_DEP(EMP_DEP emp_dep) {
        this.name = emp_dep.getName();
        this.sex = emp_dep.getSex();
        this.age = emp_dep.getAge();
        this.DepNo = emp_dep.getDepNo();
        this.DepName = emp_dep.getDepName();
        this.table = emp_dep.getTable();
    }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }

    public String getSex() { return sex; }
    public void setSex(String sex) { this.sex = sex; }

    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }

    public int getDepNo() { return DepNo; }
    public void setDepNo(int depNo) { DepNo = depNo; }

    public String getDepName() { return DepName; }
    public void setDepName(String depName) { DepName = depName; }

    public String getTable() { return table; }
    public void setTable(String table) { this.table = table; }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.sex = in.readUTF();
        this.age = in.readInt();
        this.DepNo = in.readInt();
        this.DepName = in.readUTF();
        this.table = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeUTF(sex);
        out.writeInt(age);
        out.writeInt(DepNo);
        out.writeUTF(DepName);
        out.writeUTF(table);
    }

    @Override
    public int compareTo(Object o) {
        // EMP_DEP is only used as a value, never as a key, so no real ordering is needed
        return 0;
    }

    @Override
    public String toString() {
        return name + " " + sex + " " + age + " " + DepName;
    }
}

Source code ReduceSideMapper.java:

package com.join;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ReduceSideMapper extends Mapper<LongWritable, Text, IntWritable, EMP_DEP> {

    private EMP_DEP emp_dep = new EMP_DEP();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] values = value.toString().split("\\s+");
        // EMP lines have 4 columns: Name Sex Age DepNo
        if (values.length == 4) {
            emp_dep.setName(values[0]);
            emp_dep.setSex(values[1]);
            emp_dep.setAge(Integer.parseInt(values[2]));
            emp_dep.setDepNo(Integer.parseInt(values[3]));
            emp_dep.setTable("EMP");
            // Key on DepNo so EMP and DEP records for the same department meet in one reduce call
            context.write(new IntWritable(Integer.parseInt(values[3])), emp_dep);
        }
        // DEP lines have 2 columns: DepNo DepName
        if (values.length == 2) {
            emp_dep.setDepNo(Integer.parseInt(values[0]));
            emp_dep.setDepName(values[1]);
            emp_dep.setTable("DEP");
            context.write(new IntWritable(Integer.parseInt(values[0])), emp_dep);
        }
    }
}

Source code ReduceSideReducer.java:

package com.join;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class ReduceSideReducer extends Reducer<IntWritable, EMP_DEP, NullWritable, EMP_DEP> {

    @Override
    protected void reduce(IntWritable key, Iterable<EMP_DEP> value, Context context)
            throws IOException, InterruptedException {
        String depName = "";
        List<EMP_DEP> list = new LinkedList<EMP_DEP>();
        for (EMP_DEP val : value) {
            // Copy each record: Hadoop reuses the same EMP_DEP instance while iterating
            list.add(new EMP_DEP(val));
            // Remember the department name carried by the DEP record for this key
            if (val.getTable().equals("DEP")) {
                depName = val.getDepName();
            }
        }
        // Emit every EMP record for this DepNo, filled in with the department name
        for (EMP_DEP v : list) {
            if (v.getTable().equals("EMP")) {
                v.setDepName(depName);
                context.write(NullWritable.get(), v);
            }
        }
    }
}

Source code TestReduceSideJoin.java:

package com.join;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class TestReduceSideJoin {

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: TestReduceSideJoin <in> <out>");
            System.exit(2);
        }

        Job job = new Job(conf, "Reduce side Join");
        job.setJarByClass(TestReduceSideJoin.class);
        job.setMapperClass(ReduceSideMapper.class);
        job.setReducerClass(ReduceSideReducer.class);

        // Intermediate (map output) types: DepNo -> EMP_DEP
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(EMP_DEP.class);

        // Final output types written by the reducer
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(EMP_DEP.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}


Export the jar file and run the job:
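A plausible invocation, assuming both the EMP and DEP files have been uploaded to a single HDFS input directory (the jar name, the /input directory, and the output path are assumptions):

hadoop jar TestJoin.jar com.join.TestReduceSideJoin /input /output_reducejoin

A single input path is enough because ReduceSideMapper reads both files and tells them apart by column count.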


