Hadoop Application: Map-Side Join


Join
Use case
Table EMP:

Name    Sex     Age  DepNo
zhang   male    20   1
li      female  25   2
wang    female  30   3
zhou    male    35   2

Table DEP:

DepNo   DepName
1       Sales
2       Dev
3       Mgt

In a map-side join, the two datasets are merged before the records reach the map function.
Basic approach
1) Of the two files to be joined, one is stored in HDFS and read as the job's normal input; the other is added to every mapper's cache with DistributedCache.addCacheFile().
2) The Mapper reads the cached file and performs the join inside map() (the join logic itself is sketched right after this list).
3) The joined records are written to HDFS.
4) DistributedCache.addCacheFile() must be called before the job is submitted.
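
To make the idea concrete before looking at the MapReduce code, here is a minimal plain-Java sketch of the same hash join. The EMP and DEP rows are hard-coded from the example tables above; the class name HashJoinSketch and everything else in it is purely illustrative and not part of the original job.

import java.util.HashMap;
import java.util.Map;

// Stand-alone illustration of the in-memory hash join performed by the Mapper below.
public class HashJoinSketch {
    public static void main(String[] args) {
        // Analogue of steps 1-2: load the small DEP table into a lookup map.
        Map<Integer, String> dep = new HashMap<Integer, String>();
        dep.put(1, "Sales");
        dep.put(2, "Dev");
        dep.put(3, "Mgt");

        // Analogue of steps 2-3: stream the EMP records and join each one by a map lookup.
        String[] emp = {
            "zhang male 20 1",
            "li female 25 2",
            "wang female 30 3",
            "zhou male 35 2"
        };
        for (String line : emp) {
            String[] f = line.split("\\s+");
            int depNo = Integer.parseInt(f[3]);
            System.out.println(f[0] + " " + f[1] + " " + f[2] + " " + dep.get(depNo));
        }
    }
}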

How Hadoop DistributedCache works
DistributedCache is a file-distribution facility that Hadoop provides to simplify application development. It automatically distributes read-only external files to every node and caches them locally so that tasks can load them at run time. Its overall workflow is as follows:
1. After the user submits a job, Hadoop copies the files specified with the -files and -archives options to the JobTracker's file system (normally HDFS);
2. When a TaskTracker receives the first task of the job, that task downloads the files from the JobTracker's file system to local disk and caches them, so that subsequent tasks of the same job can read them directly from local storage.
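
Reduced to the two API calls used by the listings that follow, the pattern looks like this. The path is a placeholder, and the fragment assumes a Configuration named conf on the submit side and the Mapper's Context inside the task; the complete, runnable versions are in MapSideMapper.java and TestMapSideJoin.java below.

// Submit side, before the Job object is created: register the read-only file.
DistributedCache.addCacheFile(new Path("/user/demo/dep.txt").toUri(), conf);  // placeholder path

// Task side, e.g. in Mapper.setup(): the file has already been pulled to local disk.
Path[] local = DistributedCache.getLocalCacheFiles(context.getConfiguration());
BufferedReader reader = new BufferedReader(new FileReader(local[0].toString()));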
EMP_DEP.java

package com.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class EMP_DEP implements WritableComparable {

    private String name = "";
    private String sex = "";
    private int age = 0;
    private int depNo = 0;
    private String depName = "";
    private String table = "";

    public EMP_DEP() {
    }

    public EMP_DEP(EMP_DEP emp_dep) {
        this.name = emp_dep.getName();
        this.sex = emp_dep.getSex();
        this.age = emp_dep.getAge();
        this.depNo = emp_dep.getDepNo();
        this.depName = emp_dep.getDepName();
        this.table = emp_dep.getTable();
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getSex() {
        return sex;
    }

    public void setSex(String sex) {
        this.sex = sex;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public int getDepNo() {
        return depNo;
    }

    public void setDepNo(int depNo) {
        this.depNo = depNo;
    }

    public String getDepName() {
        return depName;
    }

    public void setDepName(String depName) {
        this.depName = depName;
    }

    public String getTable() {
        return table;
    }

    public void setTable(String table) {
        this.table = table;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.sex = in.readUTF();
        this.age = in.readInt();
        this.depNo = in.readInt();
        this.depName = in.readUTF();
        this.table = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeUTF(sex);
        out.writeInt(age);
        out.writeInt(depNo);
        out.writeUTF(depName);
        out.writeUTF(table);
    }

    @Override
    public int compareTo(Object o) {
        return 0;
    }

    public String toString() {
        return name + " " + sex + " " + age + " " + depName;
    }
}

MapSideMapper.java

package com.join;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MapSideMapper extends Mapper<LongWritable, Text, NullWritable, EMP_DEP> {

    // DepNo -> DepName, loaded from the cached DEP file in setup()
    private Map<Integer, String> joinData = new HashMap<Integer, String>();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each input line is a record from the EMP table
        String[] values = value.toString().split("\\s+");
        EMP_DEP emp_dep = new EMP_DEP();
        emp_dep.setName(values[0]);
        emp_dep.setSex(values[1]);
        emp_dep.setAge(Integer.valueOf(values[2]));
        int depNo = Integer.parseInt(values[3]);
        // Join: look up the department name for this DepNo
        String depName = joinData.get(depNo);
        emp_dep.setDepNo(depNo);
        emp_dep.setDepName(depName);
        context.write(NullWritable.get(), emp_dep);
    }

    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        Path[] path = DistributedCache.getLocalCacheFiles(context.getConfiguration());
        // Only the DEP file was cached, so path[0] points at its copy on the local file system
        BufferedReader reader = new BufferedReader(new FileReader(path[0].toString()));
        String str = null;
        while ((str = reader.readLine()) != null) {
            String[] s = str.split("\\s+");
            joinData.put(Integer.valueOf(s[0]), s[1]);
        }
        reader.close();
    }
}

When a Mapper or Reducer uses such files, they have already been downloaded to the task's local working directory before the Mapper or Reducer starts running, so their contents can be read with the ordinary file I/O API. Here the lookup table is built in setup(), which runs once before the first call to map().

TestMapSideJoin.java

package com.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class TestMapSideJoin {

    public static void main(String args[]) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 3) {
            System.err.println("Usage: TestMapSideJoin <emp input> <dep cache file> <out>");
            System.exit(2);
        }

        // The cache file must be registered on the Configuration before the Job is created
        DistributedCache.addCacheFile(new Path(otherArgs[1]).toUri(), conf);

        Job job = new Job(conf, "Map side join");
        job.setJarByClass(TestMapSideJoin.class);
        job.setMapperClass(MapSideMapper.class);
        job.setNumReduceTasks(0);  // map-only job: no reduce phase
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(EMP_DEP.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Registering the external file with DistributedCache takes only one line of code:

DistributedCache.addCacheFile(new Path(otherArgs[1]).toUri(), conf);
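
Assuming the EMP and DEP tables above have been uploaded to HDFS (the jar name and paths below are placeholders), the job could be launched like this:

hadoop jar mapjoin.jar com.join.TestMapSideJoin /user/demo/emp /user/demo/dep /user/demo/out

Because the map output key is NullWritable, the default TextOutputFormat writes only the value, so the output files should contain one EMP_DEP.toString() line per employee, along the lines of:

zhang male 20 Sales
li female 25 Dev
wang female 30 Mgt
zhou male 35 Dev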