Hadoop Left Join2
来源:互联网 发布:emlog博客源码 编辑:程序博客网 时间:2024/05/29 17:23
我在上一篇博客Hadoop实现LeftJoin操作上已经分享过一种实现LeftJoin操作的方法。这次分享一种自定义数据类型来实现LeftJoin,该方法相对与之前的方法要更高效。简单来说,之前分享的方法是把两张表先按照同一种格式去map,也就是说无论是员工表还是公司表都是一样处理,只不过在Reducer时进行判断,将原本为null的内容替换,所以会有不少的浪费。而本次介绍的方法,自定义一种数据类型,对同一个公司,把公司表放在员工表前面,这样每个Key的第一个一定是公司,不需要做判断,也不需要重复存储一个公司。
说了这么多废话,接下来正式介绍本方法。首先将测试数据放在这方便理解:
employee
jd,david
jd,mike
tb,mike
tb,lucifer
elong,xiaoming
elong,ali
tengxun,xiaoming
tengxun,lilei
xxx,aaa
salary
jd,1600
tb,1800
elong,2000
tengxun,2200
首先定义一个自定义的键值,该键值作为Map过程的输出键值,包含一个Cid,表示公司编号(在这里就是公司名称,后面不再赘述),一个isEmployee,代表是否为employee。hadoop规定,若是自定义键值类型,必须:
1、实现Writable接口
实现readFields,将输入流字节反序列化
public void readFields(DataInput in) throws IOException { this.cid = in.readUTF() ; this.isEmployee = in.readBoolean(); }
实现write,把每个对象序列化到输出
public void write(DataOutput out) throws IOException { out.writeUTF(cid); out.writeBoolean(isEmployee); }
注意readFields和write里面的顺序必须相同
2、重写compareTo函数
重写compareTo函数,自定义比较函数
public int compareTo(EmployeeKey o) { if(this.cid .equals(o.cid)){ if(this.getIs()==o.getIs()){ return 0 ; } else if(this.getIs()==true){ return 1 ; } else return -1 ; } else { return (this.getCid().compareTo(o.getCid())>0)?1:-1 ; } }
3、重写group
如果不重写group,由于公司表和职员表里面的isEmployee不同,同一个公司的公司表和职员表将不在一个List中,这当然不是我们想要的结果。
因此要实现一个继承WritableComparator的类,并在主类中设置setGroupingComparatorClass
class GroupComParator extends WritableComparator{ public GroupComParator(){ super(EmployeeKey.class,true) ; } @SuppressWarnings("rawtypes") @Override public int compare(WritableComparable a , WritableComparable b) { EmployeeKey aKey = (EmployeeKey) a ; EmployeeKey bKey = (EmployeeKey) b ; if(aKey.getCid().equalsIgnoreCase(bKey.getCid())){ return 0 ; } else{ return ( aKey.getCid().compareTo(bKey.getCid()) == -1)?1:-1 ; } } }
自定义好mapper的输出键类型后,就执行mapper操作
class MyMapper extends Mapper<Object,Text,EmployeeKey,Employee> { protected void map(Object key,Text value ,Context context)throws IOException,InterruptedException{ String filePath = ( (FileSplit)context.getInputSplit()).getPath().toString() ; String val = value.toString() ; if(val == null || val == "") return ; EmployeeKey outKey = new EmployeeKey() ; if(filePath.indexOf("employee")!=-1){ String[] line = val.split(",") ; if(line.length<2)return ; Employee em = new Employee() ; outKey.setCid(line[0]); outKey.setIs(true); em.setCid(line[0]); em.setEid(line[1]); em.setSalary("-1"); context.write(outKey,em); } else if(filePath.indexOf("salary")!=-1){ String[] line = val.split(",") ; if(line.length<2) return ; Employee em = new Employee() ; outKey.setCid(line[0]); outKey.setIs(false); em.setCid(line[0]) ; em.setEid("-1"); em.setSalary(line[1]) ; context.write(outKey,em); } }}
以上过程会将同一个cid的放在一个list里面,接下来只要在reducer过程中将第一个元素的salary填入其他的元素中即可合并两张表。
package join;import java.io.DataInput;import java.io.DataOutput;import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.io.Writable;import org.apache.hadoop.io.WritableComparable;import org.apache.hadoop.io.WritableComparator;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.FileSplit;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;class EmployeeKey implements WritableComparable<EmployeeKey>{ private String cid ; private boolean isEmployee ; public EmployeeKey() { } public EmployeeKey(String cid ,boolean isEmployee){ this.cid = cid ; this.isEmployee = isEmployee; } public void readFields(DataInput in) throws IOException { this.cid = in.readUTF() ; this.isEmployee = in.readBoolean(); } public void write(DataOutput out) throws IOException { out.writeUTF(cid); out.writeBoolean(isEmployee); } public int compareTo(EmployeeKey o) { if(this.cid .equals(o.cid)){ if(this.getIs()==o.getIs()){ return 0 ; } else if(this.getIs()==true){ return 1 ; } else return -1 ; } else { return (this.getCid().compareTo(o.getCid())>0)?1:-1 ; } } public String getCid(){ return this.cid ; } public boolean getIs(){ return this.isEmployee ; } public void setCid(String cid){ this.cid = cid; } public void setIs(boolean isEmployee){ this.isEmployee = isEmployee; }}class Employee implements Writable{ private String cid ; private String eid ; private String salary ; public void readFields(DataInput in) throws IOException { this.cid = in.readUTF() ; this.eid = in.readUTF() ; this.salary = in.readUTF() ; } public void write(DataOutput out) throws IOException { out.writeUTF(cid) ; out.writeUTF(eid) ; out.writeUTF(salary); } public String getCid() { return cid; } public void setCid(String cid) { this.cid = cid; } public String getEid() { return eid; } public void setEid(String eid) { this.eid = eid; } public String getSalary() { return salary; } public void setSalary(String salary) { this.salary = salary; }}class GroupComParator extends WritableComparator{ public GroupComParator(){ super(EmployeeKey.class,true) ; } @SuppressWarnings("rawtypes") @Override public int compare(WritableComparable a , WritableComparable b) { EmployeeKey aKey = (EmployeeKey) a ; EmployeeKey bKey = (EmployeeKey) b ; if(aKey.getCid().equalsIgnoreCase(bKey.getCid())){ return 0 ; } else{ return ( aKey.getCid().compareTo(bKey.getCid()) == -1)?1:-1 ; } } } class MyMapper extends Mapper<Object,Text,EmployeeKey,Employee> { protected void map(Object key,Text value ,Context context)throws IOException,InterruptedException{ String filePath = ( (FileSplit)context.getInputSplit()).getPath().toString() ; String val = value.toString() ; if(val == null || val == "") return ; EmployeeKey outKey = new EmployeeKey() ; if(filePath.indexOf("employee")!=-1){ String[] line = val.split(",") ; if(line.length<2)return ; Employee em = new Employee() ; outKey.setCid(line[0]); outKey.setIs(true); //System.out.println(line[1]+" outkey"); em.setCid(line[0]); em.setEid(line[1]); em.setSalary("-1"); context.write(outKey,em); } else if(filePath.indexOf("salary")!=-1){ String[] line = val.split(",") ; if(line.length<2) return ; Employee em = new Employee() ; outKey.setCid(line[0]); outKey.setIs(false); em.setCid(line[0]) ; em.setEid("-1"); em.setSalary(line[1]) ; context.write(outKey,em); } }}class MyReducer extends Reducer<EmployeeKey,Employee,Object,Text> { protected void reduce(EmployeeKey key,Iterable<Employee> vals ,Context context)throws IOException,InterruptedException{ EmployeeKey eKey = key ; String salary="" ; for(Employee val : vals){ if(eKey.getIs() == true){ val.setSalary(salary); if(salary!="") context.write( null,new Text(val.getCid()+" "+val.getEid()+" "+val.getSalary())); else context.write( null,new Text(val.getCid()+" "+val.getEid()+" "+"NULL")); } else{ salary = new String(val.getSalary()); } } }}public class LeftJoin { public static void main(String[] args) throws Exception{ Configuration config = new Configuration(); Job job = new Job(config,"Left Join") ; FileInputFormat.setInputPaths(job, new Path(args[0])) ; FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setGroupingComparatorClass(GroupComParator.class); job.setJarByClass(LeftJoin.class); job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(EmployeeKey.class); job.setMapOutputValueClass(Employee.class); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); System.exit(job.waitForCompletion(true)?0:1); }}
- Hadoop Left Join2
- Hadoop Left Join
- left
- Hadoop “No space left on device”问题解决方法
- Hadoop streaming: Exception in thread "main" java.io.IOException: No space left on device
- Left Join
- Left Join
- left-right
- left 截取
- Left join
- left join
- left join
- left join
- LEFT JOIN
- who left?
- left floating
- Left Join
- left join
- Python Web开发 之Django框架入门学习笔记(一)——安装和初步使用
- 一个简单的Java反射Demo
- Hadoop Left Join
- 支持向量机(SVM)与其理论发展(2):对偶学习
- Qt5注册全局热键
- Hadoop Left Join2
- Linux下C语言多线程学习之一——线程的创建
- 网页特效代码
- Codeforces
- 使用weka内置算法分析数据(图形界面操作)
- Atitit 个人 企业 政府 等组织 财政收入分类与提升途径attilax总结 v2
- Python(三)
- ubuntu 解决“无法获得锁 /var/lib/dpkg/lock -open”的方法
- js监控enter键触发函数提交数据或者登陆