Hadoop单表与多表关联

来源：互联网发布：it公司编辑：程序博客网时间：2024/06/07 06:33

在单表关联和多表关联的应用的时候，需要从数据里面挖掘出信息来进行操作。

例子给出孩子和父母的表输出孩子和爷爷的表数据如下：

亲人表childparenttomjeremtomlucyjeremxdlucyhongtonjackjcterryjcjccy样例输出：

tomxdtomhongtonjackcyterrycy从样例输出，输入可以得出需要进行单表的链接是自己链接自己，链接的是左表的parent列和右表的child列，结果除去爷孙链接即可。

首先在map阶段将输入数据分割成child和parent之后，需要将parent设为key，child设为value作为左表，而在另一组的child和parent中需要将parent 作为value child作为key，此表作为右表。为了区分左右表，需要将在输出value的时候附带上左右的区别信息，字符1代表左表字符2代表右表。map阶段就完成了左表和右表，在shuffle阶段完成链接，reduce接收链接结果，其中key对应的value-list 包含了对应的关系，取出value值进行分析，将child放入数组1 parent放入数组2 然后进行数组的笛卡尔积得出最后结果。

代码样式入下：

[java] view plaincopy
package reverseIndex;  
  
import java.io.IOException;  
import java.util.Iterator;  
import java.util.StringTokenizer;  
  
import org.apache.hadoop.conf.Configuration;  
import org.apache.hadoop.fs.Path;  
import org.apache.hadoop.io.LongWritable;  
import org.apache.hadoop.io.Text;  
import org.apache.hadoop.mapreduce.Job;  
import org.apache.hadoop.mapreduce.Mapper;  
import org.apache.hadoop.mapreduce.Reducer;  
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
  
public class singleJoin {  
    public static int time = 0; //  用于表头的输出  
    public static class Map extends Mapper<LongWritable, Text,Text,Text>{  
        public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException{  
            String childName = new String();  
            String parentName = new String();  
            String relationType = new String(); // the flag of left_table right_table  
            //handle line data  
            StringTokenizer itr = new StringTokenizer(value.toString());  
            String[] values = new String[2];  
            int i = 0;  
            while(itr.hasMoreTokens()){  
                values[i] = itr.nextToken();  
                i++;  
            }  
            //ignore the title   
            if(values[0].compareTo("child")!=0){  
                childName = values[0];  
                parentName = values[1];  
                //output the left_table  
                relationType = "1";  
                context.write(new Text(values[1]), new Text(relationType+"+"  
                                +childName+"+"+parentName));  
                  
                //output the right_table  
                relationType = "2";  
                context.write(new Text(values[0]),new Text(relationType+"+"  
                            +childName+"+"+parentName));  
            }  
        }  
    }  
    public static class Reduce extends Reducer<Text,Text,Text,Text>{  
         public void reduce(Text key,Iterable<Text>value,Context context) throws IOException, InterruptedException{  
             if(0 == time){  
                 //输出表头   
                 context.write(new Text("grandchild"),new Text("grandparent"));  
                 time++;  
             }  
             int grandchildnum = 0;  
             int grandparentnum = 0;  
             //定义存储grandchild,grandparent的数组  
             String[] grandchild = new String[20];  
             String[] grandparent = new String[20];  
             Iterator<Text> it = value.iterator();  
             while(it.hasNext()){  
                 String record = it.next().toString();  
                 int len = record.length();  
                 int i=2;  
                 if(0==len){  
                     continue;  
                 }  
                 char relationType = record.charAt(0);  
                 String childName = new String();  
                 String parentName = new String();  
                 while(record.charAt(i)!='+'){  
                     childName += record.charAt(i);  
                     i++;  
                 }  
                 i += 1;  
                 while(i<len){  
                     parentName += record.charAt(i);  
                     i++;  
                 }  
                 if('1'==relationType){  
                     grandchild[grandchildnum] = childName;  
                     grandchildnum++;  
                       
                 }  
                 if('2'==relationType){  
                     grandparent[grandparentnum] = parentName;  
                     grandparentnum++;  
                 }  
             }//end while 结束一行数据  
                 if(0!=grandchildnum && 0!=grandparentnum){  
                     for(int m=0;m<grandchildnum;m++){  
                         for(int n=0;n<grandparentnum;n++){  
                             context.write(new Text(grandchild[m]),new Text(grandparent[m]));  
                         }  
                     }  
                 }  
             }//end reduce  
    }//end Reduce  
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {  
        // TODO Auto-generated method stub  
        Configuration conf = new Configuration();  
        Job job = new Job(conf,"singleJoin");  
        job.setJarByClass(singleJoin.class);  
        job.setMapperClass(Map.class);  
        job.setReducerClass(Reduce.class);  
          
        job.setOutputKeyClass(Text.class);  
        job.setOutputValueClass(Text.class);  
          
        FileInputFormat.addInputPath(job,new Path(args[0]));  
        FileOutputFormat.setOutputPath(job, new Path(args[1]));  
        System.exit(job.waitForCompletion(true)?0:1);  
    }  
}  

多表链接：输入2个文件，一个学生表包含学生名和课程编号，另一个是教师名和课程编号，输出学生相对应的教师。教师1cy2ld3cy4ld

xd1xyy2lf3zz3输出：xdcyxyyldlfcyzzcy这种多表链接关系类似于关系型数据库的多表之间的查询，链接的join语句。

解决方法与上面一致：（左右表）

代码如下：

[java] view plaincopy
package reverseIndex;  
  
import java.io.IOException;  
import java.util.Iterator;  
import java.util.StringTokenizer;  
import org.apache.hadoop.conf.Configuration;  
import org.apache.hadoop.fs.Path;  
import org.apache.hadoop.io.LongWritable;  
import org.apache.hadoop.io.Text;  
import org.apache.hadoop.mapreduce.Job;  
import org.apache.hadoop.mapreduce.Mapper;  
import org.apache.hadoop.mapreduce.Reducer;  
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
  
public class multiJoin {  
    public static int time = 0;  
    public static class Map extends Mapper<LongWritable,Text,Text,Text>{  
        public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException{  
            String mapKey = new String();  
            String mapValue = new String();  
            String line = value.toString();  
            String relationType = new String();  
            StringTokenizer it = new StringTokenizer(line);  
            if(line.contains("studentName")||line.contains("courseId")){  
                return;  
            }  
            int i=0;  
            while(it.hasMoreTokens()){  
                String token = it.nextToken();  
                //如果是课程编号在前，就将作为右表，若是人名在前面就是作为左表  
                if(token.charAt(0)>='0'&&token.charAt(0)<='9'){  
                    mapKey = token;  
                    if(i>0){  
                        relationType = "1";  
                    }else{  
                        relationType = "2";  
                    }  
                    continue;  
                }  
                mapValue += token+" ";  
                i++;  
            }  
            //完后左右表的输出  
            context.write(new Text(mapKey),new Text(relationType+"+"+mapValue));  
        }  
    }  
public static class Reduce extends Reducer<Text,Text,Text,Text>{  
    public void reduce(Text key,Iterable<Text>values,Context context) throws IOException, InterruptedException{  
        if(time == 0){  
            context.write(new Text("student"),new Text("teacher"));  
            time++;  
        }  
        int studentnum = 0;  
        int teachernum = 0;  
        String[] student = new String[10];  
        String[] teacher = new String[10];  
        Iterator<Text> it = values.iterator();  
        while(it.hasNext()){  
            String record = it.next().toString();  
            int len = record.length();  
            int i=2;//忽略relationType和+号  
            if(0 == len){  
                return;  
            }  
            char relationType = record.charAt(0);  
            if('1' == relationType){  
                student[studentnum] = record.substring(i);  
                studentnum++;  
            }  
            if('2'==relationType){  
                teacher[teachernum] = record.substring(i);  
                teachernum++;  
            }  
        }  
        if(0!=studentnum && 0!=teachernum){  
            for(int m=0;m<studentnum;m++){  
                for(int n=0;n<teachernum;n++){  
                    context.write(new Text(student[m]), new Text(teacher[n]));  
                }  
            }  
        }  
          
    }  
}  
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {  
        // TODO Auto-generated method stub  
        Configuration conf = new Configuration();  
        Job job = new Job(conf,"multiJoin");  
        job.setJarByClass(multiJoin.class);  
        job.setMapperClass(Map.class);  
        job.setReducerClass(Reduce.class);  
          
        job.setOutputKeyClass(Text.class);  
        job.setOutputValueClass(Text.class);  
          
        FileInputFormat.addInputPath(job,new Path(args[0]));  
        FileOutputFormat.setOutputPath(job, new Path(args[1]));  
        System.exit(job.waitForCompletion(true)?0:1);  
          
    }  
}  

0 0