Hadoop Learning: Single-Table Join


This example mines information implicit in the original data: given child-parent relationships, it derives grandchild-grandparent relationships. In other words, from a child-parent table we compute the grandchild-grandparent table, which amounts to joining the table with itself (a self-join).

The input data is as follows:

family.txt:

child parent
Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma


The expected output is:

grandchild grandparent
Tom Jesse
Tom Alice
Jone Jesse
Jone Alice
Jone Ben
Jone Mary
Tom Ben
Tom Mary
Philip Alice
Philip Jesse
Mark Alice
Mark Jesse

1. Design Approach

In the map phase, each input line, e.g. Tom Lucy, is processed into two records: <Tom, 1+Tom+Lucy> keyed on the child, and <Lucy, 2+Tom+Lucy> keyed on the parent. Both are written to the context and handed to the reduce phase. The tag 1 marks a record whose key is the child; the tag 2 marks a record whose key is the parent.
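To make the emission concrete, here is a minimal standalone sketch that replays this step for the line Tom Lucy outside of Hadoop (the class name MapEmitDemo is invented for illustration; the real mapper appears in section 2):

public class MapEmitDemo {
  public static void main(String[] args) {
    String line = "Tom Lucy";                // one input record: "child parent"
    String[] f = line.split("\\s+");
    String child = f[0], parent = f[1];
    // Tag "1": the key is the child of this record.
    System.out.println(child + "\t1+" + child + "+" + parent);
    // Tag "2": the key is the parent of this record.
    System.out.println(parent + "\t2+" + child + "+" + parent);
  }
}

Running it prints the two tagged records, Tom 1+Tom+Lucy and Lucy 2+Tom+Lucy, exactly as described above.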

After the map phase, the intermediate data therefore looks like this:

Alice 2+Jack+Alice
Alice 2+Terry+Alice
Alma 2+Philip+Alma
Alma 2+Mark+Alma
Ben 2+Lucy+Ben
Jack 1+Jack+Jesse
Jack 2+Tom+Jack
Jack 2+Jone+Jack
Jack 1+Jack+Alice
Jesse 2+Terry+Jesse
Jesse 2+Jack+Jesse
Jone 1+Jone+Jack
Jone 1+Jone+Lucy
Lucy 1+Lucy+Ben
Lucy 2+Jone+Lucy
Lucy 2+Tom+Lucy
Lucy 1+Lucy+Mary
Mark 1+Mark+Alma
Mark 1+Mark+Terry
Mary 2+Lucy+Mary
Philip 1+Philip+Terry
Philip 1+Philip+Alma
Terry 2+Philip+Terry
Terry 1+Terry+Alice
Terry 2+Mark+Terry
Terry 1+Terry+Jesse
Tom 1+Tom+Jack
Tom 1+Tom+Lucy

The reduce phase then works as follows:

For a given key, if the values contain both 1+*+* records and 2+*+* records, each pairing of a 2-tagged record's child with a 1-tagged record's parent yields one grandchild-grandparent pair.

For example, for key = Jack the reducer receives:

Jack 1+Jack+Jesse
Jack 2+Tom+Jack
Jack 2+Jone+Jack
Jack 1+Jack+Alice

Jack has a child Tom, and Jesse is one of Jack's parents; therefore Tom and Jesse form a grandchild-grandparent pair.

Jack has two children in total, Tom and Jone, and two parents, Jesse and Alice, so four grandchild-grandparent pairs can be derived here:

Tom Jesse
Tom Alice
Jone Jesse
Jone Alice
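
This cross-product step can be verified in isolation. Below is a minimal standalone sketch that replays the Jack example outside of Hadoop (the class name JackJoinDemo is invented for illustration; the real reducer appears in section 2):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class JackJoinDemo {
  public static void main(String[] args) {
    // The four tagged values the reducer receives for key "Jack".
    List<String> values = Arrays.asList(
        "1+Jack+Jesse", "2+Tom+Jack", "2+Jone+Jack", "1+Jack+Alice");
    List<String> grandchildren = new ArrayList<String>();
    List<String> grandparents = new ArrayList<String>();
    for (String v : values) {
      String[] parts = v.split("\\+");   // [tag, child, parent]
      if (parts[0].equals("2")) {
        grandchildren.add(parts[1]);     // key is the parent: record's child is a grandchild
      } else if (parts[0].equals("1")) {
        grandparents.add(parts[2]);      // key is the child: record's parent is a grandparent
      }
    }
    // Cross product: 2 grandchildren x 2 grandparents = 4 pairs.
    for (String gc : grandchildren)
      for (String gp : grandparents)
        System.out.println(gc + " " + gp);
  }
}

Its output is the same four pairs listed above.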

2. Program Code

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class STjoin {

  public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      // Split the line into "child parent".
      StringTokenizer itr = new StringTokenizer(value.toString());
      String[] values = new String[2];
      int i = 0;
      while (itr.hasMoreTokens() && i < 2) {
        values[i++] = itr.nextToken();
      }
      // Skip the "child parent" header line and malformed lines.
      if (i == 2 && !values[0].equals("child")) {
        // Tag "1": the key is the child of this record.
        context.write(new Text(values[0]),
            new Text("1+" + values[0] + "+" + values[1]));
        // Tag "2": the key is the parent of this record.
        context.write(new Text(values[1]),
            new Text("2+" + values[0] + "+" + values[1]));
      }
    }
  }

  public static class JoinReducer extends Reducer<Text, Text, Text, Text> {

    private int count = 0;

    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      // Write the header row once per reduce task (the job runs a single reducer by default).
      if (count == 0) {
        context.write(new Text("grandchild"), new Text("grandparent"));
        count++;
      }
      List<String> grandchild = new ArrayList<String>();
      List<String> grandparent = new ArrayList<String>();
      for (Text value : values) {
        String record = value.toString();
        if (record.length() == 0) {
          continue;
        }
        String[] strArr = record.split("\\+");  // [tag, child, parent]
        if (strArr[0].equals("2")) {
          // Key is the parent of this record: the record's child is a grandchild candidate.
          grandchild.add(strArr[1]);
        } else if (strArr[0].equals("1")) {
          // Key is the child of this record: the record's parent is a grandparent candidate.
          grandparent.add(strArr[2]);
        }
      }
      // Cross product: every collected grandchild pairs with every collected grandparent.
      for (String gc : grandchild) {
        for (String gp : grandparent) {
          context.write(new Text(gc), new Text(gp));
        }
      }
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: STjoin <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "STjoin");
    job.setJarByClass(STjoin.class);
    job.setMapperClass(TokenizerMapper.class);
    // No combiner: the join requires all tagged records for a key to meet in one reduce call.
    job.setReducerClass(JoinReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
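
A typical way to compile and run the job is sketched below; the jar name and HDFS paths are placeholders, not taken from the original post:

javac -classpath "$(hadoop classpath)" STjoin.java
jar cf STjoin.jar STjoin*.class
hadoop jar STjoin.jar STjoin /user/hadoop/family /user/hadoop/stjoin-out
hadoop fs -cat /user/hadoop/stjoin-out/part-r-00000

The part-r-00000 file should begin with the grandchild grandparent header row, followed by the twelve pairs listed at the start of this post.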


References:

Hadoop in Action (《Hadoop实战》), by Lu Jiaheng (陆嘉恒)


