MapReduce多表关联实验

来源:互联网 发布:db2和mysql 迁移 编辑:程序博客网 时间:2024/06/06 23:19

一、实例描述

多表关联是通过对原始数据进行一定的处理,从其中挖掘出关心的信息。

 

二、数据描述

输入是两个文件:一个代表工厂表,包含工厂名列和地址编号列;另一个代表地址表,包含地址名列和地址编号列。要求从输入数据中找出工厂名和地址名的对应关系,输出"工厂名—地址名"表。

 

样例输入:

Factory:                                                                              address:

factoryname    addressed                                             addressID         addressname

Beijing Red Star        1                                                      1       Beijing

Shenzhen Thunder   3                                                      2       Guangzhou

Guangzhou Honda            2                                            3       Shenzhen

Beijing Rising            1                                                      4       Xian

Guangzhou Development Bank       2

Tencent             3

Back of Beijing          1

 

三、设计思路

多表关联和单表关联类似,都类似于数据库中的自然连接。相比单表关联,多表关联的左右表和连接列更加清楚,因此可以采用和单表关联相同的处理方式。Map识别出输入的行属于哪个表之后,对其进行分割,将连接的列值保存在key中,另一列和左右表标志保存在value中,然后输出。Reduce拿到连接结果后,解析value内容,根据标志将左右表内容分开存放,然后求笛卡尔积,最后直接输出。

 

四、程序代码

 

package HadoopShiZhang2;

 

import java.io.IOException;

import java.net.URI;

import java.util.*;

 

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// 程序已调试成功

public classMTjoin {

   public static int time = 0;

   /*多表关联和单表关联类似,它也是通过对原始数据进行一定的处理,从其中挖掘出关心的信息。如下输入的是两个文件,

    * 一个代表工厂表,包含工厂名列和地址编号列;另一个代表地址表,

    * 包含地址名列和地址编号列。要求从输入数据中找出工厂名和地址名的对应关系,输出工厂名-地址名表

     */

       static final String INPUT_PATH ="hdfs://chaoren:9000/MTjoin";

      static final String OUT_PATH ="hdfs://chaoren:9000/out";

      public static void main(String[] args)throws Exception {

         final Configuration conf =new Configuration();

         final FileSystem fileSystem =FileSystem.get(new URI(INPUT_PATH), conf);

         final Path outPath =new Path(OUT_PATH);

         if(fileSystem.exists(outPath)){

            fileSystem.delete(outPath,true);

         }

         final Job job =new Job(conf,MTjoin.class.getSimpleName());

         FileInputFormat.setInputPaths(job,INPUT_PATH);

         job.setInputFormatClass(TextInputFormat.class);

         job.setMapperClass(Map.class);

         job.setMapOutputKeyClass(Text.class);

         job.setMapOutputValueClass(Text.class);

         //job.setPartitionerClass(HashPartitioner.class);

         //job.setNumReduceTasks(1);

         //job.setCombinerClass(Reduce.class);

         job.setReducerClass(Reduce.class);

         job.setOutputKeyClass(Text.class);

         job.setOutputValueClass(Text.class);

     

         FileOutputFormat.setOutputPath(job,outPath);

         job.setOutputFormatClass(TextOutputFormat.class);

         job.waitForCompletion(true);

   }

   /**

     * map中先区分输入行属于左表还是右表,然后对两列值进行分割,

     * 保存连接列在key值,剩余列和左右表标志在value中,最后输出

     */

   publicstaticclassMapextendsMapper<Object, Text, Text, Text> {

       

      // 实现map函数

        public void map(Object key, Text value, Context context)

             throws IOException,InterruptedException {

            String line = value.toString();//每行文件

            String relationtype = new String();//左右表标识

            // 输入文件首行,不处理

            if (line.contains("factoryname") ==true || line.contains("addressed")==true){

                return;

            }

             // 输入的一行预处理文本

            StringTokenizer itr = new StringTokenizer(line);

            String mapkey = new String();

            String mapvalue = new String();

            int i = 0;

            while (itr.hasMoreTokens()) {

                // 先读取一个单词

                String token = itr.nextToken();

                // 判断该地址ID就把存到"values[0]"

                if (token.charAt(0) >='0' &&token.charAt(0) <= '9') {

                    mapkey = token;

                   if (i > 0) {

                        relationtype = "1";

                    } else {

                        relationtype = "2";

                    }

                    continue;

                }

                 // 存工厂名

                mapvalue += token + " ";

                i++;

            }

             // 输出左右表

            context.write(new Text(mapkey),new Text(relationtype + "+"+ mapvalue));

        }

   }

    /*

     * reduce解析map输出,将value中数据按照左右表分别保存,

       * 然后求出笛卡尔积,并输出。

     */

   publicstaticclassReduceextendsReducer<Text, Text, Text, Text> {

         // 实现reduce函数

        public void reduce(Text key, Iterable<Text>values, Context context)

                throws IOException, InterruptedException {

             // 输出表头

            if (0 ==time) {

                context.write(new Text("factoryname"),newText("addressname"));

                time++;

            }

             int factorynum = 0;

            String[] factory = new String[10];

            int addressnum = 0;

            String[] address = new String[10];

             Iterator ite =values.iterator();

            while (ite.hasNext()) {

                String record =ite.next().toString();

                int len = record.length();

                int i = 2;

                if (0 == len) {

                    continue;

                }

                 // 取得左右表标识

                char relationtype = record.charAt(0);

                 // 左表

                if ('1' == relationtype) {

                    factory[factorynum] =record.substring(i);

                    factorynum++;

                }

                 // 右表

                if ('2' == relationtype) {

                    address[addressnum] =record.substring(i);

                    addressnum++;

                }

            }

             // 求笛卡尔积

            if (0 != factorynum && 0 != addressnum){

                for (int m = 0; m < factorynum; m++) {

                    for (int n = 0; n < addressnum; n++) {

                        // 输出结果

                        context.write(new Text(factory[m]),

                                new Text(address[n]));

                    }

                }

           }

         }

   }

  

 

}

 

五、实验结果


运行程序后查看结果:


0 0
原创粉丝点击