MapReduce Multi-Table Join


Here is an example I found online. It comes from somewhere else, and some of the code was hard for me to follow at first.

Table 1

factoryname addressID
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Back of Beijing 1

Table 2

addressID addressname
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
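
The goal is the equivalent of a SQL join on the address ID: for every factory, look up its address name. The reduce-side trick is to make the addressID the map output key and to tag each value with the table it came from, so that all records sharing an addressID meet in the same reduce() call. For the sample data above, the grouped intermediate values would look roughly like this (the "1+"/"2+" prefix is the table tag used by the first version below; the order of values inside a group is not guaranteed):

1   [1+Beijing Red Star, 1+Beijing Rising, 1+Back of Beijing, 2+Beijing]
2   [1+Guangzhou Honda, 1+Guangzhou Development Bank, 2+Guangzhou]
3   [1+Shenzhen Thunder, 1+Tencent, 2+Shenzhen]
4   [2+Xian]   (no matching factory, so the reducer emits nothing for this key)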

package HbaseCoprocessor;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MoreTableMapReducer {

    private static int time = 0;

    public static class Map extends Mapper<Object, Text, Text, Text> {
        // The mapper first decides whether a line belongs to the left table
        // (factory) or the right table (address), then splits the two columns:
        // the join column (addressID) becomes the key, and the remaining column
        // plus a left/right table tag ("1+"/"2+") becomes the value.
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            int i = 0;
            // Skip the header line of either input file
            if (line.contains("factoryname") || line.contains("addressID")) {
                return;
            }
            // Advance i to the first digit in the line, i.e. the addressID
            while (line.charAt(i) < '0' || line.charAt(i) > '9') {
                i++;
            }
            if (line.charAt(0) < '0' || line.charAt(0) > '9') {
                // Left table: "factoryname addressID"
                int j = i - 1;
                while (line.charAt(j) != ' ') {
                    j--;
                }
                String[] values = { line.substring(0, j), line.substring(i) };
                context.write(new Text(values[1]), new Text("1+" + values[0]));
            } else {
                // Right table: "addressID addressname"
                int j = i + 1;
                while (line.charAt(j) != ' ') {
                    j++;
                }
                String[] values = { line.substring(0, j), line.substring(j + 1) };
                context.write(new Text(values[0]), new Text("2+" + values[1]));
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        // The reducer parses the mapper output, collects the values into a
        // factory list and an address list according to the table tag, and
        // then emits their cartesian product.
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            if (time == 0) {
                // Emit the header once (works because a single reducer is used)
                context.write(new Text("factoryname"), new Text("addressname"));
                time++;
            }
            int factorynum = 0;
            String[] factory = new String[10];
            int adressnum = 0;
            String[] adress = new String[10];
            Iterator<Text> iter = values.iterator();
            while (iter.hasNext()) {
                String record = iter.next().toString();
                char type = record.charAt(0);
                if (type == '1') {
                    // Left table
                    factory[factorynum] = record.substring(2);
                    factorynum++;
                } else {
                    // Right table
                    adress[adressnum] = record.substring(2);
                    adressnum++;
                }
            }
            // Cartesian product of the two lists for this addressID
            if (factorynum != 0 && adressnum != 0) {
                for (int m = 0; m < factorynum; m++) {
                    for (int n = 0; n < adressnum; n++) {
                        context.write(new Text(factory[m]), new Text(adress[n]));
                    }
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: MoreTableMapReducer <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "multi table join");
        job.setJarByClass(MoreTableMapReducer.class);
        job.setMapperClass(Map.class);
        // No combiner: the reducer's output format differs from the mapper's,
        // so using Reduce as a combiner would break the join.
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
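
Assuming the two table files have been uploaded to an HDFS input directory (the jar name and paths below are just placeholders), the job is packaged and run in the usual way:

hadoop jar mtjoin.jar HbaseCoprocessor.MoreTableMapReducer /user/hadoop/join_in /user/hadoop/join_out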
Later I worked through it myself and came up with a plainer version of my own.

The output is:

factoryname                   addressname
Back of Beijing               Beijing      1
Beijing Rising                Beijing      1
Beijing Red Star              Beijing      1
Guangzhou Development Bank    Guangzhou    2
Guangzhou Honda               Guangzhou    2
Tencent                       Shenzhen     3
Shenzhen Thunder              Shenzhen     3

(The third column is the join key, the addressID, which the reducer appends to each value.)

The code:

package HbaseCoprocessor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MTjoin {

    public static int time = 0;

    /*
     * The mapper decides whether a line belongs to the left or the right table,
     * splits the two columns, puts the join column (addressID) into the key and
     * the remaining column plus a table tag into the value.
     */
    public static class Map extends Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString(); // one input line
            // Skip the header line of either input file
            if (line.contains("factoryname") || line.contains("addressID")) {
                return;
            }
            // Columns are tab-separated in this version
            String[] arr = line.split("\t");
            // Decide which table the line comes from by looking at the first
            // field: factory names are longer than one character, addressIDs
            // are single digits
            String tableNum;
            if (arr[0].length() > 1) {
                tableNum = "1"; // factory table: factoryname <TAB> addressID
                context.write(new Text(arr[1]), new Text(tableNum + "_" + arr[0]));
            } else {
                tableNum = "2"; // address table: addressID <TAB> addressname
                context.write(new Text(arr[0]), new Text(tableNum + "_" + arr[1]));
            }
        }
    }

    /*
     * The reducer parses the mapper output, stores the values separately by
     * table tag, and then emits the cartesian product.
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Emit the header once
            if (0 == time) {
                context.write(new Text("factoryname"), new Text("addressname"));
                time++;
            }
            Iterator<Text> ite = values.iterator();
            List<String> tableList = new ArrayList<String>(); // factory names
            String str2 = ""; // address name for this addressID
            while (ite.hasNext()) {
                String record = ite.next().toString();
                // Split off the table tag
                String[] arr2 = record.split("_");
                if ("2".equals(arr2[0])) {
                    str2 = arr2[1];
                } else if ("1".equals(arr2[0])) {
                    tableList.add(arr2[1]);
                }
            }
            for (int i = 0; i < tableList.size(); i++) {
                context.write(new Text(tableList.get(i)),
                        new Text(str2 + "\t" + key.toString()));
                System.out.println(key.toString());
            }
        }
    }

    // Recursively delete the local output directory so the job can be rerun
    public static void deleteAllFilesOfDir(File path) {
        if (!path.exists())
            return;
        if (path.isFile()) {
            path.delete();
            return;
        }
        File[] files = path.listFiles();
        for (int i = 0; i < files.length; i++) {
            deleteAllFilesOfDir(files[i]);
        }
        path.delete();
    }

    public static void main(String[] args) throws Exception {
        File file = new File("F:\\hadoop\\out");
        deleteAllFilesOfDir(file);
        Configuration conf = new Configuration();
        // This line is important: it lets Hadoop run locally on Windows
        System.setProperty("hadoop.home.dir", "F:/alljar/hadoop-2.4.1-x64/hadoop-2.4.1");
        Job job = new Job(conf, "Multiple Table Join");
        job.setJarByClass(MTjoin.class);
        // Set the Map and Reduce classes
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        // Set the output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Set the input and output directories
        FileInputFormat.addInputPath(job, new Path("F:\\hadoop\\in"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\hadoop\\out"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
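
One thing to watch: this version splits each line on a tab, so the two files under F:\hadoop\in need to be tab-delimited, unlike the space-separated sample tables shown at the top. For example (file names are arbitrary, columns separated by a single tab):

factory.txt
Beijing Red Star	1
Shenzhen Thunder	3
Guangzhou Honda	2

address.txt
1	Beijing
2	Guangzhou
3	Shenzhen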

This was originally an interview question. I did think of the approach at the time, but the interviewer described the fields unclearly and I was a bit nervous, so I couldn't sort out my thinking on the spot. Writing it up afterwards, it really isn't hard, so I'm recording it here; next time I can face it calmly. I also want to try a three-table version; the logic should be much the same, and this is presumably roughly what Hive does underneath for a join.
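
For the three-table case the mapper just needs a third tag, and the reducer buffers one list per tag and takes the product of the three lists. A minimal sketch of such a reducer, assuming the mappers emit values tagged "1_", "2_" and "3_" in the same style as above (the third table and its fields are hypothetical):

    public static class Reduce3 extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // One buffer per table tag
            List<String> t1 = new ArrayList<String>();
            List<String> t2 = new ArrayList<String>();
            List<String> t3 = new ArrayList<String>();
            for (Text v : values) {
                String[] arr = v.toString().split("_", 2);
                if ("1".equals(arr[0])) {
                    t1.add(arr[1]);
                } else if ("2".equals(arr[0])) {
                    t2.add(arr[1]);
                } else {
                    t3.add(arr[1]);
                }
            }
            // Every combination that shares the same join key becomes one output row
            for (String a : t1) {
                for (String b : t2) {
                    for (String c : t3) {
                        context.write(new Text(a), new Text(b + "\t" + c));
                    }
                }
            }
        }
    }

It is the same shuffle-on-the-join-key idea; a reduce-side (common) join in Hive works along these lines.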
