Join operations

Data
------------
 [customers.txt]
 1,tom,12
 2,tom,13
 3,tom,14
 4,tom,15
 
 [orders.txt]
 1,no001,12.23,1
 2,no001,12.23,1
 3,no001,12.23,2
 4,no001,12.23,2
 5,no001,12.23,2
 6,no001,12.23,3
 7,no001,12.23,3
 8,no001,12.23,3
 9,no001,12.23,3
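 The column layout is not spelled out in the original; from the code below it appears to be
 customers = cid,name,... and orders = oid,orderNo,amount,cid (the customer id is the last
 field of an order line). Both mappers rely on the same substring idiom to pull the cid out
 of a record; a minimal illustration with values taken from the sample data above:
   // Illustration only: how the mappers below extract the cid from each record type
   String orderLine = "1,no001,12.23,1";
   String orderCid  = orderLine.substring(orderLine.lastIndexOf(",") + 1);   // "1" (last field)
   String custLine  = "1,tom,12";
   String custCid   = custLine.substring(0, custLine.indexOf(","));          // "1" (first field)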
Map-side join
---------------
 1. Create the Mapper
  package com.hdfs.mr.mapjoin;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.FSDataInputStream;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.LongWritable;
  import org.apache.hadoop.io.NullWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Mapper;
  import java.io.BufferedReader;
  import java.io.IOException;
  import java.io.InputStreamReader;
  import java.util.HashMap;
  import java.util.Map;
  /**
   * Map-side join: the full customer table is cached in memory in setup(), and each order is joined against it in map().
   */
  public class MapJoinMapper extends Mapper<LongWritable,Text,Text,NullWritable> {
   private Map<String,String> allCustomers = new HashMap<String,String>();
   //Runs once per task: load the customer table into memory
   protected void setup(Context context) throws IOException, InterruptedException {
    try {
     Configuration conf = context.getConfiguration();
     FileSystem fs = FileSystem.get(conf);
     FSDataInputStream fis = fs.open(new Path("file:///d:/mr/mapjoin/customers.txt"));
     //Wrap the stream in a buffered reader
     BufferedReader br = new BufferedReader(new InputStreamReader(fis));
     String line = null ;
     while((line = br.readLine()) != null){
      //Extract the cid (first field) and cache the whole customer line
      String cid = line.substring(0,line.indexOf(","));
      allCustomers.put(cid,line);
     }
     br.close();
    } catch (Exception e) {
     e.printStackTrace();
    }
   }
   protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    //Order record
    String line = value.toString();
    //Extract the customer id (last field of the order line)
    String cid = line.substring(line.lastIndexOf(",") + 1);
    //Order info without the trailing cid
    String orderInfo = line.substring(0,line.lastIndexOf(","));
    //Look up the cached customer record and emit customer + "," + order
    String customerInfo = allCustomers.get(cid);
    context.write(new Text(customerInfo + "," + orderInfo),NullWritable.get());
   }
  }
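  Note: setup() above reads customers.txt from a hard-coded local path, which only works because the
  job runs against the local file system (fs.defaultFS = file:///, set in the App below). A more
  portable variant ships the file with Hadoop's distributed cache; the following is a sketch only
  (the HDFS path is illustrative, not from the original, and java.net.URI must be imported):
   // Driver side: ship the customer file to every map task (path is illustrative)
   job.addCacheFile(new URI("hdfs:///data/mapjoin/customers.txt"));
   // Mapper side, inside setup():
   URI[] cacheFiles = context.getCacheFiles();
   if (cacheFiles != null && cacheFiles.length > 0) {
    FileSystem fs = FileSystem.get(cacheFiles[0], context.getConfiguration());
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(cacheFiles[0]))));
    String line;
    while ((line = br.readLine()) != null) {
     allCustomers.put(line.substring(0, line.indexOf(",")), line);
    }
    br.close();
   }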
 2. Create the App (driver)
  package com.hdfs.mr.mapjoin;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.NullWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  /**
   * MapJoinApp: driver for the map-side join (map-only job).
   */
  public class MapJoinApp {
   public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS","file:///");
    Job job = Job.getInstance(conf);
    //Configure the job
    job.setJobName("MapJoinApp");                        //job name
    job.setJarByClass(MapJoinApp.class);                 //jar search class
    //Add the input path
    FileInputFormat.addInputPath(job,new Path(args[0]));
    //Set the output path
    FileOutputFormat.setOutputPath(job,new Path(args[1]));
    //Map-only job: no reducers
    job.setNumReduceTasks(0);
    job.setMapperClass(MapJoinMapper.class);             //mapper class
    job.setMapOutputKeyClass(Text.class);                //map output key type
    job.setMapOutputValueClass(NullWritable.class);      //map output value type
    job.waitForCompletion(true);
   }
  }
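  For the sample data above, each output line is the cached customer record followed by the order
  record minus its trailing cid. For example, order 1 (which references cid 1) should come out as:
   1,tom,12,1,no001,12.23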

Reduce-side join
-----------------------
 1. Custom composite key
  package com.hdfs.mr.mapjoin.reducejoin;
  import org.apache.hadoop.io.WritableComparable;
  import java.io.DataInput;
  import java.io.DataOutput;
  import java.io.IOException;
  /**
   * Composite key: type 0 carries a customer record, type 1 carries an order record.
   */
  public class ComboKey2 implements WritableComparable<ComboKey2> {
   //0-customer 1-order
   private int type ;
   private int cid ;
   private int oid ;
   private String customerInfo = "" ;
   private String orderInfo = "" ;

   public int compareTo(ComboKey2 o) {
    int type0 = o.type ;
    int cid0= o.cid;
    int oid0 = o.oid;
    String customerInfo0 = o.customerInfo;
    String orderInfo0 = o.orderInfo ;
    //Records for the same customer?
    if(cid == cid0){
     //Same record type (two orders of one customer): sort by oid
     if(type == type0){
      return oid - oid0 ;
     }
     //A customer record and one of its orders: the customer sorts first
     else{
      if(type ==0)
       return -1 ;
      else
       return 1 ;
     }
    }
    //Different customers: sort by cid
    else{
     return cid - cid0 ;
    }
   }
   public void write(DataOutput out) throws IOException {
    out.writeInt(type);
    out.writeInt(cid);
    out.writeInt(oid);
    out.writeUTF(customerInfo);
    out.writeUTF(orderInfo);
   }
   public void readFields(DataInput in) throws IOException {
    this.type = in.readInt();
    this.cid = in.readInt();
    this.oid = in.readInt();
    this.customerInfo = in.readUTF();
    this.orderInfo = in.readUTF();
   }
   //Getters and setters used by ReduceJoinMapper, CIDPartitioner and ReduceJoinReducer
   public int getType() { return type; }
   public void setType(int type) { this.type = type; }
   public int getCid() { return cid; }
   public void setCid(int cid) { this.cid = cid; }
   public int getOid() { return oid; }
   public void setOid(int oid) { this.oid = oid; }
   public String getCustomerInfo() { return customerInfo; }
   public void setCustomerInfo(String customerInfo) { this.customerInfo = customerInfo; }
   public String getOrderInfo() { return orderInfo; }
   public void setOrderInfo(String orderInfo) { this.orderInfo = orderInfo; }
  }
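  The net effect of compareTo: within one cid, the customer record (type 0) always sorts before
  that customer's orders, and the orders sort by oid. A small illustration using the setters above:
   ComboKey2 cust = new ComboKey2();
   cust.setType(0); cust.setCid(1);
   ComboKey2 order = new ComboKey2();
   order.setType(1); order.setCid(1); order.setOid(3);
   // Same cid, different type: the customer key compares as smaller, so it sorts first
   System.out.println(cust.compareTo(order));   // -1
   System.out.println(order.compareTo(cust));   // 1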
 2. Custom partitioner
  package com.hdfs.mr.mapjoin.reducejoin;
  import org.apache.hadoop.io.NullWritable;
  import org.apache.hadoop.mapreduce.Partitioner;
  public class CIDPartitioner extends Partitioner<ComboKey2,NullWritable>{
   public int getPartition(ComboKey2 key, NullWritable nullWritable, int numPartitions) {
    return key.getCid() % numPartitions;
   }
  }
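  Partitioning on cid guarantees that a customer record and all of its orders reach the same
  reducer. The cids in the sample data are small positive integers; if keys could ever be
  negative, a defensive variant (a sketch, not in the original) would be:
   return (key.getCid() & Integer.MAX_VALUE) % numPartitions;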
 3. Create the Mapper
  package com.hdfs.mr.mapjoin.reducejoin;
  import org.apache.hadoop.io.LongWritable;
  import org.apache.hadoop.io.NullWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.InputSplit;
  import org.apache.hadoop.mapreduce.Mapper;
  import org.apache.hadoop.mapreduce.lib.input.FileSplit;
  import java.io.IOException;
  /**
   * Mapper: wraps each input line in a ComboKey2, tagged as customer (type 0) or order (type 1).
   */
  public class ReduceJoinMapper extends Mapper<LongWritable,Text,ComboKey2,NullWritable> {
   protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    //Raw input line
    String line = value.toString() ;
    //Decide from the input file path whether this line is a customer or an order
    FileSplit split = (FileSplit)context.getInputSplit();
    String path = split.getPath().toString();
    //Customer record
    ComboKey2 key2 = new ComboKey2();
    if(path.contains("customers")){
     String cid = line.substring(0,line.indexOf(","));
     String custInfo = line ;
     key2.setType(0);
     key2.setCid(Integer.parseInt(cid));
     key2.setCustomerInfo(custInfo);
    }
    //Order record
    else{
     String cid = line.substring(line.lastIndexOf(",") + 1);
     String oid = line.substring(0, line.indexOf(","));
     String oinfo = line.substring(0, line.lastIndexOf(","));
     key2.setType(1);
     key2.setCid(Integer.parseInt(cid));
     key2.setOid(Integer.parseInt(oid));
     key2.setOrderInfo(oinfo);
    }
    context.write(key2,NullWritable.get());
   }
  }
 4. Create the Reducer
  package com.hdfs.mr.mapjoin.reducejoin;
  import org.apache.hadoop.io.NullWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Reducer;
  import java.io.IOException;
  import java.util.Iterator;
  /**
   * ReduceJoinReducer: reduce-side join implementation.
   */
  public class ReduceJoinReducer extends Reducer<ComboKey2,NullWritable,Text,NullWritable> {
   protected void reduce(ComboKey2 key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
    Iterator<NullWritable> it = values.iterator();
    //The customer record sorts first within each cid group; consume it and keep its info
    it.next();
    String cinfo = key.getCustomerInfo() ;
    //Each it.next() advances the group; the framework refills the reused key object
    //with the next order record of this customer
    while(it.hasNext()){
     it.next();
     String oinfo = key.getOrderInfo();
     context.write(new Text(cinfo + "," + oinfo),NullWritable.get());
    }
   }
  }
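  With the grouping comparator defined below, one reduce() call receives every record of a single
  cid, already sorted so that the customer record comes first. For cid 3 in the sample data, the
  group is the customer line 3,tom,14 followed by orders 6 through 9, so the reducer should emit
  four joined lines such as:
   3,tom,14,6,no001,12.23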
 5. Create the sort comparator
  package com.hdfs.mr.mapjoin.reducejoin;
  
  import org.apache.hadoop.io.WritableComparable;
  import org.apache.hadoop.io.WritableComparator;
  /**
   * Sort comparator for the composite key (delegates to ComboKey2.compareTo).
   */
  public class ComboKey2Comparator extends WritableComparator {
   protected ComboKey2Comparator() {
    super(ComboKey2.class, true);
   }
   public int compare(WritableComparable a, WritableComparable b) {
    ComboKey2 k1 = (ComboKey2) a;
    ComboKey2 k2 = (ComboKey2) b;
    return k1.compareTo(k2);
   }
  }
 6. Create the grouping comparator
  package com.hdfs.mr.mapjoin.reducejoin;
  
  import org.apache.hadoop.io.WritableComparable;
  import org.apache.hadoop.io.WritableComparator;
  /**
   * Grouping comparator: keys with the same cid fall into one reduce() call.
   */
  public class CIDGroupComparator extends WritableComparator{
   protected CIDGroupComparator() {
    super(ComboKey2.class, true);
   }
   public int compare(WritableComparable a, WritableComparable b) {
    ComboKey2 k1 = (ComboKey2) a;
    ComboKey2 k2 = (ComboKey2) b;
    return k1.getCid() - k2.getCid();
   }
  }
 7. App (driver)
  package com.hdfs.mr.mapjoin.reducejoin;
  
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.IntWritable;
  import org.apache.hadoop.io.NullWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  /**
   * ReduceJoinApp: driver for the reduce-side join.
   */
  public class ReduceJoinApp {
   public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS","file:///");
    Job job = Job.getInstance(conf);
    //Configure the job
    job.setJobName("ReduceJoinApp");                        //job name
    job.setJarByClass(ReduceJoinApp.class);                 //jar search class
    //Add the input path
    FileInputFormat.addInputPath(job,new Path("D:\\mr\\reducejoin"));
    //Set the output path
    FileOutputFormat.setOutputPath(job,new Path("D:\\mr\\reducejoin\\out"));
    job.setMapperClass(ReduceJoinMapper.class);             //mapper class
    job.setReducerClass(ReduceJoinReducer.class);           //reducer class
    //Map output types
    job.setMapOutputKeyClass(ComboKey2.class);
    job.setMapOutputValueClass(NullWritable.class);
    //Reduce output types
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    //Partitioner: route records by cid
    job.setPartitionerClass(CIDPartitioner.class);
    //Grouping comparator: group values by cid
    job.setGroupingComparatorClass(CIDGroupComparator.class);
    //Sort comparator: customer record before its orders
    job.setSortComparatorClass(ComboKey2Comparator.class);
    job.setNumReduceTasks(2);                           //number of reducers
    job.waitForCompletion(true);
   }
  }
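  Since CIDPartitioner routes records by cid % numPartitions and two reducers are configured, the
  output under D:\mr\reducejoin\out should split by customer id parity (inferred from the code, not
  stated in the original); for the sample data:
   part-r-00000 : cid 2        (cid 4 has no orders, so nothing is emitted for it)
   part-r-00001 : cid 1, cid 3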