MapReduce --- Join Operations --- Reduce-Side Join


Because a reduce-side join does not require the input datasets to conform to any particular structure, it is more commonly used than a map-side join. However, both datasets being joined must pass through the MapReduce shuffle, so a reduce-side join tends to be inefficient.

Basic idea: the mapper tags each record with its source and uses the join key as the map output key, so that records sharing the same key are sent to the same reducer.
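For illustration, assume two hypothetical input files whose column layout matches what the mapper in section 2 expects (these sample records are not from the original post, only an assumed shape):

    customers.txt  ->  cid,name,age         e.g.  1,tom,12
    orders.txt     ->  oid,item,price,cid   e.g.  1001,iphone,999.0,1

The join key is the customer id: column 0 of a customer line and column 3 of an order line.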

1. Define the composite key CombKey

package hadoop.join.reduce;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class CombKey implements WritableComparable<CombKey> {

    // record type: 0 - customer, 1 - order
    public int type = -1;
    // customer id
    public int cid = -1;
    // order id
    public int oid = -1;

    /**
     * Sort order: a customer key sorts before the order keys with the same cid;
     * orders of the same customer sort by oid.
     */
    public int compareTo(CombKey o) {
        int otype = o.type;
        int ocid = o.cid;
        int ooid = o.oid;
        // same type
        if (type == otype) {
            if (type == 0) {
                return cid - ocid;
            } else {
                // orders of the same customer
                if (cid == ocid) {
                    return oid - ooid;
                }
                // orders of different customers
                else {
                    return cid - ocid;
                }
            }
        }
        // different types
        else {
            // this key is a customer
            if (type == 0) {
                // is the other key an order of this customer?
                if (cid == ocid) {
                    return -1;
                } else {
                    return cid - ocid;
                }
            } else {
                if (cid == ocid) {
                    return 1;
                } else {
                    return cid - ocid;
                }
            }
        }
    }

    public void write(DataOutput out) throws IOException {
        out.writeInt(type);
        out.writeInt(cid);
        out.writeInt(oid);
    }

    public void readFields(DataInput in) throws IOException {
        this.type = in.readInt();
        this.cid = in.readInt();
        this.oid = in.readInt();
    }
}
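A minimal sketch of how this ordering behaves, using the hypothetical ids from the sample data above (CombKeyOrderDemo is an illustration, not part of the original job):

package hadoop.join.reduce;

// Illustration only: shows that a customer key sorts before its own order keys.
public class CombKeyOrderDemo {
    public static void main(String[] args) {
        CombKey cust = new CombKey();
        cust.type = 0;
        cust.cid = 1;

        CombKey order = new CombKey();
        order.type = 1;
        order.cid = 1;
        order.oid = 1001;

        // Because the customer record sorts first within a cid, the reducer
        // can read the customer line before any of that customer's orders.
        System.out.println(cust.compareTo(order) < 0); // true
        System.out.println(order.compareTo(cust) > 0); // true
    }
}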



2. JoinMapper

package hadoop.join.reduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * Mapper: tags each record with its source (customer or order)
 * and uses the customer id as the join key in the output key.
 */
public class JoinMapper extends Mapper<LongWritable, Text, CombKey, Text> {

    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // one line of text
        String line = value.toString();
        String[] arr = line.split(",");

        // decide the record source from the input file name
        FileSplit split = (FileSplit) context.getInputSplit();
        String path = split.getPath().getName();

        CombKey keyOut = new CombKey();
        // customer record
        if (path.contains("customers")) {
            keyOut.type = 0;
            keyOut.cid = Integer.parseInt(arr[0]);
        }
        // order record
        else {
            keyOut.type = 1;
            keyOut.cid = Integer.parseInt(arr[3]);
            keyOut.oid = Integer.parseInt(arr[0]);
        }
        context.write(keyOut, value);
    }
}
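With the hypothetical sample lines above, the pairs the mapper sends into the shuffle would look like this (a sketch, not captured job output):

    (CombKey{type=0, cid=1, oid=-1},   "1,tom,12")               from customers.txt
    (CombKey{type=1, cid=1, oid=1001}, "1001,iphone,999.0,1")     from orders.txt

Both keys carry cid=1, so the partitioner and grouping comparator defined below route them into the same reduce group.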

3. JoinReducer

package hadoop.join.reduce;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

/**
 * Reducer: within one group the customer record (if any) comes first,
 * so it can be joined with every order record that follows.
 */
public class JoinReducer extends Reducer<CombKey, Text, Text, NullWritable> {

    protected void reduce(CombKey key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        System.out.println("================================");
        Iterator<Text> it = values.iterator();
        // the group starts with a customer record
        if (key.type == 0) {
            // take the customer info first
            String custInfo = it.next().toString();
            System.out.println(custInfo);
            while (it.hasNext()) {
                String orderInfo = it.next().toString();
                System.out.println(custInfo + "," + orderInfo);
                context.write(new Text(custInfo + "," + orderInfo), NullWritable.get());
            }
        }
        // orders without a matching customer record
        else {
            while (it.hasNext()) {
                String orderInfo = it.next().toString();
                System.out.println("NULL," + orderInfo);
                context.write(new Text("NULL," + orderInfo), NullWritable.get());
            }
        }
    }
}
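Continuing the hypothetical example: CombKey.compareTo guarantees the customer record arrives first in the cid=1 group, so the reducer would emit

    1,tom,12,1001,iphone,999.0,1

while an order whose cid has no matching customer record would be emitted as NULL,<order line>.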


4. Custom CIDPartitioner

package hadoop.join.reduce;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitions map output by customer id so that a customer record and
 * all of its orders go to the same reducer.
 */
public class CIDPartitioner extends Partitioner<CombKey, Text> {

    public int getPartition(CombKey key, Text text, int numPartitions) {
        return key.cid % numPartitions;
    }
}
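key.cid % numPartitions works as long as cid is never negative; a negative id would yield a negative partition index and fail the job. A defensive variant could mask the sign bit (SafeCIDPartitioner is a hypothetical name, not part of the original post):

package hadoop.join.reduce;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical defensive variant: masks the sign bit so a negative cid
// can never produce a negative partition index.
public class SafeCIDPartitioner extends Partitioner<CombKey, Text> {
    @Override
    public int getPartition(CombKey key, Text value, int numPartitions) {
        return (key.cid & Integer.MAX_VALUE) % numPartitions;
    }
}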


5. Custom CIDGroupComparator

package hadoop.join.reduce;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Grouping comparator that compares keys by cid only, so a customer and
 * all of its orders are handed to a single reduce() call.
 */
public class CIDGroupComparator extends WritableComparator {

    protected CIDGroupComparator() {
        super(CombKey.class, true);
    }

    public int compare(WritableComparable k1, WritableComparable k2) {
        CombKey ck1 = (CombKey) k1;
        CombKey ck2 = (CombKey) k2;
        return ck1.cid - ck2.cid;
    }
}
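A minimal sketch showing that this comparator treats a customer key and an order key with the same cid as equal, which is what pulls them into one reduce() call (CIDGroupComparatorDemo is an illustration, not part of the original job):

package hadoop.join.reduce;

// Illustration only: keys with the same cid compare as equal for grouping.
public class CIDGroupComparatorDemo {
    public static void main(String[] args) {
        CIDGroupComparator cmp = new CIDGroupComparator();

        CombKey cust = new CombKey();
        cust.type = 0;
        cust.cid = 1;

        CombKey order = new CombKey();
        order.type = 1;
        order.cid = 1;
        order.oid = 1001;

        System.out.println(cmp.compare(cust, order)); // 0 -> same reduce group
    }
}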


6. App (job driver)

package hadoop.join.reduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * join: driver for the reduce-side join job.
 */
public class App {

    public static void main(String[] args) throws Exception {
        args = new String[]{"d:/java/mr/join", "d:/java/mr/out"};

        Configuration conf = new Configuration();

        // remove the output directory if it already exists
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }

        Job job = Job.getInstance(conf);
        job.setJobName("join-reduce");
        job.setJarByClass(App.class);

        job.setMapperClass(JoinMapper.class);
        job.setReducerClass(JoinReducer.class);

        // add input path
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // set output path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // map and reduce output types
        job.setMapOutputKeyClass(CombKey.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // group by cid, partition by cid
        job.setGroupingComparatorClass(CIDGroupComparator.class);
        job.setPartitionerClass(CIDPartitioner.class);
        job.setNumReduceTasks(2);

        // first stage (job)
        job.waitForCompletion(true);
    }
}
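Note that the driver overwrites args with the local paths d:/java/mr/join and d:/java/mr/out, so it is meant to be run from the IDE against the local filesystem. Assuming the hypothetical sample data above, the two reduce tasks produce part-r-00000 and part-r-00001; CIDPartitioner sends cid=1 to partition 1 % 2 = 1, so the joined line 1,tom,12,1001,iphone,999.0,1 would appear in part-r-00001.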



