MapReduce---连接操作--map端连接

来源：互联网　发布：python 随机整数　编辑：程序博客网　时间：2024/06/18 18:42

在项目开发中，经常要实现两个“表”的 join 操作，其中一个表数据量小，另一个表很大。这种场景在实际中非常常见，比如“订单日志” join “产品信息”。此时适合采用 map 端连接。

原理：适用于大表 + 小表(载入内存)。

setup() 在 map() 之前执行，把小表文件加载到内存，形成一个 Map（以每行第一个字段为 key，整行为 value）。

由于连接在 map 端完成，作业不需要 reduce 阶段（reduce 任务数设为 0），可以大大提高 join 操作的并发度，加快处理速度。

1、JoinMapper

package hadoop.join.map;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

/**
 * Mapper for a map-side join.
 *
 * <p>Before any record is mapped, {@link #setup} reads the file named by the
 * {@code customers.path} configuration property into an in-memory map keyed by
 * the first comma-separated field of each line.
 *
 * <p>NOTE(review): the published listing ends after {@code setup()}; the
 * {@code map()} override that would look each input record up in
 * {@code customers} is not part of this listing and still has to be supplied.
 */
public class JoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /** First field of each customers line -> the complete line. */
    private Map<String, String> customers;

    /**
     * Loads the customers file into memory; runs once per task before map().
     *
     * @param context task context; its configuration must define {@code customers.path}
     * @throws IOException if the customers file cannot be opened or read
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        customers = new HashMap<String, String>();
        String path = context.getConfiguration().get("customers.path");
        FileSystem fs = FileSystem.get(context.getConfiguration());

        // try-with-resources closes the stream and reader even when reading
        // fails; the FileSystem handle itself is left open, as before.
        try (FSDataInputStream in = fs.open(new Path(path));
             BufferedReader br = new BufferedReader(
                     new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Take everything before the first comma as the key. Unlike
                // split(",")[0], this does not throw on a line that consists
                // only of commas.
                int comma = line.indexOf(',');
                String id = comma < 0 ? line : line.substring(0, comma);
                customers.put(id, line);
            }
        }
    }
}

2、App
package hadoop.join.map;import com.it18zhang.hadoop.lean.key.DataLeanMapper1;import com.it18zhang.hadoop.lean.key.DataLeanMapper2;import com.it18zhang.hadoop.lean.key.DataLeanReducer1;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;/** * join:map端连接 */public class App {public static void main(String[] args) throws Exception {args = new String[]{"d:/java/mr/join/orders.txt", "d:/java/mr/out", "d:/java/mr/join/customers.txt" } ;Configuration conf = new Configuration();conf.set("customers.path",args[2]);FileSystem fs = FileSystem.get(conf);if(fs.exists(new Path(args[1]))){fs.delete(new Path(args[1]),true);}Job job = Job.getInstance(conf);job.setJobName("join-map");job.setJarByClass(App.class);job.setMapperClass(JoinMapper.class);//添加输入路径FileInputFormat.addInputPath(job,new Path(args[0]));//设置输出路径FileOutputFormat.setOutputPath(job,new Path(args[1]));//设置mapreduce输出job.setMapOutputKeyClass(Text.class);job.setMapOutputValueClass(NullWritable.class);job.setNumReduceTasks(0);//第一个阶段(job)job.waitForCompletion(true) ;}}

阅读全文

0 0