Hadoop入门之Join的两种实现Demo
来源:互联网 发布:校园网络电视台介绍 编辑:程序博客网 时间:2024/06/04 11:46
需求: 订单表和商品表合到一起
order.txt(订单id, 日期, 商品编号, 数量)
1001 20150710 P0001 2
1002 20150710 P0001 3
1002 20150710 P0002 3
1003 20150710 P0003 3
product.txt(商品编号, 商品名字, 价格)
P0001 小米5 1001
P0002 锤子T1 1000
P0003 锤子 1002
这种如果在mysql中,就是一个Join的查询,那MR如何实现呢?
第一种:在Reducer中实现补全(Join)操作:
第二种:直接使用Mapper+分布式文件缓存来实现:
这种如果在mysql中,就是一个Join的查询,那MR如何实现呢?
第一种:在Reducer中实现补全(Join)操作:
package com.demo.join.one;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Reduce-side join of the order table (order.txt) and the product table
 * (product.txt), keyed by product id. Both inputs live under {@code args[0]};
 * joined rows are written to {@code args[1]}.
 *
 * @author songqinghu
 */
public class OneJoin {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // NOTE(review): HADOOP_USER_NAME is normally an environment variable /
        // JVM system property, not a Configuration key — confirm this setting
        // actually takes effect on the cluster in use.
        conf.set("HADOOP_USER_NAME", "hadoop");

        Job job = Job.getInstance(conf);
        job.setJarByClass(OneJoin.class);

        job.setMapperClass(OneJoinMapper.class);
        job.setReducerClass(OneJoinReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(OrderBean.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setNumReduceTasks(2);

        // Propagate job success/failure to the shell instead of discarding it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

/**
 * Tags every input record with its table of origin (decided from the file
 * name) and emits it keyed by product id, so order rows and the matching
 * product row meet in the same reduce call.
 *
 * <p>Caveat: a very popular product funnels all of its order rows to one
 * reducer (data skew) where they are buffered in memory — the map-side join
 * in {@code TwoJoin} avoids this.
 */
class OneJoinMapper extends Mapper<LongWritable, Text, Text, OrderBean> {

    // Reused across map() calls to avoid per-record allocation.
    private final OrderBean bean = new OrderBean();
    private final Text outKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Decide which table this record belongs to from the split's file name.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String fileName = inputSplit.getPath().getName();
        String[] infos = value.toString().split("\t");
        if (fileName.startsWith("order")) {
            // order.txt: orderId, date, productId, quantity
            bean.setAll(infos[0], infos[1], infos[2], infos[3], "", "", true);
            outKey.set(infos[2]);
        } else {
            // product.txt: productId, productName, price
            bean.setAll("", "", infos[0], "", infos[1], infos[2], false);
            outKey.set(infos[0]);
        }
        context.write(outKey, bean);
    }
}

/**
 * Buffers all order rows for one product id, picks out the single product
 * row, then completes each order with the product's name and price and emits
 * it.
 */
class OneJoinReducer extends Reducer<Text, OrderBean, OrderBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<OrderBean> values, Context context)
            throws IOException, InterruptedException {
        List<OrderBean> orders = new ArrayList<OrderBean>();
        OrderBean product = new OrderBean();
        for (OrderBean current : values) {
            if (current.isOrder()) {
                // Hadoop reuses the value instance while iterating, so each
                // buffered order must be deep-copied. The full constructor is
                // cheaper and simpler than reflective BeanUtils.cloneBean.
                orders.add(new OrderBean(current.getOrderId(), current.getDateString(),
                        current.getItemId(), current.getNumber(),
                        current.getItemName(), current.getPrice(), true));
            } else {
                product.setAll("", "", current.getItemId(), "",
                        current.getItemName(), current.getPrice(), false);
            }
        }
        // Fill in the product columns on every buffered order and emit.
        // If the product row was missing, name/price stay null (left join).
        for (OrderBean order : orders) {
            order.setItemName(product.getItemName());
            order.setPrice(product.getPrice());
            context.write(order, NullWritable.get());
        }
    }
}
package com.demo.join.one;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Hadoop {@link Writable} carrying one row of the joined output.
 *
 * <p>A bean holds either an order row (orderId, date, itemId, number;
 * {@code isOrder == true}) or a product row (itemId, itemName, price;
 * {@code isOrder == false}); the reducer merges the two into a complete row.
 *
 * <pre>
 * order.txt   (orderId, date, itemId, number)
 * product.txt (itemId, itemName, price)
 * </pre>
 *
 * @author songqinghu
 */
public class OrderBean implements Writable {

    private String orderId;
    private String dateString;
    private String itemId;      // join key shared by both tables
    private String number;
    private String itemName;
    private String price;
    private boolean isOrder;    // true if this bean came from the order table

    public OrderBean() {
    }

    public OrderBean(String orderId, String dateString, String itemId,
            String number, String itemName, String price, boolean isOrder) {
        this.orderId = orderId;
        this.dateString = dateString;
        this.itemId = itemId;
        this.number = number;
        this.itemName = itemName;
        this.price = price;
        this.isOrder = isOrder;
    }

    /** Resets every field at once so a single instance can be reused per record. */
    public void setAll(String orderId, String dateString, String itemId,
            String number, String itemName, String price, boolean isOrder) {
        this.orderId = orderId;
        this.dateString = dateString;
        this.itemId = itemId;
        this.number = number;
        this.itemName = itemName;
        this.price = price;
        this.isOrder = isOrder;
    }

    public boolean isOrder() {
        return isOrder;
    }

    public void setOrder(boolean isOrder) {
        this.isOrder = isOrder;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getDateString() {
        return dateString;
    }

    public void setDateString(String dateString) {
        this.dateString = dateString;
    }

    public String getItemId() {
        return itemId;
    }

    public void setItemId(String itemId) {
        this.itemId = itemId;
    }

    public String getNumber() {
        return number;
    }

    public void setNumber(String number) {
        this.number = number;
    }

    public String getItemName() {
        return itemName;
    }

    public void setItemName(String itemName) {
        this.itemName = itemName;
    }

    public String getPrice() {
        return price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // writeUTF throws NullPointerException on null, so a bean serialized
        // before every field is assigned (e.g. built via the no-arg
        // constructor) must fall back to the empty string. The wire format —
        // six UTF strings followed by one boolean — is unchanged.
        out.writeUTF(nullToEmpty(orderId));
        out.writeUTF(nullToEmpty(dateString));
        out.writeUTF(nullToEmpty(itemId));
        out.writeUTF(nullToEmpty(number));
        out.writeUTF(nullToEmpty(itemName));
        out.writeUTF(nullToEmpty(price));
        out.writeBoolean(isOrder);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Must read fields in exactly the order write() emitted them.
        this.orderId = in.readUTF();
        this.dateString = in.readUTF();
        this.itemId = in.readUTF();
        this.number = in.readUTF();
        this.itemName = in.readUTF();
        this.price = in.readUTF();
        this.isOrder = in.readBoolean();
    }

    @Override
    public String toString() {
        // Tab-separated form used directly as the job's text output.
        return orderId + "\t" + dateString + "\t" + itemId + "\t" + number
                + "\t" + itemName + "\t" + price;
    }

    /** Guards writeUTF against null fields. */
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }
}
第二种:直接使用Mapper+分布式文件缓存来实现:
package com.demo.join.two;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Map-side join: the small product table is shipped to every mapper via the
 * distributed cache and loaded into memory, so no shuffle/reduce phase is
 * needed and the reducer-hotspot (data skew) of the reduce-side join is
 * avoided.
 *
 * @author songqinghu
 */
public class TwoJoin {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // NOTE(review): HADOOP_USER_NAME is normally an environment variable /
        // JVM system property, not a Configuration key — confirm this setting
        // actually takes effect on the cluster in use.
        conf.set("HADOOP_USER_NAME", "hadoop");

        Job job = Job.getInstance(conf);
        job.setJarByClass(TwoJoin.class);
        job.setMapperClass(TwoJoinMapper.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Map-only job: no shuffle, no reduce phase.
        job.setNumReduceTasks(0);

        // Ship the (small) product table to every mapper's working directory.
        job.addCacheFile(new URI("file:/E:/hadoop/twojoin/item.txt"));

        // Propagate job success/failure to the shell instead of discarding it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

/**
 * Loads the cached product table (itemId -> itemName) once per task in
 * {@code setup()}, then appends the product name to every order line.
 */
class TwoJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    private final Map<String, String> items = new HashMap<String, String>();

    // Reused across map() calls to avoid per-record allocation.
    private final Text text = new Text();

    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        // The distributed cache symlinks item.txt into the task's working
        // directory, so a relative path is enough.
        // BUG FIX: the original read only the first line, so only one product
        // ever made it into the lookup map — loop over the whole file.
        // NOTE(review): lines are split on "," here while the reduce-side demo
        // splits on "\t" — confirm the actual delimiter of item.txt.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream("item.txt")))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.isEmpty()) {
                    continue; // tolerate blank lines
                }
                String[] infos = line.split(",");
                items.put(infos[0], infos[1]);
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Every mapper already holds the full product map; just look up the
        // product name by the order line's third column and append it.
        String line = value.toString();
        String[] splits = line.split(",");
        String itemName = items.get(splits[2]);
        text.set(line + "\t" + itemName);
        context.write(text, NullWritable.get());
    }
}
阅读全文
0 0
- Hadoop入门之Join的两种实现Demo
- Hadoop入门之共同好友实现Demo
- hadoop的两表join
- hadoop实现Join的几种方法
- Hadoop入门之Mapreduce过程的几个Demo
- Hadoop入门之几个Demo的加强版本
- hadoop 两表join
- Hadoop入门之Flume的几种配置和使用采集日志Demo
- oracle实现left join的两种写法
- 用MR实现Join逻辑的两种方法
- Hadoop开篇之Mapreduce实现多类别流量统计的两种实现方式
- Hadoop 中的两表join
- Hadoop 中的两表join
- hadoop两表join相关
- Hadoop 中的两表join
- Hadoop 中的两表join
- hadoop join之semi join
- hadoop join之semi join
- OpenCV2编程手册笔记之 5.3形态学滤波对图像进行开闭运算
- Java中HashMap和HashSet存储机制
- windows faster r-cnn制作自己的数据集并训练
- WebService学习(二)
- LED驱动
- Hadoop入门之Join的两种实现Demo
- 单调栈与单调队列
- sql server技术知识
- linux下tomcat配置自定义jdk
- ERROR 1205 : Lock wait timeout exceeded; try restarting transaction
- strace 跟踪进程中的系统调用
- Spring boot入门五 spring mvc spring boot mybatis配置整合
- TCP和UDP的最完整的区别
- java.security.cert.CertificateException: No name matching https证书验证不通过