MapReduce Practice Exercise 2: Joining Two Tables
Requirement:

Order table t_order:

id      date        pid     amount
1001    20150710    P0001   2
1002    20150710    P0001   3
1002    20150710    P0002   3
Product table t_product:

id      pname     category_id   price
P0001   小米5     1000          2
P0002   锤子T1    1000          3
Suppose the data volume is huge and both tables are stored as files in HDFS. Write a MapReduce program that implements the following SQL query (fields in the test files are comma-separated; note the product name column is pname, not name):

select a.id, a.date, b.pname, b.category_id, b.price from t_order a join t_product b on a.pid = b.id
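For example, the two tables could sit in the HDFS input directory as two comma-separated files. The file names here are an assumption (anything works for the product file, but the mapper below relies on order file names starting with "order"):

order.txt:
1001,20150710,P0001,2
1002,20150710,P0001,3
1002,20150710,P0002,3

product.txt:
P0001,小米5,1000,2
P0002,锤子T1,1000,3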
The InfoBean class encapsulates the fields of both tables in a single Writable, with a flag marking which table a record came from:
package com.bpf.mr.rjoin;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class InfoBean implements Writable {

    private int order_id;
    private String dateString;
    private String p_id;
    private int amount;
    private String pname;
    private int category_id;
    private float price;

    // flag == 0: this object holds an order record
    // flag == 1: this object holds a product record
    private int flag;

    public InfoBean() {}

    public void set(int order_id, String dateString, String p_id, int amount,
                    String pname, int category_id, float price, int flag) {
        this.order_id = order_id;
        this.dateString = dateString;
        this.p_id = p_id;
        this.amount = amount;
        this.pname = pname;
        this.category_id = category_id;
        this.price = price;
        this.flag = flag;
    }

    public int getOrder_id() { return order_id; }
    public void setOrder_id(int order_id) { this.order_id = order_id; }
    public String getDateString() { return dateString; }
    public void setDateString(String dateString) { this.dateString = dateString; }
    public String getP_id() { return p_id; }
    public void setP_id(String p_id) { this.p_id = p_id; }
    public int getAmount() { return amount; }
    public void setAmount(int amount) { this.amount = amount; }
    public String getPname() { return pname; }
    public void setPname(String pname) { this.pname = pname; }
    public int getCategory_id() { return category_id; }
    public void setCategory_id(int category_id) { this.category_id = category_id; }
    public float getPrice() { return price; }
    public void setPrice(float price) { this.price = price; }
    public int getFlag() { return flag; }
    public void setFlag(int flag) { this.flag = flag; }

    // Deserialization must read the fields in exactly the order write() serialized them
    @Override
    public void readFields(DataInput in) throws IOException {
        this.order_id = in.readInt();
        this.dateString = in.readUTF();
        this.p_id = in.readUTF();
        this.amount = in.readInt();
        this.pname = in.readUTF();
        this.category_id = in.readInt();
        this.price = in.readFloat();
        this.flag = in.readInt();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(order_id);
        out.writeUTF(dateString);
        out.writeUTF(p_id);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeInt(category_id);
        out.writeFloat(price);
        out.writeInt(flag);
    }

    @Override
    public String toString() {
        return "order_id=" + order_id + ", dateString=" + dateString + ", p_id=" + p_id
                + ", amount=" + amount + ", pname=" + pname + ", category_id=" + category_id
                + ", price=" + price + ", flag=" + flag;
    }
}
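Since Hadoop serializes map output values with write() and rebuilds them on the reduce side with readFields(), the two methods must agree on field order. A minimal round-trip sketch to illustrate the contract (not from the original post; the class name is made up, and it assumes InfoBean is on the classpath):

package com.bpf.mr.rjoin;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class InfoBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        InfoBean original = new InfoBean();
        original.set(1001, "20150710", "P0001", 2, "", 0, 0, 0);

        // Serialize the bean the way Hadoop does between map and reduce
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buf));

        // Deserialize into a fresh bean; the fields come back in write() order
        InfoBean copy = new InfoBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
        System.out.println(copy); // order_id=1001, dateString=20150710, p_id=P0001, ...
    }
}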
The Rjoin class holds the mapper, the reducer, and the driver. The mapper keys every record by product id; the reducer then receives, for each pid, the product record plus all matching orders, and stitches them together:

package com.bpf.mr.rjoin;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Rjoin {

    static class RjoinMapper extends Mapper<LongWritable, Text, Text, InfoBean> {

        InfoBean bean = new InfoBean();
        Text t = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            FileSplit split = (FileSplit) context.getInputSplit();
            String name = split.getPath().getName();
            String pid = "";

            // Determine which table this record comes from by the input file name
            if (name.startsWith("order")) {
                String[] field = line.split(",");
                bean.set(Integer.parseInt(field[0]), field[1], field[2],
                        Integer.parseInt(field[3]), "", 0, 0, 0);
                pid = field[2];
            } else {
                String[] field = line.split(",");
                bean.set(0, "", field[0], 0, field[1],
                        Integer.parseInt(field[2]), Float.parseFloat(field[3]), 1);
                pid = field[0];
            }
            // Key by product id so records from both tables meet in one reduce call
            t.set(pid);
            context.write(t, bean);
        }
    }

    static class RjoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {

        @Override
        protected void reduce(Text pid, Iterable<InfoBean> beans, Context context)
                throws IOException, InterruptedException {
            // Each pid corresponds to one product record and possibly many orders
            InfoBean pdBean = new InfoBean();
            ArrayList<InfoBean> orderBeans = new ArrayList<InfoBean>();

            for (InfoBean infoBean : beans) {
                if (infoBean.getFlag() == 1) {
                    // Product record: keep a copy (Hadoop reuses the iterated object)
                    try {
                        BeanUtils.copyProperties(pdBean, infoBean);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                } else {
                    // Order record: copy into a fresh bean and collect it
                    InfoBean orderBean = new InfoBean();
                    try {
                        BeanUtils.copyProperties(orderBean, infoBean);
                        orderBeans.add(orderBean);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            }

            // Stitch the product fields onto every order to form the join result
            for (InfoBean bean : orderBeans) {
                bean.setPname(pdBean.getPname());
                bean.setCategory_id(pdBean.getCategory_id());
                bean.setPrice(pdBean.getPrice());
                context.write(bean, NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Rjoin.class);
        job.setMapperClass(RjoinMapper.class);
        job.setReducerClass(RjoinReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(InfoBean.class);
        job.setOutputKeyClass(InfoBean.class);
        job.setOutputValueClass(NullWritable.class);

        // For convenience while testing: delete the output directory if it exists
        Path outPath = new Path("hdfs://Master:9000/output");
        FileSystem fs = FileSystem.get(new URI("hdfs://Master:9000"), conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        // Input and output paths are directories, not individual files
        FileInputFormat.setInputPaths(job, "hdfs://Master:9000/bpf");
        FileOutputFormat.setOutputPath(job, outPath);

        job.waitForCompletion(true);
    }
}
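Packaged into a jar (the jar name below is illustrative), the job can be submitted with:

hadoop jar rjoin.jar com.bpf.mr.rjoin.Rjoin

With the sample files above, the output directory should contain one joined line per order, printed via InfoBean.toString(). The ordering of lines within a product group is not guaranteed, but the content should be:

order_id=1001, dateString=20150710, p_id=P0001, amount=2, pname=小米5, category_id=1000, price=2.0, flag=0
order_id=1002, dateString=20150710, p_id=P0001, amount=3, pname=小米5, category_id=1000, price=2.0, flag=0
order_id=1002, dateString=20150710, p_id=P0002, amount=3, pname=锤子T1, category_id=1000, price=3.0, flag=0

Two caveats about this reduce-side join: an order whose pid has no matching product record is emitted with empty product fields, and all orders for one pid are buffered in memory, so a heavily skewed pid can pressure the reducer.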