MapReduce Multi-Table Join
Here is an example I found online; at first some of the code was hard to follow.
Table 1
factoryname addressID
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Back of Beijing 1
Table 2
addressID addressname
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
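The goal is an inner join of the two tables on the address ID: each factory row carries an addressID, and Table 2 maps that ID to a name, so Beijing Red Star (ID 1) joins to Beijing. An ID with no factory on the left side, such as 4 (Xian), produces no output row.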
package HbaseCoprocessor;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MoreTableMapReducer {

    private static int time = 0;

    public static class Map extends Mapper<Object, Text, Text, Text> {
        // In map, first decide whether the input line belongs to the left or
        // the right table, then split its two columns: the join column goes
        // into the key, the remaining column plus a left/right flag into the
        // value.
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // Skip the header line of either input file
            if (line.contains("factoryname") || line.contains("addressID")) {
                return;
            }
            // Find the first digit, which marks the split point
            // (bug fix: the original loop condition was inverted and never
            // advanced past a digit)
            int i = 0;
            while (i < line.length()
                    && (line.charAt(i) < '0' || line.charAt(i) > '9')) {
                i++;
            }
            if (i == line.length()) {
                return; // no digit at all: malformed line
            }
            if (i > 0) { // digit at the end: left table "factoryname addressID"
                int j = i - 1;
                while (line.charAt(j) != ' ') j--;
                String[] values = { line.substring(0, j), line.substring(i) };
                context.write(new Text(values[1]), new Text("1+" + values[0]));
            } else { // digit at the start: right table "addressID addressname"
                int j = i + 1;
                while (line.charAt(j) != ' ') j++;
                // bug fix: skip the separating space before the address name
                String[] values = { line.substring(0, j), line.substring(j + 1) };
                context.write(new Text(values[0]), new Text("2+" + values[1]));
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        // reduce parses the map output, collects the values into left- and
        // right-table arrays, then emits their Cartesian product
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            if (time == 0) { // header, written once (single-reducer demo only)
                context.write(new Text("factoryname"), new Text("addressname"));
                time++;
            }
            int factorynum = 0;
            String[] factory = new String[10];
            int adressnum = 0;
            String[] adress = new String[10];
            Iterator<Text> iter = values.iterator();
            while (iter.hasNext()) {
                String record = iter.next().toString();
                char type = record.charAt(0);
                if (type == '1') { // left table
                    factory[factorynum] = record.substring(2);
                    factorynum++;
                } else { // right table
                    adress[adressnum] = record.substring(2);
                    adressnum++; // bug fix: the original never incremented
                                 // this, so the join below never fired
                }
            }
            if (factorynum != 0 && adressnum != 0) { // Cartesian product
                for (int m = 0; m < factorynum; m++) {
                    for (int n = 0; n < adressnum; n++) {
                        context.write(new Text(factory[m]), new Text(adress[n]));
                    }
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: MoreTableMapReducer <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "multi table join");
        job.setJarByClass(MoreTableMapReducer.class);
        job.setMapperClass(Map.class);
        // Note: the original also registered Reduce as a combiner; for a join
        // that is wrong (the combiner would strip the table tags before the
        // reducer runs), so the setCombinerClass call is removed here.
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
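To see why the job works, it helps to trace the intermediate key/value pairs for the sample data. The map keys every record by address ID and tags it with its table of origin, so the shuffle delivers both sides of each join to the same reduce call (the order within a group is not guaranteed):

1 -> [1+Beijing Red Star, 1+Beijing Rising, 1+Back of Beijing, 2+Beijing]
2 -> [1+Guangzhou Honda, 1+Guangzhou Development Bank, 2+Guangzhou]
3 -> [1+Shenzhen Thunder, 1+Tencent, 2+Shenzhen]
4 -> [2+Xian]

Each reduce call then only has to cross its "1+" list with its "2+" list; key 4 emits nothing because its left list is empty.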
Later I worked through it on my own and came up with a plainer version of my own. Its output is:
factoryname                 addressname
Back of Beijing             Beijing     1
Beijing Rising              Beijing     1
Beijing Red Star            Beijing     1
Guangzhou Development Bank  Guangzhou   2
Guangzhou Honda             Guangzhou   2
Tencent                     Shenzhen    3
Shenzhen Thunder            Shenzhen    3

(The trailing number is the join key, addressID, which my reducer appends after the address name.)
The code:
package HbaseCoprocessor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MTjoin {

    public static int time = 0;

    /*
     * In map, first decide whether the input line belongs to the left or the
     * right table, then split its two (tab-separated) columns: the join
     * column goes into the key, the remaining column plus a table flag into
     * the value.
     */
    public static class Map extends Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString(); // one input line
            // Skip the header line of either input file
            if (line.contains("factoryname") || line.contains("addressID")) {
                return;
            }
            String[] arr = line.split("\t");
            // Which table? Judged by the shape of the first field: an
            // addressID is a single digit, a factory name is longer
            String tableNum;
            if (arr[0].length() > 1) { // left table: factoryname \t addressID
                tableNum = "1";
                context.write(new Text(arr[1]), new Text(tableNum + "_" + arr[0]));
            } else {                   // right table: addressID \t addressname
                tableNum = "2";
                context.write(new Text(arr[0]), new Text(tableNum + "_" + arr[1]));
            }
        }
    }

    /*
     * reduce parses the map output, collects the values by table, then emits
     * the Cartesian product.
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Header, written once (valid because this demo uses one reducer)
            if (0 == time) {
                context.write(new Text("factoryname"), new Text("addressname"));
                time++;
            }
            Iterator<Text> ite = values.iterator();
            List<String> tableList = new ArrayList<String>();
            String str2 = "";
            while (ite.hasNext()) {
                String record = ite.next().toString();
                String[] arr2 = record.split("_", 2);
                // bug fix: test the tag field instead of record.contains("2"),
                // which would misfile any value whose text contains a "2"
                if ("2".equals(arr2[0])) {
                    str2 = arr2[1];         // addressname from the right table
                } else if ("1".equals(arr2[0])) {
                    tableList.add(arr2[1]); // factoryname from the left table
                }
            }
            for (int i = 0; i < tableList.size(); i++) {
                // key is the shared addressID; it is appended after the name
                context.write(new Text(tableList.get(i)),
                        new Text(str2 + "\t" + key.toString()));
            }
        }
    }

    // Recursively delete the output directory so the job can be rerun
    public static void deleteAllFilesOfDir(File path) {
        if (!path.exists()) return;
        if (path.isFile()) {
            path.delete();
            return;
        }
        File[] files = path.listFiles();
        for (int i = 0; i < files.length; i++) {
            deleteAllFilesOfDir(files[i]);
        }
        path.delete();
    }

    public static void main(String[] args) throws Exception {
        File file = new File("F:\\hadoop\\out");
        deleteAllFilesOfDir(file);
        Configuration conf = new Configuration();
        // This line matters: it points Hadoop at a local install on Windows
        System.setProperty("hadoop.home.dir",
                "F:/alljar/hadoop-2.4.1-x64/hadoop-2.4.1");
        Job job = new Job(conf, "Multiple Table Join");
        job.setJarByClass(MTjoin.class);
        // Set the Map and Reduce classes
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        // Set the output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Set the input and output directories
        FileInputFormat.addInputPath(job, new Path("F:\\hadoop\\in"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\hadoop\\out"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
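One wrinkle in this version is that the mapper guesses the table from the shape of the first field (arr[0].length() > 1), which only works because the address IDs happen to be single digits. A more robust alternative, sketched below as my own addition rather than part of the original post, tags each record by the name of the file it came from; the file names factory.txt and address.txt are hypothetical.

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Sketch: tag records by input file name instead of by field shape.
// Assumes the left table lives in factory.txt and the right table in
// address.txt (hypothetical names).
public class FileTagMapper extends Mapper<Object, Text, Text, Text> {
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Name of the file this input split came from
        String file = ((FileSplit) context.getInputSplit()).getPath().getName();
        String line = value.toString();
        if (line.contains("factoryname") || line.contains("addressID")) {
            return; // skip header lines, as in the original mapper
        }
        String[] arr = line.split("\t");
        if (file.startsWith("factory")) {
            // left table: factoryname \t addressID -> key on addressID
            context.write(new Text(arr[1]), new Text("1_" + arr[0]));
        } else {
            // right table: addressID \t addressname -> key on addressID
            context.write(new Text(arr[0]), new Text("2_" + arr[1]));
        }
    }
}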
This was originally an interview question. I did think of the approach at the time, but the fields in the question were poorly specified, and being a bit nervous I couldn't sort out my thinking on the spot. Writing it up afterwards, it turned out not to be hard, so I'm recording it here so I can face it with ease next time. I'd also like to try the three-table version; the logic seems to be the same, and Hive's join implementation under the hood is presumably something like this.
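For three tables the reduce side does extend naturally: collect one list per table tag and emit the three-way Cartesian product. Here is a minimal sketch of such a reducer, assuming the mapper tags records "1_", "2_" or "3_" by table (my own extrapolation of the two-table scheme above, not tested code from the post):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical reducer for a three-table join on a shared key; values are
// assumed to arrive tagged "1_", "2_" or "3_" by table of origin.
public class ThreeTableReduce extends Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        List<String> t1 = new ArrayList<String>();
        List<String> t2 = new ArrayList<String>();
        List<String> t3 = new ArrayList<String>();
        for (Text v : values) {
            String[] arr = v.toString().split("_", 2);
            if ("1".equals(arr[0])) {
                t1.add(arr[1]);
            } else if ("2".equals(arr[0])) {
                t2.add(arr[1]);
            } else {
                t3.add(arr[1]);
            }
        }
        // Only keys present in all three tables produce rows (inner join)
        for (String a : t1) {
            for (String b : t2) {
                for (String c : t3) {
                    context.write(new Text(a), new Text(b + "\t" + c));
                }
            }
        }
    }
}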