Hadoop单表与多表关联
来源:互联网 发布:it公司 编辑:程序博客网 时间:2024/06/07 06:33
在单表关联和多表关联的应用的时候,需要从数据里面挖掘出信息来进行操作。
例子给出 孩子和父母的表 输出孩子和爷爷的表 数据如下:
样例数据为 child、parent 两列(原文排版已丢失,形如:tom xd、tom hong、ton jack、cy terry 等)。从样例的输入和输出可以得出:需要进行单表的自连接(即表自己连接自己),连接条件是左表的 parent 列等于右表的 child 列,再从连接结果中取出爷孙关系即可。
首先在 map 阶段将输入数据分割成 child 和 parent 之后,需要将 parent 设为 key、child 设为 value 作为左表;而在另一组输出中需要将 child 作为 key、parent 作为 value,此表作为右表。为了区分左右表,需要在输出 value 的时候附带上左右表的区别信息:字符 1 代表左表,字符 2 代表右表。map 阶段就这样完成了左表和右表的构造,在 shuffle 阶段完成连接;reduce 接收连接结果,其中 key 对应的 value-list 包含了对应的关系,取出 value 值进行分析,将 child 放入数组 1,parent 放入数组 2,然后对两个数组做笛卡尔积得出最后结果。
代码示例如下:
- package reverseIndex;
- import java.io.IOException;
- import java.util.Iterator;
- import java.util.StringTokenizer;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- public class singleJoin {
- public static int time = 0; // 用于表头的输出
- public static class Map extends Mapper<LongWritable, Text,Text,Text>{
- public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException{
- String childName = new String();
- String parentName = new String();
- String relationType = new String(); // the flag of left_table right_table
- //handle line data
- StringTokenizer itr = new StringTokenizer(value.toString());
- String[] values = new String[2];
- int i = 0;
- while(itr.hasMoreTokens()){
- values[i] = itr.nextToken();
- i++;
- }
- //ignore the title
- if(values[0].compareTo("child")!=0){
- childName = values[0];
- parentName = values[1];
- //output the left_table
- relationType = "1";
- context.write(new Text(values[1]), new Text(relationType+"+"
- +childName+"+"+parentName));
- //output the right_table
- relationType = "2";
- context.write(new Text(values[0]),new Text(relationType+"+"
- +childName+"+"+parentName));
- }
- }
- }
- public static class Reduce extends Reducer<Text,Text,Text,Text>{
- public void reduce(Text key,Iterable<Text>value,Context context) throws IOException, InterruptedException{
- if(0 == time){
- //输出表头
- context.write(new Text("grandchild"),new Text("grandparent"));
- time++;
- }
- int grandchildnum = 0;
- int grandparentnum = 0;
- //定义存储grandchild,grandparent的数组
- String[] grandchild = new String[20];
- String[] grandparent = new String[20];
- Iterator<Text> it = value.iterator();
- while(it.hasNext()){
- String record = it.next().toString();
- int len = record.length();
- int i=2;
- if(0==len){
- continue;
- }
- char relationType = record.charAt(0);
- String childName = new String();
- String parentName = new String();
- while(record.charAt(i)!='+'){
- childName += record.charAt(i);
- i++;
- }
- i += 1;
- while(i<len){
- parentName += record.charAt(i);
- i++;
- }
- if('1'==relationType){
- grandchild[grandchildnum] = childName;
- grandchildnum++;
- }
- if('2'==relationType){
- grandparent[grandparentnum] = parentName;
- grandparentnum++;
- }
- }//end while 结束一行数据
- if(0!=grandchildnum && 0!=grandparentnum){
- for(int m=0;m<grandchildnum;m++){
- for(int n=0;n<grandparentnum;n++){
- context.write(new Text(grandchild[m]),new Text(grandparent[m]));
- }
- }
- }
- }//end reduce
- }//end Reduce
- public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
- // TODO Auto-generated method stub
- Configuration conf = new Configuration();
- Job job = new Job(conf,"singleJoin");
- job.setJarByClass(singleJoin.class);
- job.setMapperClass(Map.class);
- job.setReducerClass(Reduce.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- FileInputFormat.addInputPath(job,new Path(args[0]));
- FileOutputFormat.setOutputPath(job, new Path(args[1]));
- System.exit(job.waitForCompletion(true)?0:1);
- }
- }
多表连接:输入 2 个文件,一个是学生表(学生名、课程编号),另一个是教师表(教师名、课程编号),输出每个学生对应的教师。
样例(原文排版已丢失)输入形如:xd 1、xyy 2、lf 3、zz 3;输出形如:xd cy、xyy ld、lf cy、zz cy。这种多表连接类似于关系型数据库中多表之间的 join 查询语句。
解决方法与上面一致(同样用标记区分左右表):
代码如下:
- package reverseIndex;
- import java.io.IOException;
- import java.util.Iterator;
- import java.util.StringTokenizer;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- public class multiJoin {
- public static int time = 0;
- public static class Map extends Mapper<LongWritable,Text,Text,Text>{
- public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException{
- String mapKey = new String();
- String mapValue = new String();
- String line = value.toString();
- String relationType = new String();
- StringTokenizer it = new StringTokenizer(line);
- if(line.contains("studentName")||line.contains("courseId")){
- return;
- }
- int i=0;
- while(it.hasMoreTokens()){
- String token = it.nextToken();
- //如果是课程编号在前,就将作为右表,若是人名在前面就是作为左表
- if(token.charAt(0)>='0'&&token.charAt(0)<='9'){
- mapKey = token;
- if(i>0){
- relationType = "1";
- }else{
- relationType = "2";
- }
- continue;
- }
- mapValue += token+" ";
- i++;
- }
- //完后左右表的输出
- context.write(new Text(mapKey),new Text(relationType+"+"+mapValue));
- }
- }
- public static class Reduce extends Reducer<Text,Text,Text,Text>{
- public void reduce(Text key,Iterable<Text>values,Context context) throws IOException, InterruptedException{
- if(time == 0){
- context.write(new Text("student"),new Text("teacher"));
- time++;
- }
- int studentnum = 0;
- int teachernum = 0;
- String[] student = new String[10];
- String[] teacher = new String[10];
- Iterator<Text> it = values.iterator();
- while(it.hasNext()){
- String record = it.next().toString();
- int len = record.length();
- int i=2;//忽略relationType和+号
- if(0 == len){
- return;
- }
- char relationType = record.charAt(0);
- if('1' == relationType){
- student[studentnum] = record.substring(i);
- studentnum++;
- }
- if('2'==relationType){
- teacher[teachernum] = record.substring(i);
- teachernum++;
- }
- }
- if(0!=studentnum && 0!=teachernum){
- for(int m=0;m<studentnum;m++){
- for(int n=0;n<teachernum;n++){
- context.write(new Text(student[m]), new Text(teacher[n]));
- }
- }
- }
- }
- }
- public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
- // TODO Auto-generated method stub
- Configuration conf = new Configuration();
- Job job = new Job(conf,"multiJoin");
- job.setJarByClass(multiJoin.class);
- job.setMapperClass(Map.class);
- job.setReducerClass(Reduce.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- FileInputFormat.addInputPath(job,new Path(args[0]));
- FileOutputFormat.setOutputPath(job, new Path(args[1]));
- System.exit(job.waitForCompletion(true)?0:1);
- }
- }
0 0
- Hadoop单表与多表关联
- Hadoop 单表关联 多表关联
- hadoop单表关联
- hadoop单表关联
- hadoop--单表关联
- hadoop学习--单表关联
- hadoop学习--单表关联
- hadoop实现单表和多表关联
- hadoop实现单表和多表关联
- hadoop编程实例--单表关联
- Hadoop MapReduce单表关联程序
- Hadoop 2.x 单表关联
- 【Hadoop基础教程】6、Hadoop之单表关联查询
- hadoop多表关联
- hadoop多表关联
- hadoop 多表关联
- Hadoop案例之单表关联输出祖孙关系
- hadoop mapreduce多表关联
- 需求规格说明书中的特性编写参考
- Android下定时执行特定任务的几种方法
- OpenCV基础篇之读取显示图片
- callablestatement 使用方法
- OpenCV基础篇之像素操作对比度调节
- Hadoop单表与多表关联
- 给定正整数n和m,计算出n个元素的集合{1,2,...,n}可以划分为多少个不同的由m个元素组成的子集合
- Win32控制台程序是什么
- 子类、父类各种方法的执行顺序
- DSP/BIOS使用之初窥门径——滴答时钟及烧写Flash
- Hadoop管理员的十个最佳实践
- Spring容器动态注入bean
- LINUX学习笔记
- POJ_2262