hadoop学习--多表关联
来源:互联网 发布:软土路基沉降观测数据 编辑:程序博客网 时间:2024/06/05 09:09
本例从多个表中提取出所需要的信息。
输入是2个文件,一个表示工厂表,包含工厂名和地址编号;另一个表示地址表,包含地址名和地址编号。根据2个表的信息输出工厂名-地址名表。
factory.txt:
factoryname addressID
Beijing Red Star 1
Shenzhan Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Bank of Beijing 1

address.txt:
addressID addressname
1 Beijing
2 Guangzhou
3 Shenzhen
5 Hangzhou

输出:
factoryname addressname
Beijing Red Star Beijing
Beijing Rising Beijing
Bank of Beijing Beijing
Guangzhou Honda Guangzhou
Guangzhou Development Bank Guangzhou
Shenzhan Thunder Shenzhen
Tencent Shenzhen

1、设计思路:在map阶段,对于每个输入行以addressID为key进行保存;
来自factory.txt则存为:
<1,2:Beijing Red Star>
<3,2:Shenzhan Thunder>
...
来自address.txt则存为
<1,1:Beijing>
<2,1:Guangzhou>
...
这里的value开头的1: 2: 用来区分来自不同的表,将在reduce中用到。
在reduce阶段
对于相同的key,保存相应的factoryname和addressname。具体细节可参考上一篇单表关联部分。
2、程序代码
- import java.io.IOException;
- import java.util.StringTokenizer;
- import java.util.*;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.util.GenericOptionsParser;
- public class MTjoin {
- public static class TokenizerMapper
- extends Mapper<Object, Text, Text, Text>{
- String tabletype = new String();
- String childname = new String();
- String parentname = new String();
- public void map(Object key, Text value, Context context
- ) throws IOException, InterruptedException {
- String line = value.toString();
- int i = 0;
- if(line.contains("factoryname") == true || line.contains("addressID") == true)
- {
- return;
- }
- while(line.charAt(i) > '9' || line.charAt(i) < '0')
- {
- i++;
- }
- if(i == 0) //address
- {
- int j = i + 1;
- while(line.charAt(j) != ' ')j++;
- String[] values = {line.substring(0,j),line.substring(j+1)};
- context.write(new Text(values[0]),new Text("1:" + values[1]));
- }
- else //name
- {
- int j = i - 1;
- while(line.charAt(j) != ' ')j--;
- String[] values = {line.substring(0,j),line.substring(i)};
- context.write(new Text(values[1]),new Text("2:" + values[0]));
- }
- }
- }
- public static class IntSumReducer
- extends Reducer<Text,Text,Text,Text> {
- private IntWritable result = new IntWritable();
- int count = 0;
- public void reduce(Text key, Iterable<Text> values,
- Context context
- ) throws IOException, InterruptedException {
- if(count == 0)
- {
- context.write(new Text("factoryname"),new Text("addressname"));
- count++;
- }
- int factorynum = 0;
- int addressnum = 0;
- String[] factoryname = new String[10];
- String[] addressname = new String[10];
- String strrecord = new String();
- String[] strArr = new String[3];
- Iterator ite = values.iterator();
- while(ite.hasNext())
- {
- strrecord = ite.next().toString();
- if(strrecord.length()<=0)
- {
- continue;
- }
- char type = strrecord.charAt(0);
- if(type == '1')
- {
- addressname[addressnum++] = strrecord.substring(2);
- }
- else if(type == '2')
- {
- factoryname[factorynum++] = strrecord.substring(2);
- }
- }
- if(factorynum != 0 && addressnum != 0)
- {
- for(int i = 0;i < factorynum;i++)
- {
- for(int j = 0; j < addressnum;j++)
- {
- context.write(new Text(factoryname[i]),new Text(addressname[j]));
- }
- }
- }
- }
- }
- public static void main(String[] args) throws Exception {
- Configuration conf = new Configuration();
- String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
- if (otherArgs.length != 3) {
- System.err.println("Usage: MTjoin <in> <out>");
- System.exit(2);
- }
- Job job = new Job(conf, "MTjoin");
- job.setJarByClass(MTjoin.class);
- job.setMapperClass(TokenizerMapper.class);
- //job.setCombinerClass(IntSumReducer.class);
- job.setReducerClass(IntSumReducer.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
- FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
- FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
- System.exit(job.waitForCompletion(true) ? 0 : 1);
- }
- }
0 0
- hadoop学习--多表关联
- hadoop学习--多表关联
- hadoop学习--单表关联
- hadoop学习--单表关联
- hadoop多表关联
- hadoop多表关联
- hadoop 多表关联
- Hadoop 单表关联 多表关联
- hadoop mapreduce多表关联
- hadoop实例---多表关联
- hadoop实例---多表关联
- Hadoop实现多表关联
- Hadoop单表与多表关联
- Hadoop MapReduce多表关联程序
- Hadoop 2.x 多表关联
- hadoop单表关联
- hadoop单表关联
- hadoop--单表关联
- JSP内置对象、异常处理与乱码问题
- Android每隔2秒执行一次命令 即定时发送任务 非Timer方法
- 浅谈Java中的Set、List、Map的区别
- uva 357 Let Me Count The Ways 简单dp
- IOS多线程通信
- hadoop学习--多表关联
- Android 开发实战 - 数据备份 - 01
- 如何测试SPI slave是否有问题
- js 分页判断
- 超链接 的相对路径的问题?
- iOS文字排版(CoreText)
- 浅谈Java内部类的四个应用场景
- JProbe Version: 8.1.0
- 烦死飞的士速递