hadoop实现表连接算法
来源:互联网 发布:紫金红葫芦淘宝 编辑:程序博客网 时间:2024/05/24 02:20
代码如下:
SQL> select * from emp;
EMPNO ENAME JOB MGR HIREDATE SAL COMM DEPTNO
---------- ---------- --------- ---------- -------------- ---------- ---------- ----------
7369 SMITH CLERK 7902 17-12月-80 800 20
7499 ALLEN SALESMAN 7698 20-2月 -81 1600 300 30
7521 WARD SALESMAN 7698 22-2月 -81 1250 500 30
7566 JONES MANAGER 7839 02-4月 -81 2975 20
7654 MARTIN SALESMAN 7698 28-9月 -81 1250 1400 30
7698 BLAKE MANAGER 7839 01-5月 -81 2850 30
7782 CLARK MANAGER 7839 09-6月 -81 2450 10
7839 KING PRESIDENT 17-11月-81 5000 10
7844 TURNER SALESMAN 7698 08-9月 -81 1500 0 30
7900 JAMES CLERK 7698 03-12月-81 950 30
7902 FORD ANALYST 7566 03-12月-81 3000 20
7934 MILLER CLERK 7782 23-1月 -82 1300 10
已选择12行。
SQL> select * from dept;
DEPTNO DNAME LOC
---------- -------------- -------------
10 ACCOUNTING NEW YORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON
发现写MapReduce程序单元测试很重要啊,不然调试起来会很麻烦的,这里贴下MRUnit单元测试的代码。
package homework;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Reduce-side join of the classic EMP and DEPT tables.
 *
 * <p>Both fixed-width table listings are fed through one mapper, which tags
 * each record with its table of origin ("1" = emp salary, "2" = dept name)
 * keyed by DEPTNO. The reducer joins them, emitting per department:
 * {@code dname  sumSalary avgSalary employeeCount}.
 */
public class Exercise_4 extends Configured implements Tool {
    /** Counters exposed by this job. */
    enum Counter {
        /** Input lines too short / malformed to parse. */
        LINESKIP;
    }

    /**
     * Tags each fixed-width input record with its table of origin.
     *
     * <p>An emp row is recognized by a 4-digit EMPNO in the first column and
     * emits (deptno, "1" + salary); a dept row is recognized by a 2-digit
     * DEPTNO and emits (deptno, "2" + dname).
     *
     * <p>NOTE(review): the hard-coded column offsets (59-69, 78-88, 11-25)
     * assume the exact SQL*Plus listing shown above — confirm against the
     * real input files before reuse.
     */
    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString(); // one fixed-width source line
            try {
                if (line.substring(0, 10).trim().length() == 4) {
                    // emp record: extract the SAL and DEPTNO columns
                    String salary = line.substring(59, 69).trim();
                    String deptno = line.substring(78, 88).trim();
                    context.write(new Text(deptno), new Text("1" + salary));
                } else if (line.substring(0, 10).trim().length() == 2) {
                    // dept record: extract the DEPTNO and DNAME columns
                    String deptno = line.substring(0, 10).trim();
                    String dname = line.substring(11, 25).trim();
                    context.write(new Text(deptno), new Text("2" + dname));
                }
            } catch (StringIndexOutOfBoundsException e) {
                // BUG FIX: substring throws StringIndexOutOfBoundsException,
                // not ArrayIndexOutOfBoundsException (a sibling class) — the
                // original catch never matched, so short lines crashed the
                // task instead of being counted and skipped.
                context.getCounter(Counter.LINESKIP).increment(1);
            }
        }
    }

    /**
     * Joins the tagged records for one department.
     *
     * <p>Values prefixed "1" carry a salary and are summed/counted; the value
     * prefixed "2" carries the department name, which replaces the numeric
     * key in the output. Emits: dname -> "sum avg count".
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // primitives instead of boxed Integer/Long: no per-iteration
            // autoboxing, and == comparisons are safe
            int sumSalary = 0;
            int perCount = 0;
            for (Text value : values) {
                String valueString = value.toString();
                char flag = valueString.charAt(0); // table-of-origin tag
                if (flag == '1') {
                    perCount++;
                    sumSalary += Integer.parseInt(valueString.substring(1));
                } else if (flag == '2') {
                    // dept record: use the department name as the output key
                    key = new Text(valueString.substring(1));
                }
            }
            // guard against a department with no employees (e.g. OPERATIONS)
            int avgSalary = (perCount == 0) ? 0 : sumSalary / perCount;
            context.write(key,
                    new Text(sumSalary + " " + avgSalary + " " + perCount));
        }
    }

    /**
     * Configures and runs the join job.
     *
     * @param args args[0] = input path, args[1] = output path
     * @return 0 on success, 1 on failure
     */
    @Override
    public int run(String[] args) throws Exception {
        // BUG FIX: use the Configuration injected by ToolRunner (via
        // Configured.getConf()) instead of the deprecated new Job(), which
        // silently discarded -D options and site configuration.
        Job job = Job.getInstance(getConf());
        job.setJarByClass(Exercise_4.class);               // locate the job jar
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);                 // output KEY type
        job.setOutputValueClass(Text.class);               // output VALUE type
        job.waitForCompletion(true);
        return job.isSuccessful() ? 0 : 1;
    }

    /**
     * Entry point: validates arguments, prints usage, then delegates to
     * ToolRunner so generic Hadoop options (-D, -conf, ...) are honored.
     */
    public static void main(String[] args) throws Exception {
        // With no / wrong arguments, print a usage description and exit.
        if (args.length != 2) {
            System.err.println("");
            System.err.println("Usage: Exercise_4 < input path > < output path > ");
            System.err.println("Example: hadoop jar ~/Exercise_4.jar hdfs://localhost:9000/home/james/Exercise_4 hdfs://localhost:9000/home/james/output");
            System.err.println("Counter:");
            System.err.println("\t" + "LINESKIP" + "\t" + "Lines which are too short");
            System.exit(-1);
        }
        // Run the job.
        int res = ToolRunner.run(new Configuration(), new Exercise_4(), args);
        System.exit(res);
    }
}
运行结果截图:
package homework;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Before;
import org.junit.Test;
/**
 * MRUnit tests for {@link Exercise_4}: exercises the Map and Reduce classes
 * in isolation, without a running Hadoop cluster.
 */
public class Exercise_4Test {
    MapDriver<LongWritable, Text, Text, Text> mapDriver;
    ReduceDriver<Text, Text, Text, Text> reduceDriver;

    /** Builds fresh map/reduce drivers before each test. */
    @Before
    public void setUp() {
        Exercise_4.Map mapper = new Exercise_4.Map();
        Exercise_4.Reduce reducer = new Exercise_4.Reduce();
        // BUG FIX: removed a stray ';;' (empty statement) on this line.
        mapDriver = MapDriver.newMapDriver(mapper);
        reduceDriver = ReduceDriver.newReduceDriver(reducer);
    }

    /** An emp line must map to (deptno, "1" + salary). */
    @Test
    public void testMapper() throws IOException {
        // Fixed-width emp record, padded to the mapper's column offsets.
        Text value1 = new Text("      7369 SMITH      CLERK           7902 17-12-80             800                    20");
        //Text value2 = new Text("        10 ACCOUNTING     NEW YORK");
        mapDriver.withInput(new LongWritable(), value1);
        mapDriver.withOutput(new Text("20"), new Text("1800"));
        mapDriver.runTest();
    }

    // NOTE(review): @Test was commented out in the original, leaving this
    // test disabled — presumably it was failing; confirm before re-enabling.
    // @Test
    public void testReducer() throws IOException {
        List<Text> values = new ArrayList<Text>();
        values.add(new Text("2ACCOUNTING")); // dept-name record
        values.add(new Text("1800"));        // salaries: 800, 1600, 3000
        values.add(new Text("11600"));
        values.add(new Text("13000"));
        reduceDriver.withInput(new Text("20"), values);
        reduceDriver.withOutput(new Text("ACCOUNTING"), new Text("5400" + " " + "1800" + " " + "3"));
        reduceDriver.runTest();
    }
}
- hadoop实现表连接算法
- Hadoop 实现kmeans 算法
- TFIDF算法Hadoop实现
- Hadoop 表连接
- Hadoop表连接问题
- Hadoop表连接
- Hadoop表连接
- Hadoop k-means 算法实现
- 贝叶斯算法Hadoop实现<转>
- 用Hadoop实现KMeans算法
- PageRank算法在hadoop实现
- Hadoop实现协同过滤算法
- 实现MongoDB与Hadoop的连接
- Hadoop 实现协同过滤算法(2)
- Hadoop 实现协同过滤算法(1)
- Hadoop实现关联规则算法--二项集挖掘
- hadoop下的Kmeans算法实现一
- hadoop下的Kmeans算法实现二
- Hadoop分析NCDC气象数据
- Window平台的eclipse连接linux的hadoop集群
- MapReduce的单元测试框架MRUnit
- Hadoop Definitive Guide --- Chapter 6. How MapReduce Works
- 《Linux内核设计与实现》——中断和中断处理
- hadoop实现表连接算法
- FrameLayout中Margin设置无效,解决办法
- 大数据云计算的利器hadoop介绍
- Hadoop的脚本语言Pig应用
- Gnuplot图形展示hadoop处理结果
- 一、ReactiveCocoa(RAC)配置
- Hadoop数据仓库hive的应用
- RDBMS和HDFS, HIVE, HBASE的迁移工具Sqoop
- hadoop基础总结