hadoop实现表连接算法
来源:互联网 发布:紫金红葫芦淘宝 编辑:程序博客网 时间:2024/05/24 02:20
代码如下:
SQL> select * from emp;
EMPNO ENAME JOB MGR HIREDATE SAL COMM DEPTNO
---------- ---------- --------- ---------- -------------- ---------- ---------- ----------
7369 SMITH CLERK 7902 17-12月-80 800 20
7499 ALLEN SALESMAN 7698 20-2月 -81 1600 300 30
7521 WARD SALESMAN 7698 22-2月 -81 1250 500 30
7566 JONES MANAGER 7839 02-4月 -81 2975 20
7654 MARTIN SALESMAN 7698 28-9月 -81 1250 1400 30
7698 BLAKE MANAGER 7839 01-5月 -81 2850 30
7782 CLARK MANAGER 7839 09-6月 -81 2450 10
7839 KING PRESIDENT 17-11月-81 5000 10
7844 TURNER SALESMAN 7698 08-9月 -81 1500 0 30
7900 JAMES CLERK 7698 03-12月-81 950 30
7902 FORD ANALYST 7566 03-12月-81 3000 20
7934 MILLER CLERK 7782 23-1月 -82 1300 10
已选择12行。
SQL> select * from dept;
DEPTNO DNAME LOC
---------- -------------- -------------
10 ACCOUNTING NEW YORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON
发现写MapReduce程序单元测试很重要啊,不然调试起来会很麻烦的,这里贴下MRUnit单元测试的代码。
package homework;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Reduce-side join of the classic EMP and DEPT tables.
 *
 * <p>Both fixed-width table listings are fed through one mapper, which tags
 * each record with its table of origin ("1" = emp salary, "2" = dept name)
 * keyed by DEPTNO. The reducer joins them, emitting per department:
 * {@code dname  sumSalary avgSalary employeeCount}.
 */
public class Exercise_4 extends Configured implements Tool {
    /** Counters exposed by this job. */
    enum Counter {
        /** Input lines too short / malformed to parse. */
        LINESKIP;
    }

    /**
     * Tags each fixed-width input record with its table of origin.
     *
     * <p>An emp row is recognized by a 4-digit EMPNO in the first column and
     * emits (deptno, "1" + salary); a dept row is recognized by a 2-digit
     * DEPTNO and emits (deptno, "2" + dname).
     *
     * <p>NOTE(review): the hard-coded column offsets (59-69, 78-88, 11-25)
     * assume the exact SQL*Plus listing shown above — confirm against the
     * real input files before reuse.
     */
    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString(); // one fixed-width source line
            try {
                if (line.substring(0, 10).trim().length() == 4) {
                    // emp record: extract the SAL and DEPTNO columns
                    String salary = line.substring(59, 69).trim();
                    String deptno = line.substring(78, 88).trim();
                    context.write(new Text(deptno), new Text("1" + salary));
                } else if (line.substring(0, 10).trim().length() == 2) {
                    // dept record: extract the DEPTNO and DNAME columns
                    String deptno = line.substring(0, 10).trim();
                    String dname = line.substring(11, 25).trim();
                    context.write(new Text(deptno), new Text("2" + dname));
                }
            } catch (StringIndexOutOfBoundsException e) {
                // BUG FIX: substring throws StringIndexOutOfBoundsException,
                // not ArrayIndexOutOfBoundsException (a sibling class) — the
                // original catch never matched, so short lines crashed the
                // task instead of being counted and skipped.
                context.getCounter(Counter.LINESKIP).increment(1);
            }
        }
    }

    /**
     * Joins the tagged records for one department.
     *
     * <p>Values prefixed "1" carry a salary and are summed/counted; the value
     * prefixed "2" carries the department name, which replaces the numeric
     * key in the output. Emits: dname -> "sum avg count".
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // primitives instead of boxed Integer/Long: no per-iteration
            // autoboxing, and == comparisons are safe
            int sumSalary = 0;
            int perCount = 0;
            for (Text value : values) {
                String valueString = value.toString();
                char flag = valueString.charAt(0); // table-of-origin tag
                if (flag == '1') {
                    perCount++;
                    sumSalary += Integer.parseInt(valueString.substring(1));
                } else if (flag == '2') {
                    // dept record: use the department name as the output key
                    key = new Text(valueString.substring(1));
                }
            }
            // guard against a department with no employees (e.g. OPERATIONS)
            int avgSalary = (perCount == 0) ? 0 : sumSalary / perCount;
            context.write(key,
                    new Text(sumSalary + " " + avgSalary + " " + perCount));
        }
    }

    /**
     * Configures and runs the join job.
     *
     * @param args args[0] = input path, args[1] = output path
     * @return 0 on success, 1 on failure
     */
    @Override
    public int run(String[] args) throws Exception {
        // BUG FIX: use the Configuration injected by ToolRunner (via
        // Configured.getConf()) instead of the deprecated new Job(), which
        // silently discarded -D options and site configuration.
        Job job = Job.getInstance(getConf());
        job.setJarByClass(Exercise_4.class);               // locate the job jar
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);                 // output KEY type
        job.setOutputValueClass(Text.class);               // output VALUE type
        job.waitForCompletion(true);
        return job.isSuccessful() ? 0 : 1;
    }

    /**
     * Entry point: validates arguments, prints usage, then delegates to
     * ToolRunner so generic Hadoop options (-D, -conf, ...) are honored.
     */
    public static void main(String[] args) throws Exception {
        // With no / wrong arguments, print a usage description and exit.
        if (args.length != 2) {
            System.err.println("");
            System.err.println("Usage: Exercise_4 < input path > < output path > ");
            System.err.println("Example: hadoop jar ~/Exercise_4.jar hdfs://localhost:9000/home/james/Exercise_4 hdfs://localhost:9000/home/james/output");
            System.err.println("Counter:");
            System.err.println("\t" + "LINESKIP" + "\t" + "Lines which are too short");
            System.exit(-1);
        }
        // Run the job.
        int res = ToolRunner.run(new Configuration(), new Exercise_4(), args);
        System.exit(res);
    }
}
运行结果截图:
package homework;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Before;
import org.junit.Test;
/**
 * MRUnit tests for {@link Exercise_4}: exercises the Map and Reduce classes
 * in isolation, without a running Hadoop cluster.
 */
public class Exercise_4Test {
    MapDriver<LongWritable, Text, Text, Text> mapDriver;
    ReduceDriver<Text, Text, Text, Text> reduceDriver;

    /** Builds fresh map/reduce drivers before each test. */
    @Before
    public void setUp() {
        Exercise_4.Map mapper = new Exercise_4.Map();
        Exercise_4.Reduce reducer = new Exercise_4.Reduce();
        // BUG FIX: removed a stray ';;' (empty statement) on this line.
        mapDriver = MapDriver.newMapDriver(mapper);
        reduceDriver = ReduceDriver.newReduceDriver(reducer);
    }

    /** An emp line must map to (deptno, "1" + salary). */
    @Test
    public void testMapper() throws IOException {
        // Fixed-width emp record, padded to the mapper's column offsets.
        Text value1 = new Text("      7369 SMITH      CLERK           7902 17-12-80             800                    20");
        //Text value2 = new Text("        10 ACCOUNTING     NEW YORK");
        mapDriver.withInput(new LongWritable(), value1);
        mapDriver.withOutput(new Text("20"), new Text("1800"));
        mapDriver.runTest();
    }

    // NOTE(review): @Test was commented out in the original, leaving this
    // test disabled — presumably it was failing; confirm before re-enabling.
    // @Test
    public void testReducer() throws IOException {
        List<Text> values = new ArrayList<Text>();
        values.add(new Text("2ACCOUNTING")); // dept-name record
        values.add(new Text("1800"));        // salaries: 800, 1600, 3000
        values.add(new Text("11600"));
        values.add(new Text("13000"));
        reduceDriver.withInput(new Text("20"), values);
        reduceDriver.withOutput(new Text("ACCOUNTING"), new Text("5400" + " " + "1800" + " " + "3"));
        reduceDriver.runTest();
    }
}
- hadoop实现表连接算法
- Hadoop 实现kmeans 算法
- TFIDF算法Hadoop实现
- Hadoop 表连接
- Hadoop表连接问题
- Hadoop表连接
- Hadoop表连接
- Hadoop k-means 算法实现
- 贝叶斯算法Hadoop实现<转>
- 用Hadoop实现KMeans算法
- PageRank算法在hadoop实现
- Hadoop实现协同过滤算法
- 实现MongoDB与Hadoop的连接
- Hadoop 实现协同过滤算法(2)
- Hadoop 实现协同过滤算法(1)
- Hadoop实现关联规则算法--二项集挖掘
- hadoop下的Kmeans算法实现一
- hadoop下的Kmeans算法实现二
- Hadoop分析NCDC气象数据
- Window平台的eclipse连接linux的hadoop集群
- MapReduce的单元测试框架MRUnit
- Hadoop Definitive Guide --- Chapter 6. How MapReduce Works
- 《Linux内核设计与实现》——中断和中断处理
- hadoop实现表连接算法
- FrameLayout中Margin设置无效,解决办法
- 大数据云计算的利器hadoop介绍
- Hadoop的脚本语言Pig应用
- Gnuplot图形展示hadoop处理结果
- 一、ReactiveCocoa(RAC)配置
- Hadoop数据仓库hive的应用
- RDBMS和HDFS, HIVE, HBASE的迁移工具Sqoop
- hadoop基础总结