Hadoop JobControl: Design and Usage




1. JobControl design:

JobControl is built from two classes: Job and JobControl.

The Job class wraps a MapReduce job together with its dependencies. It monitors the state of the jobs it depends on and updates its own state accordingly.

A job starts in the WAITING state. If it has no depending jobs, or all of its depending jobs have completed, it moves to the READY state. Once a job is READY it can be submitted to the Hadoop cluster, at which point it enters the RUNNING state. From RUNNING it ends up in either SUCCESS or FAILED, depending on how the run goes.

Note that if any job a given job depends on fails, that job fails as well, producing a "domino effect": every job downstream of the failure fails too.
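
In the new API these states are modeled by the ControlledJob.State enum, which also covers the domino case with DEPENDENT_FAILED. The fragment below is a minimal sketch, not part of the original article, of how a driver can wrap a job and inspect its state; conf and job stand for an already-built Configuration and Job:

import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;

// Wrap an existing Job and look at its JobControl state (a fresh wrapper starts out WAITING).
ControlledJob cjob = new ControlledJob(conf);
cjob.setJob(job);
System.out.println(cjob.getJobName() + " is in state " + cjob.getJobState());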

  



Old API:
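
In the old API, both the dependency-aware wrapper (itself named Job) and the controller live in the org.apache.hadoop.mapred.jobcontrol package:

import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;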



New API:
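
In the new API, the wrapper is called ControlledJob and sits next to JobControl in org.apache.hadoop.mapreduce.lib.jobcontrol; these are the classes the demo below imports:

import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;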



2. JobControl code example:

The demo builds three jobs: Job1 and Job2 independently clean the two input files, and Job3, which depends on both, reads their outputs and writes "OK" for every key that appears in both inputs.

import java.io.File;
import java.io.IOException;
import java.util.HashSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class JobControlDemo {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherargs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherargs.length != 3) {
            System.err.println("Usage: JobControlDemo <InputPath1> <InputPath2> <OutPath>");
            System.exit(2);
        }

        // Create the underlying MapReduce jobs
        Job job1 = Job.getInstance(conf, JobControlDemo.class.getSimpleName() + "1");
        Job job2 = Job.getInstance(conf, JobControlDemo.class.getSimpleName() + "2");
        Job job3 = Job.getInstance(conf, JobControlDemo.class.getSimpleName() + "3");

        // Job1: clean the first input file and write it to <OutPath>/mid1
        job1.setJarByClass(JobControlDemo.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(Text.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(Text.class);
        job1.setMapperClass(MyMapper1.class);
        job1.setReducerClass(MyReducer1.class);
        job1.setInputFormatClass(TextInputFormat.class);
        job1.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job1, new Path(otherargs[0]));
        FileOutputFormat.setOutputPath(job1, new Path(otherargs[2] + File.separator + "mid1"));

        // Job2: clean the second input file and write it to <OutPath>/mid2
        job2.setJarByClass(JobControlDemo.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        job2.setMapperClass(MyMapper2.class);
        job2.setReducerClass(MyReducer2.class);
        job2.setInputFormatClass(TextInputFormat.class);
        job2.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job2, new Path(otherargs[1]));
        FileOutputFormat.setOutputPath(job2, new Path(otherargs[2] + File.separator + "mid2"));

        // Job3: merge the two intermediate outputs. KeyValueTextInputFormat reads
        // their tab-separated "key<TAB>value" lines back as <Text, Text> pairs.
        job3.setJarByClass(JobControlDemo.class);
        job3.setMapOutputKeyClass(Text.class);
        job3.setMapOutputValueClass(Text.class);
        job3.setOutputKeyClass(Text.class);
        job3.setOutputValueClass(Text.class);
        job3.setMapperClass(MyMapper3.class);
        job3.setReducerClass(MyReducer3.class);
        job3.setInputFormatClass(KeyValueTextInputFormat.class);
        job3.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job3, new Path(otherargs[2] + File.separator + "mid1"));
        FileInputFormat.addInputPath(job3, new Path(otherargs[2] + File.separator + "mid2"));
        FileOutputFormat.setOutputPath(job3, new Path(otherargs[2] + File.separator + "result"));

        // Create the controlled jobs
        ControlledJob cjob1 = new ControlledJob(conf);
        ControlledJob cjob2 = new ControlledJob(conf);
        ControlledJob cjob3 = new ControlledJob(conf);

        // Wrap the plain jobs as controlled jobs
        cjob1.setJob(job1);
        cjob2.setJob(job2);
        cjob3.setJob(job3);

        // Declare the dependencies: Job3 may only run after Job1 and Job2 have succeeded
        //cjob2.addDependingJob(cjob1);
        cjob3.addDependingJob(cjob1);
        cjob3.addDependingJob(cjob2);

        // Create the job controller
        JobControl jc = new JobControl("My control job");

        // Add the controlled jobs to the controller
        jc.addJob(cjob1);
        jc.addJob(cjob2);
        jc.addJob(cjob3);

        /*
         * Hadoop's JobControl class implements the Runnable interface, so it has to be
         * started on a thread of its own. Calling JobControl.run() directly would never
         * return, because its run loop does not terminate by itself.
         */
        //jc.run();
        Thread jcThread = new Thread(jc);
        jcThread.start();
        while (true) {
            if (jc.allFinished()) {
                System.out.println(jc.getSuccessfulJobList());
                jc.stop();
                System.exit(0);
            }
            if (jc.getFailedJobList().size() > 0) {
                System.out.println(jc.getFailedJobList());
                jc.stop();
                System.exit(1);
            }
            Thread.sleep(500); // poll instead of busy-waiting
        }
    }

    /**
     * First job: pass through every well-formed "key<TAB>value" line of the first input
     */
    public static class MyMapper1 extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] spl1 = value.toString().split("\t");
            if (spl1.length == 2) {
                context.write(new Text(spl1[0].trim()), new Text(spl1[1].trim()));
            }
        }
    }

    public static class MyReducer1 extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text k2, Iterable<Text> v2s, Context context)
                throws IOException, InterruptedException {
            for (Text v2 : v2s) {
                context.write(k2, v2);
            }
        }
    }

    /**
     * Second job: the same pass-through for the second input
     */
    public static class MyMapper2 extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] spl2 = value.toString().split("\t");
            if (spl2.length == 2) {
                context.write(new Text(spl2[0].trim()), new Text(spl2[1].trim()));
            }
        }
    }

    public static class MyReducer2 extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text k3, Iterable<Text> v3s, Context context)
                throws IOException, InterruptedException {
            for (Text v3 : v3s) {
                context.write(k3, v3);
            }
        }
    }

    /**
     * Third job: emit "OK" for every key whose values come from both intermediate outputs
     */
    public static class MyMapper3 extends Mapper<Text, Text, Text, Text> {
        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static class MyReducer3 extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text k4, Iterable<Text> v4s, Context context)
                throws IOException, InterruptedException {
            // Collect the distinct values ("a" from input1, "b" from input2); a key present
            // in both inputs therefore ends up with at least two distinct values.
            HashSet<String> hashSet = new HashSet<String>();
            for (Text v4 : v4s) {
                hashSet.add(v4.toString().trim());
            }
            if (hashSet.size() >= 2) {
                context.write(k4, new Text("OK"));
            }
        }
    }
}
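
As a side note, the new API also accepts the depending jobs directly in the ControlledJob constructor, so the setJob()/addDependingJob() calls can be collapsed. The lines below are only a sketch under that assumption, reusing job1, job2 and job3 from the demo (they additionally need java.util.Arrays and java.util.ArrayList):

// Sketch: equivalent wiring via the ControlledJob(Job, List<ControlledJob>) constructor.
ControlledJob cjob1 = new ControlledJob(job1, null);
ControlledJob cjob2 = new ControlledJob(job2, null);
ControlledJob cjob3 = new ControlledJob(job3,
        new ArrayList<ControlledJob>(Arrays.asList(cjob1, cjob2)));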

Test input data:

hdfs dfs -text /user/jiuqian/libin/input/inputpath1.txt
hadoop  a
spark   a
hive    a
hbase   a
tachyon a
storm   a
redis   a

hdfs dfs -text /user/jiuqian/libin/input/inputpath2.txt
hadoop  b
spark   b
kafka   b
tachyon b
oozie   b
flume   b
sqoop   b
solr    b

Test output data (only the keys that appear in both inputs are written, each with the value "OK"):

hdfs dfs -text /user/jiuqian/libin/input/inputpathmerge2.txt/result/*
hadoop  OK
spark   OK
tachyon OK

Run output:

[sshexec] cmd : bash -c 'source  /home/jiuqian/.bashrc; /home/hduser/hadoop/bin/hadoop jar  /home/jiuqian/blb/JobControlDemo.jar -D mapreduce.map.java.opts=-Xmx2048m -D mapreduce.input.fileinputformat.split.minsize=1 -Dmapreduce.input.fileinputformat.split.maxsize=512000000 -D mapred.linerecordreader.maxlength=32768 /user/jiuqian/libin/input/inputpath1.txt /user/jiuqian/libin/input/inputpath2.txt /user/jiuqian/libin/input/inputpathmerge2.txt'
16/02/27 12:37:45 INFO client.RMProxy: Connecting to ResourceManager at sh-rslog1/192.168.1.2:8032
16/02/27 12:37:46 INFO input.FileInputFormat: Total input paths to process : 1
16/02/27 12:37:46 INFO mapreduce.JobSubmitter: number of splits:1
16/02/27 12:37:46 INFO Configuration.deprecation: mapred.linerecordreader.maxlength is deprecated. Instead, use mapreduce.input.linerecordreader.line.maxlength
16/02/27 12:37:47 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1446086163035_17037
16/02/27 12:37:47 INFO impl.YarnClientImpl: Submitted application application_1446086163035_17037
16/02/27 12:37:47 INFO mapreduce.Job: The url to track the job: http://sh-rslog1:8088/proxy/application_1446086163035_17037/
16/02/27 12:37:47 INFO client.RMProxy: Connecting to ResourceManager at sh-rslog1/27.115.29.102:8032
16/02/27 12:37:47 INFO input.FileInputFormat: Total input paths to process : 1
16/02/27 12:37:47 INFO mapreduce.JobSubmitter: number of splits:1
16/02/27 12:37:47 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1446086163035_17038
16/02/27 12:37:47 INFO impl.YarnClientImpl: Submitted application application_1446086163035_17038
16/02/27 12:37:47 INFO mapreduce.Job: The url to track the job: http://sh-rslog1:8088/proxy/application_1446086163035_17038/
16/02/27 12:38:13 INFO client.RMProxy: Connecting to ResourceManager at sh-rslog1/27.115.29.102:8032
16/02/27 12:38:13 INFO input.FileInputFormat: Total input paths to process : 2
16/02/27 12:38:13 INFO mapreduce.JobSubmitter: number of splits:2
16/02/27 12:38:13 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1446086163035_17039
16/02/27 12:38:13 INFO impl.YarnClientImpl: Submitted application application_1446086163035_17039
16/02/27 12:38:13 INFO mapreduce.Job: The url to track the job: http://sh-rslog1:8088/proxy/application_1446086163035_17039/
[job name:JobControlDemo1
job id:My control job0
job state:SUCCESS
job mapred id:job_1446086163035_17037
job message:just initialized
job has no depending job:
, job name:JobControlDemo2
job id:My control job1
job state:SUCCESS
job mapred id:job_1446086163035_17038
job message:just initialized
job has no depending job:
, job name:JobControlDemo3
job id:My control job2
job state:SUCCESS
job mapred id:job_1446086163035_17039
job message:just initialized
job has 2 dependeng jobs:
 depending job 0:JobControlDemo1
 depending job 1:JobControlDemo2]
[INFO] Executed tasks
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------


How the three jobs run:

At the start, the first two jobs are scheduled at the same time:




Once the first two jobs have finished, the third job starts running:
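
The same scheduling behavior can be observed from the driver itself. The loop below is a minimal sketch, not part of the original demo, that polls the controller's job lists while it works; it assumes jc is the JobControl instance and the controller thread has already been started:

// Print how many controlled jobs are currently running vs. still waiting on dependencies.
while (!jc.allFinished()) {
    System.out.println("running: " + jc.getRunningJobList().size()
            + ", waiting: " + jc.getWaitingJobList().size());
    Thread.sleep(2000); // Job1 and Job2 run first; Job3 stays WAITING until both succeed
}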






