hadoop源代码分析（二）从wordCount开始，剖析mapreduce的运行机制

来源：互联网发布：保定seo外包编辑：程序博客网时间：2024/04/29 00:37

在上一篇文章中，只是简单介绍了 MapReduce 作业从执行 hadoop jar test.jar 这条 shell 命令开始，到 jar 包被加载并找到主类的过程。那么，从这篇文章开始，我们研究从 MapReduce 程序的 main 方法出发，作业是如何一步步被提交、运行的，此处会涉及到 YARN 的相关知识。

编写的mapreduce程序的main方法如下，(map，reduce阶段代码很简单，就不贴上浪费CSDN的空间了)：

public static void main(String[] args) throws Exception {Configuration conf = new Configuration();        Job job = Job.getInstance(conf,Test.class.getSimpleName());        job.setMapperClass(Mapper.class);        job.setJarByClass(Test.class);        job.setOutputKeyClass(NullWritable.class);        job.setOutputValueClass(Text.class);        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.80.100:9000/datawarehouse/src/userinfo/2016-02-20"));        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.80.100:9000/datawarehouse/dw/userinfo/"+new SimpleDateFormat("yyyy-MM-dd").format(new Date())));        System.out.println("--------end---------");

<span style="white-space:pre"></span>//<span style="color:#ff0000;">上面的只是一些简单配置，从这里开始作业提交流程~~~咱们的分析，也是从这里开始！</span>        job.waitForCompletion(true);}

1、进入waitForCompletion方法：

public boolean waitForCompletion(boolean verbose) throws IOException, InterruptedException,                                           ClassNotFoundException {    //在Job内部，有一枚举类public static enum JobState {DEFINE, RUNNING};作业的jobstate有以上两种    if (state == JobState.DEFINE) {    //进入submit()方法，提交作业      submit();    }    if (verbose) {    //作业进入循环监控      monitorAndPrintJob();    } else {          //如果false一直循环现成睡眠，睡眠时间取决于mapreduce.client.completion.pollinterval配置，默认为5000      /**   * 循环一直持续到!isComplete()状态，即：作业的状态不为：SUCCEEDED，FAILED，KILLED   * 除此之外，JobStatus.State枚举类内部，还有RUNNING，PREP，以上几种状态，即yarn的几种job运行状态   */      int completionPollIntervalMillis =         Job.getCompletionPollInterval(cluster.getConf());      while (!isComplete()) {        try {          Thread.sleep(completionPollIntervalMillis);        } catch (InterruptedException ie) {        }      }    }    return isSuccessful();  }

2、上述方法中，调用了submit();方法，下面进入此方法：

public void submit()          throws IOException, InterruptedException, ClassNotFoundException {    ensureState(JobState.DEFINE);    /**   * 这个方法与主线关系不大具体实现不准备在这里贴出详解，方法内部做了以下事：  * 如未做配置，则调用新MR的API   *    */    setUseNewAPI();    //链接Resourcemanager，初始化Cluster对象    /**   * 前提，mapred-site.xml中你配置了mapreduce.framework.name选项为"YARN"  * Cluster对象的构造方法，参数InetSocketAddress，命名还是jobTrackAddr，但是调用构造方法的时候传入的NULL，这里看着有些别扭。   *    */    connect();    final JobSubmitter submitter =         getJobSubmitter(cluster.getFileSystem(), cluster.getClient());    status = ugi.doAs(new PrivilegedExceptionAction<JobStatus>() {      public JobStatus run() throws IOException, InterruptedException,       ClassNotFoundException {      //提交作业，代码在下面贴出        return submitter.submitJobInternal(Job.this, cluster);      }    });    state = JobState.RUNNING;    LOG.info("The url to track the job: " + getTrackingURL());   }

3、进入submitJobInternal方法

/**
 * Internal job submission: prepares the staging directory, security material,
 * input splits and job configuration file, then hands the job to
 * {@code submitClient}.
 *
 * @param job     the job being submitted; its configuration and credentials are
 *                read and updated here
 * @param cluster the cluster used to resolve the staging directory
 * @return the non-null status returned by {@code submitClient.submitJob}
 * @throws IOException if the shuffle key cannot be generated or if
 *                     {@code submitJob} returns null ("Could not launch job")
 */
JobStatus submitJobInternal(Job job, Cluster cluster)
  throws ClassNotFoundException, InterruptedException, IOException {
    // Check the job specs (per the article: the output directory check).
    checkSpecs(job);
    Configuration conf = job.getConfiguration();
    addMRFrameworkToDistributedCache(conf);
    // Staging directory. Per the article this is presumably the parent
    // directory of the jar/xml files the job needs (the original note is
    // cut off at this point).
    Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
    //configure the command line options correctly on the submitting dfs
    InetAddress ip = InetAddress.getLocalHost();
    if (ip != null) {
      submitHostAddress = ip.getHostAddress();
      submitHostName = ip.getHostName();
      conf.set(MRJobConfig.JOB_SUBMITHOST,submitHostName);
      conf.set(MRJobConfig.JOB_SUBMITHOSTADDR,submitHostAddress);
    }
    JobID jobId = submitClient.getNewJobID();
    job.setJobID(jobId);
    // The per-job submit directory is <staging dir>/<job id>.
    Path submitJobDir = new Path(jobStagingArea, jobId.toString());
    JobStatus status = null;
    try {
      // Using the values computed above, set basic configuration such as the
      // job directory (mapreduce.job.dir per the article).
      conf.set(MRJobConfig.USER_NAME,
          UserGroupInformation.getCurrentUser().getShortUserName());
      conf.set("hadoop.http.filter.initializers",
          "org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer");
      conf.set(MRJobConfig.MAPREDUCE_JOB_DIR, submitJobDir.toString());
      LOG.debug("Configuring job " + jobId + " with " + submitJobDir
          + " as the submit dir");
      // get delegation token for the dir
      TokenCache.obtainTokensForNamenodes(job.getCredentials(),
          new Path[] { submitJobDir }, conf);

      populateTokenCache(conf, job.getCredentials());
      // generate a secret to authenticate shuffle transfers
      if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {
        KeyGenerator keyGen;
        try {
          // Key length: the configured encrypted-intermediate-data key size
          // when shuffle encryption is on, otherwise SHUFFLE_KEY_LENGTH.
          int keyLen = CryptoUtils.isShuffleEncrypted(conf)
              ? conf.getInt(MRJobConfig.MR_ENCRYPTED_INTERMEDIATE_DATA_KEY_SIZE_BITS,
                  MRJobConfig.DEFAULT_MR_ENCRYPTED_INTERMEDIATE_DATA_KEY_SIZE_BITS)
              : SHUFFLE_KEY_LENGTH;
          // Initialize the key generator with SHUFFLE_KEYGEN_ALGORITHM
          // (Hmac_SHA1 per the article) and keyLen.
          keyGen = KeyGenerator.getInstance(SHUFFLE_KEYGEN_ALGORITHM);
          keyGen.init(keyLen);
        } catch (NoSuchAlgorithmException e) {
          throw new IOException("Error generating shuffle secret key", e);
        }
        SecretKey shuffleKey = keyGen.generateKey();
        TokenCache.setShuffleSecretKey(shuffleKey.getEncoded(),
            job.getCredentials());
      }
      // Copy all resource files into the submit (staging) directory.
      copyAndConfigureFiles(job, submitJobDir);
      // Path of the job configuration file (job.xml per the article).
      Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);

      // Create the splits for the job
      LOG.debug("Creating splits at " + jtFs.makeQualified(submitJobDir));
      // The number of map tasks is taken from the number of input splits.
      // Article note: this triggers a series of further operations that the
      // next article covers, and it is why setting mapreduce.job.maps alone
      // does not control the number of maps.
      int maps = writeSplits(job, submitJobDir);
      // NUM_MAPS is set directly from the split count here; whatever value
      // the configuration held before is overwritten.
      conf.setInt(MRJobConfig.NUM_MAPS, maps);
      LOG.info("number of splits:" + maps);
      // write "queue admins of the queue to which job is being submitted"
      // to job file.
      // Queue the job is submitted to; falls back to JobConf.DEFAULT_QUEUE_NAME
      // (per the article: mapreduce.job.queuename in mapred-site.xml,
      // default "default").
      String queue = conf.get(MRJobConfig.QUEUE_NAME,
          JobConf.DEFAULT_QUEUE_NAME);
      AccessControlList acl = submitClient.getQueueAdmins(queue);
      conf.set(toFullPropertyName(queue,
          QueueACL.ADMINISTER_JOBS.getAclName()), acl.getAclString());
      // removing jobtoken referrals before copying the jobconf to HDFS
      // as the tasks don't need this setting, actually they may break
      // because of it if present as the referral will point to a
      // different job.
      TokenCache.cleanUpTokenReferral(conf);
      if (conf.getBoolean(
          MRJobConfig.JOB_TOKEN_TRACKING_IDS_ENABLED,
          MRJobConfig.DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED)) {
        // Add HDFS tracking ids
        ArrayList<String> trackingIds = new ArrayList<String>();
        for (Token<? extends TokenIdentifier> t :
            job.getCredentials().getAllTokens()) {
          trackingIds.add(t.decodeIdentifier().getTrackingId());
        }
        conf.setStrings(MRJobConfig.JOB_TOKEN_TRACKING_IDS,
            trackingIds.toArray(new String[trackingIds.size()]));
      }
      // Set reservation info if it exists
      ReservationId reservationId = job.getReservationId();
      if (reservationId != null) {
        conf.set(MRJobConfig.RESERVATION_ID, reservationId.toString());
      }
      // Write job file to submit dir
      writeConf(conf, submitJobFile);

      //
      // Now, actually submit the job (using the submit name)
      //
      printTokens(jobId, job.getCredentials());
      status = submitClient.submitJob(
          jobId, submitJobDir.toString(), job.getCredentials());
      if (status != null) {
        return status;
      } else {
        throw new IOException("Could not launch job");
      }
    } finally {
      // status is still null when submission threw or returned null; in that
      // case the per-job submit directory is deleted recursively.
      if (status == null) {
        LOG.info("Cleaning up the staging area " + submitJobDir);
        if (jtFs != null && submitJobDir != null)
          jtFs.delete(submitJobDir, true);
      }
    }
  }

4、暂时先回到第 1 步的 waitForCompletion 方法：作业提交后，如果 verbose 为 true，就会进入循环监控方法 monitorAndPrintJob()

public boolean monitorAndPrintJob()       throws IOException, InterruptedException {    String lastReport = null;    Job.TaskStatusFilter filter;    Configuration clientConf = getConfiguration();    filter = Job.getTaskOutputFilter(clientConf);    JobID jobId = getJobID();    LOG.info("Running job: " + jobId);    int eventCounter = 0;    boolean profiling = getProfileEnabled();    IntegerRanges mapRanges = getProfileTaskRange(true);    IntegerRanges reduceRanges = getProfileTaskRange(false);    int progMonitorPollIntervalMillis =       Job.getProgressPollInterval(clientConf);    /* make sure to report full progress after the job is done */    boolean reportedAfterCompletion = false;    boolean reportedUberMode = false;    //如果作业status为RUNNING，PREP，则继续循环，循环时间由mapreduce.client.progressmonitor.pollinterval控制，默认1000    while (!isComplete() || !reportedAfterCompletion) {      if (isComplete()) {        reportedAfterCompletion = true;      } else {        Thread.sleep(progMonitorPollIntervalMillis);      }      if (status.getState() == JobStatus.State.PREP) {        continue;      }            if (!reportedUberMode) {        reportedUberMode = true;        //循环打印是否开isUber模式，在作业的控制台输出，我们都能看到这个        LOG.info("Job " + jobId + " running in uber mode : " + isUber());      }            //打印map,reduce任务的进行百分比,这里是一个有意思的地方      String report =         (" map " + StringUtils.formatPercent(mapProgress(), 0)+            " reduce " +             StringUtils.formatPercent(reduceProgress(), 0));      if (!report.equals(lastReport)) {        LOG.info(report);        lastReport = report;      }      TaskCompletionEvent[] events =         getTaskCompletionEvents(eventCounter, 10);       eventCounter += events.length;      printTaskEvents(events, filter, profiling, mapRanges, reduceRanges);    }    boolean success = isSuccessful();    //结束或者失败,打印出信息    if (success) {      LOG.info("Job " + jobId + " completed successfully");    } else {      LOG.info("Job " + jobId + " failed with state 
" + status.getState() +           " due to: " + status.getFailureInfo());    }    Counters counters = getCounters();    if (counters != null) {      LOG.info(counters.toString());    }    return success;  }

以上就是初步的一些分析。还有一些细节只做了说明解释，并未贴出代码：如果涉及到的代码都贴出来，篇幅会太大。下一篇准备做一个旁支，详细解释 splits 的生成以及 map 数量的确定；如果可能的话，再分析 map 任务的本地化算法和 host 机器的选择。如果花的时间较长，这部分可能会延后，但是肯定会写出来。

欢迎加入hadoop技术交流群：481116275

1 0