Spark2.2 DAGScheduler源码分析[stage划分算法源码剖析]

来源：互联网发布：js弹出窗口居中编辑：程序博客网时间：2024/05/22 08:08

概述

DAGScheduler的stage的划分算法：

会从触发action操作的那个rdd开始向前倒推；
首先会为最后一个rdd创建一个stage，之后向前倒推的时候，会判断rdd的依赖，如果发现某个rdd是宽依赖，就会为宽依赖的那个rdd创建一个新的stage，这个rdd就是新的stage的最后一个rdd；
继续倒推，依据rdd的宽窄依赖，进行stage的划分，直到遍历完所有的rdd。

这里写图片描述

源码分析

调度执行的入口

sc.runJob() 的调度执行的入口：dagScheduler.runJob()

  /**
   * Runs a function on the given partitions of an RDD and hands every result to the
   * supplied handler function. This is the main entry point for all Spark actions.
   *
   * @param rdd           target RDD to run the job on
   * @param func          function applied per partition; receives the task context and an
   *                      iterator over the partition's elements
   * @param partitions    partitions of `rdd` to run on
   * @param resultHandler callback invoked with each result
   * @throws IllegalStateException if the context has already been stopped
   */
  def runJob[T, U: ClassTag](
      rdd: RDD[T],
      func: (TaskContext, Iterator[T]) => U,
      partitions: Seq[Int],
      resultHandler: (Int, U) => Unit): Unit = {
    // Refuse to schedule anything once the context has been stopped.
    if (stopped.get()) throw new IllegalStateException("SparkContext has been shutdown")

    val site = getCallSite
    val closure = clean(func)
    logInfo(s"Starting job: ${site.shortForm}")

    // Optionally dump the RDD lineage, controlled by the "spark.logLineage" setting.
    val lineageLoggingEnabled = conf.getBoolean("spark.logLineage", false)
    if (lineageLoggingEnabled) {
      logInfo(s"RDD's recursive dependencies:\n${rdd.toDebugString}")
    }

    // Scheduling entry point: delegate the actual job execution to the DAGScheduler.
    val jobProperties = localProperties.get
    dagScheduler.runJob(rdd, closure, partitions, site, resultHandler, jobProperties)

    for (bar <- progressBar) bar.finishAll()
    rdd.doCheckpoint()
  }

DAGScheduler 调度的核心入口

dagScheduler.runJob() —> dagScheduler.handleJobSubmitted()

第一步：使用触发job的最后一个RDD，创建finalStage => ResultStage
第二步：用finalStage创建一个job
第三步：将job加入内存缓存中，并启动job
第四步：使用submitStage()提交finalStage

  /**
   * Core scheduling entry point of the DAGScheduler for a newly submitted job.
   *
   * Steps performed here:
   *  1. create the final `ResultStage` from the RDD that triggered the job;
   *  2. create an `ActiveJob` from that final stage;
   *  3. register the job in the scheduler's in-memory bookkeeping and post a
   *     `SparkListenerJobStart` event;
   *  4. submit the final stage via `submitStage`.
   *
   * Outline of the stage-splitting algorithm that starts from here (per the original
   * author's notes): walk backwards from the final stage, split stages at wide (shuffle)
   * dependencies, and recursively submit parent stages first.
   *
   * @param jobId      id of the job being submitted
   * @param finalRDD   RDD from which the final stage is created
   * @param func       function forwarded to `createResultStage`
   * @param partitions partitions forwarded to `createResultStage`; its length is logged as
   *                   the number of output partitions
   * @param callSite   call site used for the stage, the job and log messages
   * @param listener   job listener; `jobFailed` is invoked if creating the stage throws
   * @param properties properties attached to the job and to the job-start event
   */
  private[scheduler] def handleJobSubmitted(jobId: Int,
                                            finalRDD: RDD[_],
                                            func: (TaskContext, Iterator[_]) => _,
                                            partitions: Array[Int],
                                            callSite: CallSite,
                                            listener: JobListener,
                                            properties: Properties): Unit = {
    // Step 1: create the final stage (a ResultStage) from the RDD that triggered the job.
    val finalStage: ResultStage =
      try {
        createResultStage(finalRDD, func, partitions, jobId, callSite)
      } catch {
        // Creating a new stage may throw, e.g. (per the original comment) when a HadoopRDD
        // runs and the HDFS file it depends on has been deleted.
        case e: Exception =>
          logWarning("Creating new stage failed due to exception - job: " + jobId, e)
          listener.jobFailed(e)
          return
      }

    // Step 2: create the job from the final stage.
    val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
    clearCacheLocs()
    logInfo("Got job %s (%s) with %d output partitions".format(
      job.jobId, callSite.shortForm, partitions.length))
    logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
    logInfo("Parents of final stage: " + finalStage.parents)
    logInfo("Missing parents: " + getMissingParentStages(finalStage))

    val jobSubmissionTime = clock.getTimeMillis()

    // Step 3: register the job in the in-memory bookkeeping structures.
    jobIdToActiveJob(jobId) = job
    activeJobs += job
    finalStage.setActiveJob(job)
    val stageIds = jobIdToStageIds(jobId).toArray
    val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
    listenerBus.post(
      SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))

    // Step 4: submit the final stage.
    submitStage(finalStage)
  }

stage划分算法与task最佳位置判断的核心代码

submitStage(finalStage)

  /**
   * Submits a stage, but first recursively submits any of its missing parent stages.
   *
   * If the stage has no active job it is aborted. If it is already waiting, running or
   * failed, nothing happens.
   *
   * @param stage the stage to submit
   */
  private def submitStage(stage: Stage): Unit = {
    activeJobForStage(stage) match {
      case Some(jobId) =>
        logDebug("submitStage(" + stage + ")")
        if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
          // Fetch the missing parent stages of this stage, sorted by ascending id.
          val missing = getMissingParentStages(stage).sortBy(_.id)
          logDebug("missing: " + missing)
          if (missing.isEmpty) {
            // The recursion bottoms out here: a stage with no missing parents has its
            // tasks submitted right away.
            logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
            submitMissingTasks(stage, jobId)
          } else {
            // Recursively submit the parent stages first ...
            missing.foreach(submitStage)
            // ... and park the current stage in waitingStages until they are done.
            waitingStages += stage
          }
        }
      case None =>
        abortStage(stage, "No active job for stage " + stage.id, None)
    }
  }

stage划分算法核心

getMissingParentStages()

  /**
   * Returns the parent stages of `stage` that are still missing, i.e. the shuffle map
   * stages reached from the stage's RDD whose `isAvailable` is false.
   *
   * The RDD graph is walked backwards from `stage.rdd`. Narrow dependencies are followed
   * within the same stage; every shuffle dependency yields (or looks up) a shuffle map
   * stage and is not traversed further. RDDs without uncached partitions are not expanded.
   *
   * @param stage the stage whose missing parents are requested
   * @return the missing parent stages, in no particular order
   */
  private def getMissingParentStages(stage: Stage): List[Stage] = {
    val missing = new HashSet[Stage]
    val visited = new HashSet[RDD[_]]
    // An explicit stack (last in, first out) is used instead of recursive calls to avoid
    // a stack overflow caused by recursively visiting the RDD graph.
    val waitingForVisit = new Stack[RDD[_]]

    def visit(rdd: RDD[_]): Unit = {
      // Only handle RDDs that have not been visited yet.
      if (!visited(rdd)) {
        visited += rdd
        val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
        if (rddHasUncachedPartitions) {
          // Walk over the dependencies of this RDD.
          // Original author's note (not verifiable from this excerpt): each shuffle
          // operation such as reduceByKey, groupByKey or countByKey is backed by three
          // RDDs: MapPartitionsRDD, ShuffledRDD, MapPartitionsRDD.
          for (dep <- rdd.dependencies) {
            dep match {
              // Wide dependency: get or create the ShuffleMapStage for it. Per the original
              // author's note, the final stage is not a ShuffleMapStage, but all stages
              // before it are.
              case shufDep: ShuffleDependency[_, _, _] =>
                val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId)
                if (!mapStage.isAvailable) {
                  // Record the parent stage as missing.
                  missing += mapStage
                }
              // Narrow dependency: push the parent RDD onto the stack so it is visited
              // as part of the same stage.
              case narrowDep: NarrowDependency[_] =>
                waitingForVisit.push(narrowDep.rdd)
            }
          }
        }
      }
    }

    // Start from the stage's own RDD.
    waitingForVisit.push(stage.rdd)
    // Keep visiting until the stack is empty; pop() removes and returns the most recently
    // pushed RDD.
    while (waitingForVisit.nonEmpty) {
      visit(waitingForVisit.pop())
    }
    missing.toList
  }

task启动的最佳位置

submitMissingTasks() 找出task启动的最佳位置

(1) 对于finalStage之外的stage，创建ShuffleMapTask；
(2) 对于finalStage，创建ResultTask；
(3) 在创建task之前，需要根据分区id，从stage的rdd往前推，寻找被cache或者checkpoint的位置，作为task启动的最佳位置；
(4) 最后，针对stage的所有task，创建TaskSet对象，调用submitTasks()方法，提交TaskSet。

  /**
   * Called when the stage's parents are available and its tasks can now be run.
   *
   * Determines the partitions that still need computing, computes preferred locations for
   * each of them, creates one task per partition (`ShuffleMapTask` for a
   * `ShuffleMapStage`, `ResultTask` for a `ResultStage`) and submits them as a `TaskSet`.
   * If there is nothing to run, the stage is marked as finished and waiting child stages
   * are submitted.
   *
   * NOTE(review): `taskBinary` is referenced below but is not defined anywhere in this
   * excerpt; presumably the code that serializes and broadcasts the task binary was
   * omitted. Confirm against the full source before using this block.
   *
   * @param stage the stage whose missing tasks are submitted
   * @param jobId id of the job the stage is submitted for
   */
  private def submitMissingTasks(stage: Stage, jobId: Int): Unit = {
    logDebug("submitMissingTasks(" + stage + ")")

    // First figure out the indexes of the partitions to compute.
    val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()

    val properties = jobIdToActiveJob(jobId).properties

    runningStages += stage
    stage match {
      case s: ShuffleMapStage =>
        outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
      case s: ResultStage =>
        outputCommitCoordinator.stageStart(
          stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
    }

    // Compute the preferred locations (PreferredLocs) for every partition to compute.
    // Per the original author's note, getPreferredLocs presumably walks back from the
    // stage's RDD looking for cached / checkpointed data; not verifiable from this excerpt.
    val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
      stage match {
        case _: ShuffleMapStage =>
          partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id)) }.toMap
        case s: ResultStage =>
          partitionsToCompute.map { id =>
            val p = s.partitions(id)
            (id, getPreferredLocs(stage.rdd, p))
          }.toMap
      }
    } catch {
      case NonFatal(e) =>
        stage.makeNewStageAttempt(partitionsToCompute.size)
        listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
        abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
        runningStages -= stage
        return
    }

    // Create one task per partition to compute.
    val tasks: Seq[Task[_]] = try {
      val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
      stage match {
        // [1] A ShuffleMapStage gets ShuffleMapTasks.
        case stage: ShuffleMapStage =>
          stage.pendingPartitions.clear()
          partitionsToCompute.map { id =>
            // One task per partition, each with its preferred locations.
            val locs = taskIdToLocations(id)
            val part = stage.rdd.partitions(id)
            stage.pendingPartitions += id
            new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
              Option(sc.applicationId), sc.applicationAttemptId)
          }

        // [2] A ResultStage gets ResultTasks.
        case stage: ResultStage =>
          partitionsToCompute.map { id =>
            val p: Int = stage.partitions(id)
            val part = stage.rdd.partitions(p)
            val locs = taskIdToLocations(id)
            new ResultTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, id, properties, serializedTaskMetrics,
              Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
          }
      }
    } catch {
      case NonFatal(e) =>
        abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
        runningStages -= stage
        return
    }

    if (tasks.nonEmpty) {
      logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +
        s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")
      // [3] Wrap the stage's tasks in a TaskSet and hand it to the task scheduler.
      // Original author's note (not verifiable from this excerpt): in standalone mode the
      // scheduler is TaskSchedulerImpl by default; TaskScheduler itself is only a trait.
      taskScheduler.submitTasks(new TaskSet(
        tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
      stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
    } else {
      // No tasks need to run, so the stage is marked as finished right here.
      markStageAsFinished(stage, None)

      val debugString = stage match {
        case stage: ShuffleMapStage =>
          s"Stage ${stage} is actually done; " +
            s"(available: ${stage.isAvailable}," +
            s"available outputs: ${stage.numAvailableOutputs}," +
            s"partitions: ${stage.numPartitions})"
        case stage: ResultStage =>
          s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})"
      }
      logDebug(debugString)

      submitWaitingChildStages(stage)
    }
  }

阅读全文

1 0