Spark stage切分和提交

来源:互联网 发布:java基础课程设计 编辑:程序博客网 时间:2024/05/16 01:56
    客户端构建好RDD的DAG以后,会提交至DAGScheduler来处理,这是一个Stage级别的调度器,它首先会把作业切分为一个个Stage,每个Stage由一组相同运算的tasks组成,然后会以taskset的形式提交给TaskScheduler。DAGScheduler还会跟踪stage的输出与物化情况、检测task运行时的最优位置,并重新提交失败的stage。
    DAGScheduler接收到JobSubmitted的消息后,进入作业提交逻辑
  /**
   * Dispatches a scheduler event to its handler.
   *
   * Only the `JobSubmitted` case is shown in this excerpt: it forwards every
   * field of the event to `dagScheduler.handleJobSubmitted`.
   *
   * @param event the scheduler event to handle
   */
  override def onReceive(event: DAGSchedulerEvent): Unit = event match {
    case JobSubmitted(jobId, rdd, func, partitions, allowLocal, callSite, listener, properties) =>
      // The job is submitted here.
      dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, allowLocal, callSite,
        listener, properties)
    // NOTE: the original excerpt stops after this case; the handlers for the
    // remaining DAGSchedulerEvent types are not shown.
  }
提交过程中会创建finalStage,根据该stage创建ActiveJob,最后启动该Job
  /**
   * Handles a submitted job: creates the final stage, wraps it in an
   * `ActiveJob`, and starts the job.
   *
   * This is an abridged excerpt. Parts that the original article elided are
   * kept as comments so the overall control flow stays readable.
   *
   * @param jobId      id of the submitted job
   * @param finalRDD   RDD the final stage is built from
   * @param func       function passed through to the `ActiveJob`
   * @param partitions partitions passed to the `ActiveJob`; their count is
   *                   used as the task count of the final stage
   * @param allowLocal not used in the visible part of this excerpt
   * @param callSite   call site recorded on the stage and the job
   * @param listener   job listener passed to the `ActiveJob`
   * @param properties properties attached to the job and the job-start event
   */
  private[scheduler] def handleJobSubmitted(jobId: Int,
      finalRDD: RDD[_],
      func: (TaskContext, Iterator[_]) => _,
      partitions: Array[Int],
      allowLocal: Boolean,
      callSite: CallSite,
      listener: JobListener,
      properties: Properties): Unit = {
    // Remains null when newStage fails; the null check below skips the job setup.
    var finalStage: Stage = null
    try {
      finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)
    } catch {
      // .... (exception handling elided in the original excerpt)
    }
    if (finalStage != null) {
      val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)
      clearCacheLocs()
      // ...... (elided in the original excerpt)
      // ...... (elided in the original excerpt)
      // NOTE(review): shouldRunLocally and jobSubmissionTime are presumably
      // defined in the elided lines above; they are not visible here.
      if (shouldRunLocally) {
        // ... (local-execution branch elided in the original excerpt)
      } else {
        // Register the job, then announce it and submit its final stage.
        jobIdToActiveJob(jobId) = job
        activeJobs += job
        finalStage.resultOfJob = Some(job)
        val stageIds = jobIdToStageIds(jobId).toArray
        val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
        listenerBus.post(
          SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
        submitStage(finalStage)
      }
    }
    // Runs on every path, including when the final stage could not be created.
    submitWaitingStages()
  }
  在RDD的依赖关系中,有两种依赖:宽依赖和窄依赖,RDD遇到窄依赖会归到一个Stage中,形成pipeline,遇到宽依赖则切分stage,通常有shuffle就会形成宽依赖,所以shuffle成了stage切分的边界,newStage函数会生成stage的DAG图,该图记录了当前作业中各stage的依赖情况
  /**
   * Creates a new stage for `rdd` and registers it.
   *
   * The parent stages are resolved first and passed to the new `Stage`, so the
   * stages link to each other and form the stage DAG of the job.
   *
   * @param rdd        RDD the stage is built from
   * @param numTasks   number of tasks of the stage
   * @param shuffleDep shuffle dependency passed through to the `Stage`, if any
   * @param jobId      id of the job the stage belongs to
   * @param callSite   call site recorded on the stage
   * @return the newly created stage
   */
  private def newStage(
      rdd: RDD[_],
      numTasks: Int,
      shuffleDep: Option[ShuffleDependency[_, _, _]],
      jobId: Int,
      callSite: CallSite): Stage = {
    // Resolve the parent stages.
    val parentStages = getParentStages(rdd, jobId)
    // Allocate the stage id.
    val id = nextStageId.getAndIncrement()
    // Create the new stage; passing the parents in forms the dependency links
    // that make up the stage DAG.
    val stage = new Stage(id, rdd, numTasks, shuffleDep, parentStages, jobId, callSite)
    stageIdToStage(id) = stage
    updateJobIdStageIdMaps(jobId, stage)
    stage
  }
  获取父stage的流程如下,注意visit函数,从该段逻辑中可以看出,只有碰到shuffle依赖才会切割stage
  /**
   * Collects the parent stages of the stage that `rdd` belongs to.
   *
   * Walks the dependency graph upwards from `rdd`. A shuffle dependency yields
   * a parent stage (via `getShuffleMapStage`) and is not followed further;
   * any other dependency is followed to its RDD. If no shuffle dependency is
   * reachable, the result is empty.
   *
   * @param rdd   RDD to start the traversal from
   * @param jobId id of the job, passed to `getShuffleMapStage`
   * @return the parent stages found, without duplicates
   */
  private def getParentStages(rdd: RDD[_], jobId: Int): List[Stage] = {
    val parents = new HashSet[Stage]
    val visited = new HashSet[RDD[_]]
    // We are manually maintaining a stack here to prevent StackOverflowError
    // caused by recursively visiting
    val waitingForVisit = new Stack[RDD[_]]

    def visit(r: RDD[_]): Unit = {
      if (!visited(r)) {
        visited += r
        // Note: a dependency between two stages is always a shuffle dependency.
        // The search keeps going upwards until a shuffle dependency is seen;
        // if there is none, the job consists of a single stage.
        for (dep <- r.dependencies) {
          dep match {
            case shufDep: ShuffleDependency[_, _, _] =>
              parents += getShuffleMapStage(shufDep, jobId)
            case _ =>
              waitingForVisit.push(dep.rdd)
          }
        }
      }
    }

    waitingForVisit.push(rdd)
    // Keep traversing upwards until no RDD is left to visit.
    while (waitingForVisit.nonEmpty) {
      visit(waitingForVisit.pop())
    }
    parents.toList
  }
下面是父stage,也就是shuffle stage的创建流程
/**
 * Returns the shuffle map stage for `shuffleDep`, creating it on first use.
 *
 * A stage already recorded in `shuffleToMapStage` under the dependency's
 * shuffle id is returned as is. Otherwise the ancestor shuffle dependencies
 * are registered first, then a stage for this dependency is created through
 * `newOrUsedStage` and recorded under the shuffle id.
 *
 * @param shuffleDep shuffle dependency whose map stage is wanted
 * @param jobId      id of the job, passed to the registration and creation calls
 * @return the existing or newly created stage
 */
private def getShuffleMapStage(shuffleDep: ShuffleDependency[_, _, _], jobId: Int): Stage = {
  shuffleToMapStage.get(shuffleDep.shuffleId).getOrElse {
    // We are going to register ancestor shuffle dependencies
    registerShuffleDependencies(shuffleDep, jobId)
    // Then register current shuffleDep
    val stage =
      newOrUsedStage(
        shuffleDep.rdd, shuffleDep.rdd.partitions.size, shuffleDep, jobId,
        shuffleDep.rdd.creationSite)
    shuffleToMapStage(shuffleDep.shuffleId) = stage
    stage
  }
}
最终通过递归方式提交stage
/**
 * Submits `stage`, recursively submitting its missing parent stages first.
 *
 * If no active job is found for the stage, the stage is aborted. A stage that
 * is already waiting, running, or failed is left alone. Otherwise:
 *  - with no missing parents, its tasks are submitted right away;
 *  - with missing parents, each parent is submitted (in ascending id order)
 *    and this stage is added to `waitingStages`.
 *
 * @param stage the stage to submit
 */
private def submitStage(stage: Stage): Unit = {
  activeJobForStage(stage) match {
    case Some(jobId) =>
      logDebug(s"submitStage($stage)")
      if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
        val missing = getMissingParentStages(stage).sortBy(_.id)
        logDebug(s"missing: $missing")
        if (missing.isEmpty) {
          logInfo(s"Submitting $stage (${stage.rdd}), which has no missing parents")
          submitMissingTasks(stage, jobId)
        } else {
          // Parents go first; this stage waits until they are done.
          missing.foreach(submitStage)
          waitingStages += stage
        }
      }
    case None =>
      abortStage(stage, s"No active job for stage ${stage.id}")
  }
}


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
1 0
原创粉丝点击