Spark 1.6.0 Core Source Code Analysis 9: Looking at Actions Through a Simple Example

This section uses reduce as an example to walk through how an action is executed.
First look at the submitJob method: it sends the processing function we supplied to reduce along with the JobSubmitted message, because every partition needs to call that function to compute its partial result.

The resultHandler, by contrast, is the final merge function: whenever a task finishes, resultHandler is called to fold that task's result into the overall value. It therefore does not need to travel with the JobSubmitted message; instead it is kept in the JobWaiter.

org.apache.spark.rdd.RDD

/**
 * Reduces the elements of this RDD using the specified commutative and
 * associative binary operator.
 */
def reduce(f: (T, T) => T): T = withScope {
  val cleanF = sc.clean(f)
  val reducePartition: Iterator[T] => Option[T] = iter => {
    if (iter.hasNext) {
      Some(iter.reduceLeft(cleanF))
    } else {
      None
    }
  }
  var jobResult: Option[T] = None
  val mergeResult = (index: Int, taskResult: Option[T]) => {
    if (taskResult.isDefined) {
      jobResult = jobResult match {
        case Some(value) => Some(f(value, taskResult.get))
        case None => taskResult
      }
    }
  }
  sc.runJob(this, reducePartition, mergeResult)
  // Get the final result out of our Option, or throw an exception if the RDD was empty
  jobResult.getOrElse(throw new UnsupportedOperationException("empty collection"))
}
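For context, here is a minimal, hypothetical driver program that exercises exactly this path (application name and master URL are illustrative):

import org.apache.spark.{SparkConf, SparkContext}

object ReduceExample {
  def main(args: Array[String]): Unit = {
    // A tiny local job: reduce is the action that kicks off sc.runJob.
    val sc = new SparkContext(new SparkConf().setAppName("reduce-example").setMaster("local[2]"))
    // Each of the 4 partitions runs reducePartition over its iterator;
    // mergeResult then folds the 4 partial sums into the final value on the driver.
    val sum = sc.parallelize(1 to 100, numSlices = 4).reduce(_ + _)
    println(s"sum = $sum")  // 5050
    sc.stop()
  }
}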

SparkContext.scala

/**
 * Run a function on a given set of partitions in an RDD and pass the results to the given
 * handler function. This is the main entry point for all actions in Spark.
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    resultHandler: (Int, U) => Unit): Unit = {
  if (stopped.get()) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
  progressBar.foreach(_.finishAll())
  rdd.doCheckpoint()
}
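The call sc.runJob(this, reducePartition, mergeResult) inside reduce actually goes through a convenience overload first, which ignores the TaskContext argument and targets every partition of the RDD. Roughly (paraphrased from Spark 1.6's SparkContext, not a verbatim copy):

// Paraphrased: wrap the Iterator-only function so it ignores the TaskContext,
// then run it on all partitions and forward per-partition results to resultHandler.
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    processPartition: Iterator[T] => U,
    resultHandler: (Int, U) => Unit): Unit = {
  val processFunc = (context: TaskContext, iter: Iterator[T]) => processPartition(iter)
  runJob[T, U](rdd, processFunc, 0 until rdd.partitions.length, resultHandler)
}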

DAGScheduler.scala

def runJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): Unit = {
  val start = System.nanoTime
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
  waiter.awaitResult() match {
    case JobSucceeded =>
      logInfo("Job %d finished: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
    case JobFailed(exception: Exception) =>
      logInfo("Job %d failed: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
      // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
      val callerStackTrace = Thread.currentThread().getStackTrace.tail
      exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
      throw exception
  }
}
def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): JobWaiter[U] = {
  // Check to make sure we are not launching a task on a partition that does not exist.
  val maxPartitions = rdd.partitions.length
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
        "Total number of partitions: " + maxPartitions)
  }
  val jobId = nextJobId.getAndIncrement()
  if (partitions.size == 0) {
    // Return immediately if the job is running 0 tasks
    return new JobWaiter[U](this, jobId, 0, resultHandler)
  }
  assert(partitions.size > 0)
  val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
  val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
  eventProcessLoop.post(JobSubmitted(
    jobId, rdd, func2, partitions.toArray, callSite, waiter,
    SerializationUtils.clone(properties)))
  waiter
}
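This is where the resultHandler ends up. Below is a much-abridged sketch of Spark 1.6's JobWaiter (the real class also handles job cancellation and more state): taskSucceeded applies resultHandler, which is mergeResult in the reduce example, once per finished task, and awaitResult blocks the calling thread until every task has reported in.

// Abridged sketch of org.apache.spark.scheduler.JobWaiter (Spark 1.6).
private[spark] class JobWaiter[T](
    dagScheduler: DAGScheduler,
    val jobId: Int,
    totalTasks: Int,
    resultHandler: (Int, T) => Unit) extends JobListener {

  private var finishedTasks = 0
  private var jobResult: JobResult = if (totalTasks == 0) JobSucceeded else null

  // Called by the DAGScheduler for each finished ResultTask: hand the task's result
  // to resultHandler (mergeResult), and mark the job finished once all tasks are in.
  override def taskSucceeded(index: Int, result: Any): Unit = synchronized {
    resultHandler(index, result.asInstanceOf[T])
    finishedTasks += 1
    if (finishedTasks == totalTasks) {
      jobResult = JobSucceeded
    }
    this.notifyAll()
  }

  override def jobFailed(exception: Exception): Unit = synchronized {
    jobResult = JobFailed(exception)
    this.notifyAll()
  }

  // Blocks the thread that called DAGScheduler.runJob until the job finishes.
  def awaitResult(): JobResult = synchronized {
    while (jobResult == null) { this.wait() }
    jobResult
  }
}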


private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties) {
  var finalStage: ResultStage = null
  try {
    // New stage creation may throw an exception if, for example, jobs are run on a
    // HadoopRDD whose underlying HDFS files have been deleted.
    finalStage = newResultStage(finalRDD, func, partitions, jobId, callSite)
  } catch {
    case e: Exception =>
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      listener.jobFailed(e)
      return
  }
  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  logInfo("Got job %s (%s) with %d output partitions".format(
    job.jobId, callSite.shortForm, partitions.length))
  logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
  logInfo("Parents of final stage: " + finalStage.parents)
  logInfo("Missing parents: " + getMissingParentStages(finalStage))
  val jobSubmissionTime = clock.getTimeMillis()
  jobIdToActiveJob(jobId) = job
  activeJobs += job
  finalStage.setActiveJob(job)
  val stageIds = jobIdToStageIds(jobId).toArray
  val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
  listenerBus.post(
    SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
  submitStage(finalStage)
  submitWaitingStages()
}

private def newResultStage(
    rdd: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    jobId: Int,
    callSite: CallSite): ResultStage = {
  val (parentStages: List[Stage], id: Int) = getParentStagesAndId(rdd, jobId)
  val stage = new ResultStage(id, rdd, func, partitions, parentStages, jobId, callSite)
  stageIdToStage(id) = stage
  updateJobIdStageIdMaps(jobId, stage)
  stage
}

The logic inside getParentStagesAndId: starting from the final RDD, it inspects the dependencies looking for shuffle dependencies; a narrow dependency is simply traversed, walking backwards until a shuffle dependency is reached. This yields all shuffle dependencies closest to the final RDD. In the diagram above, these are the dependencies produced by the groupBy and join operations.

Once these shuffle dependencies are found, the search continues backwards to collect every shuffle dependency of the whole job, and a ShuffleMapStage is created at each shuffle boundary. Every shuffleId is registered with the mapOutputTracker, which tracks where each ShuffleMapStage's output lives.
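To make the stage-cutting rule concrete, consider a hypothetical word-count job (assuming sc is an existing SparkContext; the input path is illustrative). It has a single shuffle dependency, so it is split into exactly two stages:

// Hypothetical example: one shuffle dependency => two stages.
// Everything up to reduceByKey's shuffle becomes a ShuffleMapStage;
// the part after the shuffle (collect's ResultTask work) becomes the
// ResultStage returned by newResultStage.
val counts = sc.textFile("hdfs:///tmp/input")   // narrow dependencies only
  .flatMap(_.split(" "))
  .map(word => (word, 1))
  .reduceByKey(_ + _)                            // ShuffleDependency: stage boundary
counts.collect()                                 // action => 1 ResultStage + 1 parent ShuffleMapStage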
In newResultStage, getParentStagesAndId returns only the stages closest to the final RDD (its direct parent stages), together with a fresh stage id.
A ResultStage is then built from the parent stages, the partitions, the stage id, the final RDD, the jobId, and so on, and the jobId is recorded in the jobIds set of every stage involved. (Yes, a single stage can belong to several jobs: when different actions share part of a lineage, the DAGScheduler reuses existing shuffle stages instead of recreating them, so a stage accumulates the ids of every job that depends on it.)
At this point, construction of the finalStage is complete.
Next, an ActiveJob is created. It merely wraps those parameters and keeps one member that records, for each partition, whether it has finished.
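For reference, that bookkeeping looks roughly like this in Spark 1.6's ActiveJob (abridged sketch; methods omitted):

// Abridged sketch of org.apache.spark.scheduler.ActiveJob (Spark 1.6):
// mostly a parameter holder, plus a per-partition completion flag.
private[spark] class ActiveJob(
    val jobId: Int,
    val finalStage: Stage,
    val callSite: CallSite,
    val listener: JobListener,
    val properties: Properties) {

  // For a ResultStage this is the number of partitions the action computes.
  val numPartitions: Int = finalStage match {
    case r: ResultStage => r.partitions.length
    case m: ShuffleMapStage => m.rdd.partitions.length
  }

  // finished(i) becomes true once partition i's ResultTask has reported back.
  val finished: Array[Boolean] = Array.fill[Boolean](numPartitions)(false)
  var numFinished: Int = 0
}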

Finally, submitStage is called to submit the finalStage.


private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id)
      logDebug("missing: " + missing)
      if (missing.isEmpty) {
        // No unfinished parent stages: submit this stage's tasks directly
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get)
      } else {
        // Some parent stages are still missing: recursively submit the parents first,
        // and park this stage in waitingStages until they complete
        for (parent <- missing) {
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}
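For the hypothetical two-stage word-count job sketched earlier, the submission order would play out roughly as follows:

// Hypothetical trace of submitStage for the two-stage word-count job:
//
//   submitStage(ResultStage 1)
//     getMissingParentStages -> List(ShuffleMapStage 0)   // shuffle output not yet computed
//     submitStage(ShuffleMapStage 0)
//       getMissingParentStages -> Nil
//       submitMissingTasks(ShuffleMapStage 0)             // ShuffleMapTasks launched
//     waitingStages += ResultStage 1
//
//   ... all ShuffleMapTasks finish, submitWaitingStages() runs ...
//
//   submitStage(ResultStage 1)
//     getMissingParentStages -> Nil
//     submitMissingTasks(ResultStage 1)                   // ResultTasks launched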


/** Called when stage's parents are available and we can now do its task. */
private def submitMissingTasks(stage: Stage, jobId: Int) {
  logDebug("submitMissingTasks(" + stage + ")")
  // Get our pending tasks and remember them in our pendingTasks entry
  stage.pendingPartitions.clear()

  // First figure out the indexes of partition ids to compute.
  val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()

  // Create internal accumulators if the stage has no accumulators initialized.
  // Reset internal accumulators only if this stage is not partially submitted
  // Otherwise, we may override existing accumulator values from some tasks
  if (stage.internalAccumulators.isEmpty || stage.numPartitions == partitionsToCompute.size) {
    stage.resetInternalAccumulators()
  }

  // Use the scheduling pool, job group, description, etc. from an ActiveJob associated
  // with this Stage
  val properties = jobIdToActiveJob(jobId).properties

  runningStages += stage
  // SparkListenerStageSubmitted should be posted before testing whether tasks are
  // serializable. If tasks are not serializable, a SparkListenerStageCompleted event
  // will be posted, which should always come after a corresponding SparkListenerStageSubmitted
  // event.
  stage match {
    case s: ShuffleMapStage =>
      outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
    case s: ResultStage =>
      outputCommitCoordinator.stageStart(
        stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
  }
  val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
    stage match {
      case s: ShuffleMapStage =>
        partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
      case s: ResultStage =>
        val job = s.activeJob.get
        partitionsToCompute.map { id =>
          val p = s.partitions(id)
          (id, getPreferredLocs(stage.rdd, p))
        }.toMap
    }
  } catch {
    case NonFatal(e) =>
      stage.makeNewStageAttempt(partitionsToCompute.size)
      listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
      abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
      runningStages -= stage
      return
  }

  stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
  listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

  // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
  // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
  // the serialized copy of the RDD and for each task we will deserialize it, which means each
  // task gets a different copy of the RDD. This provides stronger isolation between tasks that
  // might modify state of objects referenced in their closures. This is necessary in Hadoop
  // where the JobConf/Configuration object is not thread-safe.
  var taskBinary: Broadcast[Array[Byte]] = null
  try {
    // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
    // For ResultTask, serialize and broadcast (rdd, func).
    val taskBinaryBytes: Array[Byte] = stage match {
      case stage: ShuffleMapStage =>
        closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array()
      case stage: ResultStage =>
        closureSerializer.serialize((stage.rdd, stage.func): AnyRef).array()
    }

    taskBinary = sc.broadcast(taskBinaryBytes)
  } catch {
    // In the case of a failure during serialization, abort the stage.
    case e: NotSerializableException =>
      abortStage(stage, "Task not serializable: " + e.toString, Some(e))
      runningStages -= stage

      // Abort execution
      return
    case NonFatal(e) =>
      abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}", Some(e))
      runningStages -= stage
      return
  }

  val tasks: Seq[Task[_]] = try {
    stage match {
      case stage: ShuffleMapStage =>
        partitionsToCompute.map { id =>
          val locs = taskIdToLocations(id)
          val part = stage.rdd.partitions(id)
          new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
            taskBinary, part, locs, stage.internalAccumulators)
        }

      case stage: ResultStage =>
        val job = stage.activeJob.get
        partitionsToCompute.map { id =>
          val p: Int = stage.partitions(id)
          val part = stage.rdd.partitions(p)
          val locs = taskIdToLocations(id)
          new ResultTask(stage.id, stage.latestInfo.attemptId,
            taskBinary, part, locs, id, stage.internalAccumulators)
        }
    }
  } catch {
    case NonFatal(e) =>
      abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
      runningStages -= stage
      return
  }

  if (tasks.size > 0) {
    logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
    stage.pendingPartitions ++= tasks.map(_.partitionId)
    logDebug("New pending partitions: " + stage.pendingPartitions)
    taskScheduler.submitTasks(new TaskSet(
      tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
    stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
  } else {
    // Because we posted SparkListenerStageSubmitted earlier, we should mark
    // the stage as completed here in case there are no tasks to run
    markStageAsFinished(stage, None)

    val debugString = stage match {
      case stage: ShuffleMapStage =>
        s"Stage ${stage} is actually done; " +
          s"(available: ${stage.isAvailable}," +
          s"available outputs: ${stage.numAvailableOutputs}," +
          s"partitions: ${stage.numPartitions})"
      case stage: ResultStage =>
        s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})"
    }
    logDebug(debugString)
  }
}

/** Merge updates from a task to our local accumulator values */
private def updateAccumulators(event: CompletionEvent): Unit = {
  val task = event.task
  val stage = stageIdToStage(task.stageId)
  if (event.accumUpdates != null) {
    try {
      Accumulators.add(event.accumUpdates)

      event.accumUpdates.foreach { case (id, partialValue) =>
        // In this instance, although the reference in Accumulators.originals is a WeakRef,
        // it's guaranteed to exist since the event.accumUpdates Map exists
        val acc = Accumulators.originals(id).get match {
          case Some(accum) => accum.asInstanceOf[Accumulable[Any, Any]]
          case None => throw new NullPointerException("Non-existent reference to Accumulator")
        }

        // To avoid UI cruft, ignore cases where value wasn't updated
        if (acc.name.isDefined && partialValue != acc.zero) {
          val name = acc.name.get
          val value = s"${acc.value}"
          stage.latestInfo.accumulables(id) =
            new AccumulableInfo(id, name, None, value, acc.isInternal)
          event.taskInfo.accumulables +=
            new AccumulableInfo(id, name, Some(s"$partialValue"), value, acc.isInternal)
        }
      }
    } catch {
      // If we see an exception during accumulator update, just log the
      // error and move on.
      case e: Exception =>
        logError(s"Failed to update accumulators for $task", e)
    }
  }
}
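Before following the TaskSet into the TaskScheduler, note how the broadcast taskBinary is consumed later on the executor side. Paraphrased from Spark 1.6's ResultTask.runTask (timing and metrics code omitted):

// Paraphrased from org.apache.spark.scheduler.ResultTask (Spark 1.6):
// each task deserializes its own copy of (rdd, func) from the broadcast bytes,
// then runs func over the iterator of its assigned partition.
override def runTask(context: TaskContext): U = {
  val ser = SparkEnv.get.closureSerializer.newInstance()
  val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
    ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
  // For reduce, func wraps reducePartition; its Option[T] result is what eventually
  // reaches JobWaiter.taskSucceeded and then mergeResult on the driver.
  func(context, rdd.iterator(partition, context))
}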


TaskSchedulerImpl's submitTasks method:

override def submitTasks(taskSet: TaskSet) {
  val tasks = taskSet.tasks
  logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
  this.synchronized {
    val manager = createTaskSetManager(taskSet, maxTaskFailures)
    val stage = taskSet.stageId
    val stageTaskSets =
      taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
    stageTaskSets(taskSet.stageAttemptId) = manager
    val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
      ts.taskSet != taskSet && !ts.isZombie
    }
    if (conflictingTaskSet) {
      throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
        s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
    }
    schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)

    if (!isLocal && !hasReceivedTask) {
      starvationTimer.scheduleAtFixedRate(new TimerTask() {
        override def run() {
          if (!hasLaunchedTask) {
            logWarning("Initial job has not accepted any resources; " +
              "check your cluster UI to ensure that workers are registered " +
              "and have sufficient resources")
          } else {
            this.cancel()
          }
        }
      }, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
    }
    hasReceivedTask = true
  }
  backend.reviveOffers()
}
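The schedulableBuilder used above is either a FIFOSchedulableBuilder (the default) or a FairSchedulableBuilder, selected by the spark.scheduler.mode configuration. A minimal example with an illustrative application name:

import org.apache.spark.SparkConf

// Choose how TaskSetManagers are ordered by the scheduler:
// "FIFO" (default) or "FAIR" (pool-based fair scheduling).
val conf = new SparkConf()
  .setAppName("scheduler-mode-example")   // illustrative name
  .set("spark.scheduler.mode", "FAIR")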

It then calls backend.reviveOffers, which in the end invokes CoarseGrainedSchedulerBackend's makeOffers method:


// Make fake resource offers on all executors
private def makeOffers() {
  // Filter out executors under killing
  val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
  val workOffers = activeExecutors.map { case (id, executorData) =>
    new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
  }.toSeq
  launchTasks(scheduler.resourceOffers(workOffers))
}
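A WorkerOffer is just a small value describing an executor's spare capacity; scheduler.resourceOffers matches pending tasks against these offers (honoring locality preferences) and returns the TaskDescriptions to launch. In Spark 1.6 it is essentially:

// One offer per live executor: its id, host, and currently free cores.
private[spark] case class WorkerOffer(executorId: String, host: String, cores: Int)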


org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend


private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
  for (task <- tasks.flatten) {
    val serializedTask = ser.serialize(task)
    if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
      scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr =>
        try {
          var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
            "spark.akka.frameSize (%d bytes) - reserved (%d bytes). Consider increasing " +
            "spark.akka.frameSize or using broadcast variables for large values."
          msg = msg.format(task.taskId, task.index, serializedTask.limit, akkaFrameSize,
            AkkaUtils.reservedSizeBytes)
          taskSetMgr.abort(msg)
        } catch {
          case e: Exception => logError("Exception in error callback", e)
        }
      }
    }
    else {
      val executorData = executorDataMap(task.executorId)
      executorData.freeCores -= scheduler.CPUS_PER_TASK
      executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
    }
  }
}
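The size check above is the source of the familiar "exceeds max allowed: spark.akka.frameSize" failure when a task closure is too large. Two common remedies, sketched with illustrative values: raise the frame size, or broadcast the large read-only data so the serialized task stays small.

import org.apache.spark.{SparkConf, SparkContext}

// Remedy 1 (illustrative value): raise the Akka frame size, in MB, used for
// task dispatch in Spark 1.6.
val conf = new SparkConf().setAppName("frame-size-example").set("spark.akka.frameSize", "128")
val sc = new SparkContext(conf)

// Remedy 2: broadcast large read-only data instead of capturing it in the task
// closure; the closure then only carries the lightweight broadcast handle.
val bigLookupTable: Map[Int, String] = (1 to 1000000).map(i => i -> s"v$i").toMap
val lookup = sc.broadcast(bigLookupTable)
val tagged = sc.parallelize(1 to 10).map(i => (i, lookup.value.getOrElse(i, "")))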

Executor.scala

def launchTask(
    context: ExecutorBackend,
    taskId: Long,
    attemptNumber: Int,
    taskName: String,
    serializedTask: ByteBuffer): Unit = {
  val tr = new TaskRunner(context, taskId = taskId, attemptNumber = attemptNumber, taskName,
    serializedTask)
  runningTasks.put(taskId, tr)
  threadPool.execute(tr)
}
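On the executor, launchTask only queues a TaskRunner; the actual work happens in TaskRunner.run. Heavily abridged from Spark 1.6 (error handling, metrics, accumulator collection, and the wrapping/indirection of large results are all omitted):

// Heavily abridged sketch of Executor.TaskRunner.run (Spark 1.6).
override def run(): Unit = {
  execBackend.statusUpdate(taskId, TaskState.RUNNING, EMPTY_BYTE_BUFFER)
  val ser = env.closureSerializer.newInstance()

  // 1. Fetch any files/jars the task needs, then rebuild the Task object.
  val (taskFiles, taskJars, taskBytes) = Task.deserializeWithDependencies(serializedTask)
  updateDependencies(taskFiles, taskJars)
  task = ser.deserialize[Task[Any]](taskBytes, Thread.currentThread.getContextClassLoader)

  // 2. Run it: for a ResultTask this ends up in func(context, rdd.iterator(...)),
  //    i.e. reducePartition for our reduce example.
  val value = task.run(taskAttemptId = taskId, attemptNumber = attemptNumber,
    metricsSystem = env.metricsSystem)

  // 3. Report completion; on the driver the result eventually reaches
  //    DAGScheduler.handleTaskCompletion and then JobWaiter.taskSucceeded,
  //    where mergeResult folds it into the final reduce value.
  val serializedResult = ser.serialize(value)
  execBackend.statusUpdate(taskId, TaskState.FINISHED, serializedResult)
}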

