Spark 2.1 DAGScheduler.handleTaskCompletion
handleTaskCompletion

handleTaskCompletion runs inside the DAGScheduler event loop, so it may freely mutate the scheduler's internal state. On every completion it notifies the OutputCommitCoordinator, reconstructs TaskMetrics from the task's accumulator updates, posts a SparkListenerTaskEnd event, and then branches on event.reason: Success (handled separately for ResultTask and ShuffleMapTask), Resubmitted, FetchFailed, and the remaining failure reasons. The Spark 2.1 source:
/**
 * Responds to a task finishing. This is called inside the event loop so it assumes that it can
 * modify the scheduler's internal state. Use taskEnded() to post a task end event from outside.
 */
private[scheduler] def handleTaskCompletion(event: CompletionEvent) {
  val task = event.task
  val taskId = event.taskInfo.id
  val stageId = task.stageId
  val taskType = Utils.getFormattedClassName(task)

  outputCommitCoordinator.taskCompleted(
    stageId,
    task.partitionId,
    event.taskInfo.attemptNumber, // this is a task attempt number
    event.reason)

  // Reconstruct task metrics. Note: this may be null if the task has failed.
  val taskMetrics: TaskMetrics =
    if (event.accumUpdates.nonEmpty) {
      try {
        TaskMetrics.fromAccumulators(event.accumUpdates)
      } catch {
        case NonFatal(e) =>
          logError(s"Error when attempting to reconstruct metrics for task $taskId", e)
          null
      }
    } else {
      null
    }

  // The stage may have already finished when we get this event -- e.g. maybe it was a
  // speculative task. It is important that we send the TaskEnd event in any case, so listeners
  // are properly notified and can choose to handle it. For instance, some listeners are
  // doing their own accounting and if they don't get the task end event they think
  // tasks are still running when they really aren't.
  listenerBus.post(SparkListenerTaskEnd(
    stageId, task.stageAttemptId, taskType, event.reason, event.taskInfo, taskMetrics))

  if (!stageIdToStage.contains(task.stageId)) {
    // Skip all the actions if the stage has been cancelled.
    return
  }

  val stage = stageIdToStage(task.stageId)
  event.reason match {
    case Success =>
      stage.pendingPartitions -= task.partitionId
      task match {
        case rt: ResultTask[_, _] =>
          // Cast to ResultStage here because it's part of the ResultTask
          // TODO Refactor this out to a function that accepts a ResultStage
          val resultStage = stage.asInstanceOf[ResultStage]
          resultStage.activeJob match {
            case Some(job) =>
              if (!job.finished(rt.outputId)) {
                updateAccumulators(event)
                job.finished(rt.outputId) = true
                job.numFinished += 1
                // If the whole job has finished, remove it
                if (job.numFinished == job.numPartitions) {
                  markStageAsFinished(resultStage)
                  cleanupStateForJobAndIndependentStages(job)
                  listenerBus.post(
                    SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), JobSucceeded))
                }

                // taskSucceeded runs some user code that might throw an exception. Make sure
                // we are resilient against that.
                try {
                  job.listener.taskSucceeded(rt.outputId, event.result)
                } catch {
                  case e: Exception =>
                    // TODO: Perhaps we want to mark the resultStage as failed?
                    job.listener.jobFailed(new SparkDriverExecutionException(e))
                }
              }
            case None =>
              logInfo("Ignoring result from " + rt + " because its job has finished")
          }

        case smt: ShuffleMapTask =>
          val shuffleStage = stage.asInstanceOf[ShuffleMapStage]
          updateAccumulators(event)
          val status = event.result.asInstanceOf[MapStatus]
          val execId = status.location.executorId
          logDebug("ShuffleMapTask finished on " + execId)
          if (failedEpoch.contains(execId) && smt.epoch <= failedEpoch(execId)) {
            logInfo(s"Ignoring possibly bogus $smt completion from executor $execId")
          } else {
            shuffleStage.addOutputLoc(smt.partitionId, status)
          }

          if (runningStages.contains(shuffleStage) && shuffleStage.pendingPartitions.isEmpty) {
            markStageAsFinished(shuffleStage)
            logInfo("looking for newly runnable stages")
            logInfo("running: " + runningStages)
            logInfo("waiting: " + waitingStages)
            logInfo("failed: " + failedStages)

            // We supply true to increment the epoch number here in case this is a
            // recomputation of the map outputs. In that case, some nodes may have cached
            // locations with holes (from when we detected the error) and will need the
            // epoch incremented to refetch them.
            // TODO: Only increment the epoch number if this is not the first time
            // we registered these map outputs.
            mapOutputTracker.registerMapOutputs(
              shuffleStage.shuffleDep.shuffleId,
              shuffleStage.outputLocInMapOutputTrackerFormat(),
              changeEpoch = true)

            clearCacheLocs()

            if (!shuffleStage.isAvailable) {
              // Some tasks had failed; let's resubmit this shuffleStage
              // TODO: Lower-level scheduler should also deal with this
              logInfo("Resubmitting " + shuffleStage + " (" + shuffleStage.name +
                ") because some of its tasks had failed: " +
                shuffleStage.findMissingPartitions().mkString(", "))
              submitStage(shuffleStage)
            } else {
              // Mark any map-stage jobs waiting on this stage as finished
              if (shuffleStage.mapStageJobs.nonEmpty) {
                val stats = mapOutputTracker.getStatistics(shuffleStage.shuffleDep)
                for (job <- shuffleStage.mapStageJobs) {
                  markMapStageJobAsFinished(job, stats)
                }
              }
              submitWaitingChildStages(shuffleStage)
            }
          }
      }

    case Resubmitted =>
      logInfo("Resubmitted " + task + ", so marking it as still running")
      stage.pendingPartitions += task.partitionId

    case FetchFailed(bmAddress, shuffleId, mapId, reduceId, failureMessage) =>
      val failedStage = stageIdToStage(task.stageId)
      val mapStage = shuffleIdToMapStage(shuffleId)

      if (failedStage.latestInfo.attemptId != task.stageAttemptId) {
        logInfo(s"Ignoring fetch failure from $task as it's from $failedStage attempt" +
          s" ${task.stageAttemptId} and there is a more recent attempt for that stage " +
          s"(attempt ID ${failedStage.latestInfo.attemptId}) running")
      } else {
        // It is likely that we receive multiple FetchFailed for a single stage (because we have
        // multiple tasks running concurrently on different executors). In that case, it is
        // possible the fetch failure has already been handled by the scheduler.
        if (runningStages.contains(failedStage)) {
          logInfo(s"Marking $failedStage (${failedStage.name}) as failed " +
            s"due to a fetch failure from $mapStage (${mapStage.name})")
          markStageAsFinished(failedStage, Some(failureMessage))
        } else {
          logDebug(s"Received fetch failure from $task, but it's from $failedStage which is no " +
            s"longer running")
        }

        if (disallowStageRetryForTest) {
          abortStage(failedStage, "Fetch failure will not retry stage due to testing config",
            None)
        } else if (failedStage.failedOnFetchAndShouldAbort(task.stageAttemptId)) {
          abortStage(failedStage, s"$failedStage (${failedStage.name}) " +
            s"has failed the maximum allowable number of " +
            s"times: ${Stage.MAX_CONSECUTIVE_FETCH_FAILURES}. " +
            s"Most recent failure reason: ${failureMessage}", None)
        } else {
          if (failedStages.isEmpty) {
            // Don't schedule an event to resubmit failed stages if failed isn't empty, because
            // in that case the event will already have been scheduled.
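            // Annotation (not in the Spark source): DAGScheduler.RESUBMIT_TIMEOUT is 200 ms
            // in Spark 2.1. Deferring the single ResubmitFailedStages event lets the fetch
            // failures that typically arrive in a burst be handled by one resubmission
            // rather than one resubmission per failed task.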
            // TODO: Cancel running tasks in the stage
            logInfo(s"Resubmitting $mapStage (${mapStage.name}) and " +
              s"$failedStage (${failedStage.name}) due to fetch failure")
            messageScheduler.schedule(new Runnable {
              override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages)
            }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS)
          }
          failedStages += failedStage
          failedStages += mapStage
        }
        // Mark the map whose fetch failed as broken in the map stage
        if (mapId != -1) {
          mapStage.removeOutputLoc(mapId, bmAddress)
          mapOutputTracker.unregisterMapOutput(shuffleId, mapId, bmAddress)
        }

        // TODO: mark the executor as failed only if there were lots of fetch failures on it
        if (bmAddress != null) {
          handleExecutorLost(bmAddress.executorId, filesLost = true, Some(task.epoch))
        }
      }

    case commitDenied: TaskCommitDenied =>
      // Do nothing here, left up to the TaskScheduler to decide how to handle denied commits

    case exceptionFailure: ExceptionFailure =>
      // Tasks failed with exceptions might still have accumulator updates.
      updateAccumulators(event)

    case TaskResultLost =>
      // Do nothing here; the TaskScheduler handles these failures and resubmits the task.

    case _: ExecutorLostFailure | TaskKilled | UnknownReason =>
      // Unrecognized failure - also do nothing. If the task fails repeatedly, the TaskScheduler
      // will abort the job.
  }
}
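Two details in the code above matter to anyone consuming these events: SparkListenerTaskEnd is posted unconditionally, even when the stage has already finished (e.g. a late speculative duplicate), and taskMetrics can be null when the accumulator updates could not be turned back into TaskMetrics. A minimal sketch of a listener that tolerates both follows; the class name and the counting logic are illustrative, while SparkListener, SparkListenerTaskEnd, SparkListenerJobEnd, and SparkContext.addSparkListener are actual Spark 2.1 API.

import java.util.concurrent.atomic.AtomicLong

import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd, SparkListenerTaskEnd}

// Illustrative listener doing its own task accounting -- the use case the
// "listeners are properly notified" comment in handleTaskCompletion refers to.
class TaskEndCountingListener extends SparkListener {
  private val taskEnds = new AtomicLong(0)

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    taskEnds.incrementAndGet()
    // taskMetrics may be null: handleTaskCompletion posts null when it cannot
    // reconstruct TaskMetrics from the accumulator updates of a failed task.
    Option(taskEnd.taskMetrics).foreach { metrics =>
      println(s"stage ${taskEnd.stageId} task ended: ${metrics.executorRunTime} ms, " +
        s"reason = ${taskEnd.reason}")
    }
  }

  override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
    // Posted by handleTaskCompletion once job.numFinished == job.numPartitions.
    println(s"job ${jobEnd.jobId} finished (${jobEnd.jobResult}) " +
      s"after ${taskEnds.get()} task-end events")
  }
}

Register it with sc.addSparkListener(new TaskEndCountingListener) before running a job. Because the event is posted even for tasks that finish after their stage, the per-job task-end count can legitimately exceed the number of partitions.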