spark core 2.0 Driver HeartbeatReceiver

来源：互联网发布：组织权限控制 java 编辑：程序博客网时间：2024/05/16 14:55

HeartbeatReceiver 在driver端接收从执行器发过来的心跳。

/** * Lives in the driver to receive heartbeats from executors.. */private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock)  extends SparkListener with ThreadSafeRpcEndpoint with Logging {

onStart方法启动一个定时任务，用于定期清理心跳超时的执行器。_.ask[Boolean](ExpireDeadHosts)用于向本对象发送ExpireDeadHosts消息。

  /**
   * Schedules the recurring dead-host check when this endpoint starts.
   *
   * A task is registered on `eventLoopThread` at a fixed rate of `checkTimeoutIntervalMs`
   * milliseconds with no initial delay. Each run asks this endpoint (through `self`) to
   * handle an `ExpireDeadHosts` message. The handle returned by the scheduler is kept in
   * `timeoutCheckingTask`.
   */
  override def onStart(): Unit = {
    val expireRequest = new Runnable {
      override def run(): Unit = Utils.tryLogNonFatalError {
        // Wrapping `self` in Option skips the ask when `self` is null.
        Option(self).foreach { endpoint => endpoint.ask[Boolean](ExpireDeadHosts) }
      }
    }
    timeoutCheckingTask = eventLoopThread.scheduleAtFixedRate(
      expireRequest, 0, checkTimeoutIntervalMs, TimeUnit.MILLISECONDS)
  }

接收到ExpireDeadHosts之后，调用expireDeadHosts()方法，然后调用context.reply(true)返回true.

override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {    // Messages sent and received locally    case ExecutorRegistered(executorId) =>      executorLastSeen(executorId) = clock.getTimeMillis()      context.reply(true)    case ExecutorRemoved(executorId) =>      executorLastSeen.remove(executorId)      context.reply(true)    case TaskSchedulerIsSet =>      scheduler = sc.taskScheduler      context.reply(true)    case ExpireDeadHosts =>      expireDeadHosts()      context.reply(true)

expireDeadHosts方法如下：对每个执行器进行超时判断，对于超时的执行器，先调用scheduler.executorLost方法，然后向killExecutorThread提交一个杀死该执行器的异步任务，最后将其从executorLastSeen中移除，如下：

/**
 * Walks `executorLastSeen` and expires every executor whose last recorded heartbeat is
 * more than `executorTimeoutMs` milliseconds older than the current clock time.
 *
 * For each expired executor, in this order: a warning is logged, the scheduler is notified
 * through `executorLost`, a kill-and-replace request is submitted to `killExecutorThread`,
 * and the executor's entry is removed from `executorLastSeen`.
 */
private def expireDeadHosts(): Unit = {
  logTrace("Checking for hosts with no recent heartbeats in HeartbeatReceiver.")
  val now = clock.getTimeMillis()
  executorLastSeen.foreach { case (executorId, lastSeenMs) =>
    val timedOut = now - lastSeenMs > executorTimeoutMs
    if (timedOut) {
      logWarning(s"Removing executor $executorId with no recent heartbeats: " +
        s"${now - lastSeenMs} ms exceeds timeout $executorTimeoutMs ms")
      scheduler.executorLost(executorId, SlaveLost("Executor heartbeat " +
        s"timed out after ${now - lastSeenMs} ms"))

      // The kill is handed off to another thread so the current thread is not blocked.
      val killTask = new Runnable {
        override def run(): Unit = Utils.tryLogNonFatalError {
          // A replacement executor is wanted after expiring this one, so
          // `sc.killExecutor` is deliberately not used here (SPARK-8119).
          sc.killAndReplaceExecutor(executorId)
        }
      }
      killExecutorThread.submit(killTask)

      executorLastSeen.remove(executorId)
    }
  }
}

killAndReplaceExecutor方法请求集群管理器杀掉指定的执行器，但不调整应用程序的资源需求。

结果是新的执行器将会被启动来代替请求杀死的这个执行器。假定集群管理器将自动并最终完成所有丢失的应用资源请求。

注意：这个替换并不保证成功，同一集群上的另一个应用程序可能趁机抢占到这些资源。如果调度后端是CoarseGrainedSchedulerBackend，则调用 b.killExecutors；否则只记录一条警告并返回false。

 /**
  * Asks the cluster manager to kill one executor while leaving the application's resource
  * requirements untouched.
  *
  * Because the requirements are not lowered, a fresh executor is expected to be launched in
  * place of the killed one, on the assumption that the cluster manager automatically and
  * eventually fulfills all outstanding resource requests of the application.
  *
  * Note: the replacement is not guaranteed. Another application on the same cluster may use
  * the window of opportunity to acquire the freed resources in the meantime.
  *
  * Only a `CoarseGrainedSchedulerBackend` supports this; for any other backend a warning is
  * logged and `false` is returned.
  *
  * @param executorId identifier of the executor to kill and replace
  * @return whether the request is received.
  */
 private[spark] def killAndReplaceExecutor(executorId: String): Boolean =
   schedulerBackend match {
     case coarseBackend: CoarseGrainedSchedulerBackend =>
       coarseBackend.killExecutors(Seq(executorId), replace = true, force = true)
     case _ =>
       logWarning("Killing executors is only supported in coarse-grained mode")
       false
   }

CoarseGrainedSchedulerBackend的killExecutors方法如下:

当请求替换执行器时，执行器丢失被视为一次失败，在该执行器上正在运行而被杀死的任务会计入失败次数限制。如果没有请求替换，则这些任务不会计入失败次数限制。

/**
 * Asks the cluster manager to kill the given executors.
 *
 * When a replacement is requested, the loss of the executor counts as a failure, and tasks
 * killed while running on it count towards the failure limits. Without a replacement, the
 * killed tasks do not count towards the limit.
 *
 * Executors that are not present in `executorDataMap` are skipped with a warning. Executors
 * already pending removal are never killed twice (SPARK-9795), and busy executors are only
 * killed when `force` is set (SPARK-9552).
 *
 * @param executorIds identifiers of executors to kill
 * @param replace whether to replace the killed executors with new ones
 * @param force whether to force kill busy executors
 * @return whether the kill request is acknowledged. If list to kill is empty, it will return
 *         false.
 */
final def killExecutors(
    executorIds: Seq[String],
    replace: Boolean,
    force: Boolean): Boolean = {
  logInfo(s"Requesting to kill executor(s) ${executorIds.mkString(", ")}")

  val response = synchronized {
    val (known, unknown) = executorIds.partition(executorDataMap.contains)
    for (id <- unknown) {
      logWarning(s"Executor to kill $id does not exist!")
    }

    // Drop executors already pending removal first, then drop busy ones unless forced.
    val executorsToKill = known
      .filterNot(executorsPendingToRemove.contains)
      .filter { id => force || !scheduler.isExecutorBusy(id) }
    for (id <- executorsToKill) {
      executorsPendingToRemove(id) = !replace
    }

    // With a replacement requested, the target stays as it is and only the pending count is
    // bumped. Otherwise the new target is synced with the cluster manager so it does not
    // allocate new executors; that target accounts for executors pending to be added or
    // removed.
    val adjustTotalExecutors =
      if (replace) {
        numPendingExecutors += known.size
        Future.successful(true)
      } else {
        doRequestTotalExecutors(
          numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)
      }

    // The outcome of the adjustment step is ignored; only its completion gates the kill.
    val killIfAny: Boolean => Future[Boolean] =
      if (executorsToKill.isEmpty) {
        _ => Future.successful(false)
      } else {
        _ => doKillExecutors(executorsToKill)
      }

    adjustTotalExecutors.flatMap(killIfAny)(ThreadUtils.sameThread)
  }

  // Wait for the combined future outside the synchronized section.
  defaultAskTimeout.awaitResult(response)
}

心跳信息发过来之后，如果该执行器在executorLastSeen列表中，则更新其最近心跳时间，调用调度器的executorHeartbeatReceived方法，然后构造结果返回；否则返回要求执行器重新注册块管理器的响应（reregisterBlockManager = true）。

// Messages received from executors    case heartbeat @ Heartbeat(executorId, accumUpdates, blockManagerId) =>      if (scheduler != null) {        if (executorLastSeen.contains(executorId)) {          executorLastSeen(executorId) = clock.getTimeMillis()          eventLoopThread.submit(new Runnable {            override def run(): Unit = Utils.tryLogNonFatalError {              val unknownExecutor = !scheduler.executorHeartbeatReceived(                executorId, accumUpdates, blockManagerId)              val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor)              context.reply(response)            }          })        } else {          // This may happen if we get an executor's in-flight heartbeat immediately          // after we just removed it. It's not really an error condition so we should          // not log warning here. Otherwise there may be a lot of noise especially if          // we explicitly remove executors (SPARK-4134).          logDebug(s"Received heartbeat from unknown executor $executorId")          context.reply(HeartbeatResponse(reregisterBlockManager = true))        }      } else {        // Because Executor will sleep several seconds before sending the first "Heartbeat", this        // case rarely happens. However, if it really happens, log it and ask the executor to        // register itself again.        logWarning(s"Dropping $heartbeat because TaskScheduler is not ready yet")        context.reply(HeartbeatResponse(reregisterBlockManager = true))      }

0 0