Spark源码学习笔记7-HeartbeatReceiver

来源：互联网发布：罗素名言知乎编辑：程序博客网时间：2024/06/05 03:57

我们继续了解SparkContext中的_heartbeatReceiver: RpcEndpointRef，其初始化代码如下：

// We need to register "HeartbeatReceiver" before "createTaskScheduler" because Executor will
// retrieve "HeartbeatReceiver" in the constructor. (SPARK-6640)
_heartbeatReceiver = env.rpcEnv.setupEndpoint(
  HeartbeatReceiver.ENDPOINT_NAME, new HeartbeatReceiver(this))

在上两节中已经了解了RpcEnv了，_heartbeatReceiver是端点引用类型RpcEndpointRef，类HeartbeatReceiver是RpcEndpoint类型。

HeartbeatReceiver

类HeartbeatReceiver具有伴生对象，伴生对象仅有成员val ENDPOINT_NAME = "HeartbeatReceiver"。

HeartbeatReceiver所在文件为HeartbeatReceiver.scala，该文件开始部分包含一些HeartbeatReceiver接收/返回的消息类的定义： Heartbeat表示executors发给driver的心跳信息； TaskSchedulerIsSet用于SparkContext通知HeartbeatReceiver：taskScheduler已经创建好； ExpireDeadHosts用于通知HeartbeatReceiver将已经没有心跳的Host销毁掉； ExecutorRegistered, ExecutorRemoved表示Executor的注册和移除； HeartbeatResponse是HeartbeatReceiver处理Heartbeat消息后返回的消息。
这些类的代码如下：

package org.apache.spark

// ...... (lines elided in the original excerpt)

/**
 * A heartbeat from executors to the driver. This is a shared message used by several internal
 * components to convey liveness or execution information for in-progress tasks. It will also
 * expire the hosts that have not heartbeated for more than spark.network.timeout.
 * spark.executor.heartbeatInterval should be significantly less than spark.network.timeout.
 */
private[spark] case class Heartbeat(
    executorId: String,
    accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])], // taskId -> accumulator updates
    blockManagerId: BlockManagerId)

/**
 * An event that SparkContext uses to notify HeartbeatReceiver that SparkContext.taskScheduler is
 * created.
 */
private[spark] case object TaskSchedulerIsSet

/** Message asking HeartbeatReceiver to expire hosts that have no recent heartbeat. */
private[spark] case object ExpireDeadHosts

/** Local message signalling that the executor with the given ID has been registered. */
private case class ExecutorRegistered(executorId: String)

/** Local message signalling that the executor with the given ID has been removed. */
private case class ExecutorRemoved(executorId: String)

/**
 * Reply sent back for a [[Heartbeat]].
 *
 * @param reregisterBlockManager flag carried back to the sender of the heartbeat
 */
private[spark] case class HeartbeatResponse(reregisterBlockManager: Boolean)

先贴下HeartbeatReceiver的源码吧

/**
 * Lives in the driver to receive heartbeats from executors.
 *
 * The receiver registers itself as a SparkListener on the given SparkContext so that it is told
 * about executor additions and removals, and it periodically sends itself an ExpireDeadHosts
 * message to drop executors whose last heartbeat is older than the configured timeout.
 *
 * @param sc    the SparkContext this receiver belongs to
 * @param clock time source used for all "last seen" timestamps
 */
private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock)
  extends SparkListener with ThreadSafeRpcEndpoint with Logging {

  /** Creates a receiver that uses the system clock. */
  def this(sc: SparkContext) = {
    this(sc, new SystemClock)
  }

  sc.addSparkListener(this)

  override val rpcEnv: RpcEnv = sc.env.rpcEnv

  // Stays null until a TaskSchedulerIsSet message has been processed.
  private[spark] var scheduler: TaskScheduler = null

  // executor ID -> timestamp of when the last heartbeat from this executor was received
  private val executorLastSeen = new mutable.HashMap[String, Long]

  // "spark.network.timeout" uses "seconds", while `spark.storage.blockManagerSlaveTimeoutMs` uses
  // "milliseconds"
  private val slaveTimeoutMs =
    sc.conf.getTimeAsMs("spark.storage.blockManagerSlaveTimeoutMs", "120s")
  private val executorTimeoutMs =
    sc.conf.getTimeAsSeconds("spark.network.timeout", s"${slaveTimeoutMs}ms") * 1000

  // "spark.network.timeoutInterval" uses "seconds", while
  // "spark.storage.blockManagerTimeoutIntervalMs" uses "milliseconds"
  private val timeoutIntervalMs =
    sc.conf.getTimeAsMs("spark.storage.blockManagerTimeoutIntervalMs", "60s")
  private val checkTimeoutIntervalMs =
    sc.conf.getTimeAsSeconds("spark.network.timeoutInterval", s"${timeoutIntervalMs}ms") * 1000

  // Handle of the periodic ExpireDeadHosts task; set in onStart and cancelled in onStop.
  private var timeoutCheckingTask: ScheduledFuture[_] = null

  // "eventLoopThread" is used to run some pretty fast actions. The actions running in it should not
  // block the thread for a long time.
  private val eventLoopThread =
    ThreadUtils.newDaemonSingleThreadScheduledExecutor("heartbeat-receiver-event-loop-thread")

  private val killExecutorThread = ThreadUtils.newDaemonSingleThreadExecutor("kill-executor-thread")

  /**
   * Schedules a task on the event loop thread that sends ExpireDeadHosts to this endpoint every
   * `checkTimeoutIntervalMs` milliseconds, starting immediately.
   */
  override def onStart(): Unit = {
    timeoutCheckingTask = eventLoopThread.scheduleAtFixedRate(new Runnable {
      override def run(): Unit = Utils.tryLogNonFatalError {
        Option(self).foreach(_.ask[Boolean](ExpireDeadHosts))
      }
    }, 0, checkTimeoutIntervalMs, TimeUnit.MILLISECONDS)
  }

  /**
   * Handles the local bookkeeping messages and the heartbeats sent by executors; every message
   * is answered through `context.reply`.
   */
  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {

    // Messages sent and received locally
    case ExecutorRegistered(executorId) =>
      executorLastSeen(executorId) = clock.getTimeMillis()
      context.reply(true)
    case ExecutorRemoved(executorId) =>
      executorLastSeen.remove(executorId)
      context.reply(true)
    case TaskSchedulerIsSet =>
      scheduler = sc.taskScheduler
      context.reply(true)
    case ExpireDeadHosts =>
      expireDeadHosts()
      context.reply(true)

    // Messages received from executors
    case heartbeat @ Heartbeat(executorId, accumUpdates, blockManagerId) =>
      if (scheduler != null) {
        if (executorLastSeen.contains(executorId)) {
          executorLastSeen(executorId) = clock.getTimeMillis()
          // The scheduler call and the reply are handed to the event loop thread.
          eventLoopThread.submit(new Runnable {
            override def run(): Unit = Utils.tryLogNonFatalError {
              val unknownExecutor = !scheduler.executorHeartbeatReceived(
                executorId, accumUpdates, blockManagerId)
              val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor)
              context.reply(response)
            }
          })
        } else {
          // This may happen if we get an executor's in-flight heartbeat immediately
          // after we just removed it. It's not really an error condition so we should
          // not log warning here. Otherwise there may be a lot of noise especially if
          // we explicitly remove executors (SPARK-4134).
          logDebug(s"Received heartbeat from unknown executor $executorId")
          context.reply(HeartbeatResponse(reregisterBlockManager = true))
        }
      } else {
        // Because Executor will sleep several seconds before sending the first "Heartbeat", this
        // case rarely happens. However, if it really happens, log it and ask the executor to
        // register itself again.
        logWarning(s"Dropping $heartbeat because TaskScheduler is not ready yet")
        context.reply(HeartbeatResponse(reregisterBlockManager = true))
      }
  }

  /**
   * Send ExecutorRegistered to the event loop to add a new executor. Only for test.
   *
   * @return if HeartbeatReceiver is stopped, return None. Otherwise, return a Some(Future) that
   *         indicate if this operation is successful.
   */
  def addExecutor(executorId: String): Option[Future[Boolean]] = {
    Option(self).map(_.ask[Boolean](ExecutorRegistered(executorId)))
  }

  /**
   * If the heartbeat receiver is not stopped, notify it of executor registrations.
   */
  override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = {
    addExecutor(executorAdded.executorId)
  }

  /**
   * Send ExecutorRemoved to the event loop to remove an executor. Only for test.
   *
   * @return if HeartbeatReceiver is stopped, return None. Otherwise, return a Some(Future) that
   *         indicate if this operation is successful.
   */
  def removeExecutor(executorId: String): Option[Future[Boolean]] = {
    Option(self).map(_.ask[Boolean](ExecutorRemoved(executorId)))
  }

  /**
   * If the heartbeat receiver is not stopped, notify it of executor removals so it doesn't
   * log superfluous errors.
   *
   * Note that we must do this after the executor is actually removed to guard against the
   * following race condition: if we remove an executor's metadata from our data structure
   * prematurely, we may get an in-flight heartbeat from the executor before the executor is
   * actually removed, in which case we will still mark the executor as a dead host later
   * and expire it with loud error messages.
   */
  override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = {
    removeExecutor(executorRemoved.executorId)
  }

  /**
   * Reports every executor whose last heartbeat is older than `executorTimeoutMs` to the
   * scheduler as lost, submits an asynchronous kill-and-replace request for it, and forgets it.
   */
  private def expireDeadHosts(): Unit = {
    logTrace("Checking for hosts with no recent heartbeats in HeartbeatReceiver.")
    val now = clock.getTimeMillis()
    // NOTE(review): entries are removed from executorLastSeen while it is being iterated;
    // confirm this is safe for the Scala collections version in use.
    for ((executorId, lastSeenMs) <- executorLastSeen) {
      if (now - lastSeenMs > executorTimeoutMs) {
        logWarning(s"Removing executor $executorId with no recent heartbeats: " +
          s"${now - lastSeenMs} ms exceeds timeout $executorTimeoutMs ms")
        scheduler.executorLost(executorId, SlaveLost("Executor heartbeat " +
          s"timed out after ${now - lastSeenMs} ms"))
        // Asynchronously kill the executor to avoid blocking the current thread
        killExecutorThread.submit(new Runnable {
          override def run(): Unit = Utils.tryLogNonFatalError {
            // Note: we want to get an executor back after expiring this one,
            // so do not simply call `sc.killExecutor` here (SPARK-8119)
            sc.killAndReplaceExecutor(executorId)
          }
        })
        executorLastSeen.remove(executorId)
      }
    }
  }

  /** Cancels the periodic timeout check, if it was scheduled, and shuts down both executors. */
  override def onStop(): Unit = {
    if (timeoutCheckingTask != null) {
      timeoutCheckingTask.cancel(true)
    }
    eventLoopThread.shutdownNow()
    killExecutorThread.shutdownNow()
  }
}

private[spark] object HeartbeatReceiver {
  /** Name under which the endpoint is registered with the RpcEnv. */
  val ENDPOINT_NAME = "HeartbeatReceiver"
}

类HeartbeatReceiver继承SparkListener和ThreadSafeRpcEndpoint。HeartbeatReceiver的构造函数中，把它自己也加入到SparkContext的ListenerBus中: sc.addSparkListener(this)，在之前学习中知道JobProgressListener已先加入其中。

HeartbeatReceiver内部通过executorLastSeen: HashMap[String, Long]记录着注册的Executor的最近的心跳时间。

类HeartbeatReceiver中包含线程eventLoopThread，该线程用于运行一些执行很快的actions，在其中运行的actions不应长时间阻塞该线程：

// "eventLoopThread" is used to run some pretty fast actions. The actions running in it should not
// block the thread for a long time.
private val eventLoopThread =
  ThreadUtils.newDaemonSingleThreadScheduledExecutor("heartbeat-receiver-event-loop-thread")

函数newDaemonSingleThreadScheduledExecutor实现为：

/**
 * Wrapper over ScheduledThreadPoolExecutor.
 *
 * @param threadName name format handed to the thread factory for the pool's single thread
 * @return a scheduled executor backed by one daemon thread, with remove-on-cancel enabled
 */
def newDaemonSingleThreadScheduledExecutor(threadName: String): ScheduledExecutorService = {
  val threadFactory = new ThreadFactoryBuilder().setDaemon(true).setNameFormat(threadName).build()
  val executor = new ScheduledThreadPoolExecutor(1, threadFactory)
  // By default, a cancelled task is not automatically removed from the work queue until its delay
  // elapses. We have to enable it manually.
  executor.setRemoveOnCancelPolicy(true)
  executor
}

eventLoopThread类型为ScheduledExecutorService，从上述函数中可以看到内部实现是new出对象ScheduledThreadPoolExecutor，且线程个数为1，即单线程的线程池（Scheduled线程池，task延时处理）。类的关系为：class ScheduledThreadPoolExecutor extends ThreadPoolExecutor implements ScheduledExecutorService。我们分别看看这几个类。

ThreadPoolExecutor
ThreadPoolExecutor是java中的线程池，该类的构造函数中提供多个参数，可根据需要通过参数设置来配置具有不同特点的线程池。关于Java线程池种类及各自特点可以另行深入研究。
其部分注释及部分代码如下：

package java.util.concurrent;....../** * An {@link ExecutorService} that executes each submitted task using * one of possibly several pooled threads, normally configured * using {@link Executors} factory methods. * ......public class ThreadPoolExecutor extends AbstractExecutorService {......

killExecutorThread是类HeartbeatReceiver的单线程线程池：

  // Executor created with the thread name "kill-executor-thread"; presumably a single daemon
  // thread, judging by the factory name newDaemonSingleThreadExecutor — TODO confirm in ThreadUtils.
  private val killExecutorThread = ThreadUtils.newDaemonSingleThreadExecutor("kill-executor-thread")

类HeartbeatReceiver的onStart函数通过eventLoopThread的scheduleAtFixedRate创建了一个定期执行的task：每隔checkTimeoutIntervalMs毫秒向自身发送一次ExpireDeadHosts消息。

类HeartbeatReceiver主要消息处理函数receiveAndReply为偏函数，主要处理几种特殊消息：

/**
 * Handles the local bookkeeping messages and the heartbeats sent by executors; every message
 * is answered through `context.reply`.
 */
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {

  // Messages sent and received locally
  case ExecutorRegistered(executorId) =>
    executorLastSeen(executorId) = clock.getTimeMillis()
    context.reply(true)
  case ExecutorRemoved(executorId) =>
    executorLastSeen.remove(executorId)
    context.reply(true)
  case TaskSchedulerIsSet =>
    scheduler = sc.taskScheduler
    context.reply(true)
  case ExpireDeadHosts =>
    expireDeadHosts()
    context.reply(true)

  // Messages received from executors
  case heartbeat @ Heartbeat(executorId, accumUpdates, blockManagerId) =>
    if (scheduler != null) {
      if (executorLastSeen.contains(executorId)) {
        executorLastSeen(executorId) = clock.getTimeMillis()
        // The scheduler call and the reply are handed to the event loop thread.
        eventLoopThread.submit(new Runnable {
          override def run(): Unit = Utils.tryLogNonFatalError {
            val unknownExecutor = !scheduler.executorHeartbeatReceived(
              executorId, accumUpdates, blockManagerId)
            val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor)
            context.reply(response)
          }
        })
      } else {
        // This may happen if we get an executor's in-flight heartbeat immediately
        // after we just removed it. It's not really an error condition so we should
        // not log warning here. Otherwise there may be a lot of noise especially if
        // we explicitly remove executors (SPARK-4134).
        logDebug(s"Received heartbeat from unknown executor $executorId")
        context.reply(HeartbeatResponse(reregisterBlockManager = true))
      }
    } else {
      // Because Executor will sleep several seconds before sending the first "Heartbeat", this
      // case rarely happens. However, if it really happens, log it and ask the executor to
      // register itself again.
      logWarning(s"Dropping $heartbeat because TaskScheduler is not ready yet")
      context.reply(HeartbeatResponse(reregisterBlockManager = true))
    }
}

0 0