深入Spark内核：任务调度(3)-TaskScheduler

来源：互联网发布：千兆路由器知乎编辑：程序博客网时间：2024/05/21 14:47

TaskScheduler相较于DAGScheduler来说是一个低级的任务调度器(low-level task scheduler)，更准确地说，它更像是一个任务调度器接口。TaskScheduler的实现类只有一个，即TaskSchedulerImpl。TaskScheduler的核心工作是负责任务的发送和任务执行过程的监控。一个TaskScheduler只为一个SparkContext实例服务，它接收DAGScheduler发送过来的TaskSet，收到后负责把任务分发到集群中Worker的Executor中去运行。因为DAGScheduler给TaskScheduler发送任务时是以Stage为单位来提交的，所以TaskScheduler管理的基本单位就是与Stage对应的TaskSet；TaskSet里面会有若干个任务(Task)，而任务(Task)是以数据分片为单位划分的。如果某个任务(Task)运行失败，TaskScheduler就要负责重试；如果TaskScheduler发现某个任务(Task)一直未运行完或者运行过慢，则有可能会在另一台机器上再启动同一个任务(Task)，哪个任务(Task)先运行完就用哪个任务(Task)的结果。

/**
 * Low-level task scheduler interface, currently implemented exclusively by TaskSchedulerImpl.
 * This interface allows plugging in different task schedulers. Each TaskScheduler schedules tasks
 * for a single SparkContext. These schedulers get sets of tasks submitted to them from the
 * DAGScheduler for each stage, and are responsible for sending the tasks to the cluster, running
 * them, retrying if there are failures, and mitigating stragglers. They return events to the
 * DAGScheduler.
 */
private[spark] trait TaskScheduler {

  def rootPool: Pool

  def schedulingMode: SchedulingMode

  def start(): Unit

  // Invoked after system has successfully initialized (typically in spark context).
  // Yarn uses this to bootstrap allocation of resources based on preferred locations,
  // wait for slave registrations, etc.
  // The default implementation does nothing; implementations override it when they need the hook.
  def postStartHook(): Unit = {}

  // Disconnect from the cluster.
  def stop(): Unit

  // Submit a sequence of tasks to run.
  def submitTasks(taskSet: TaskSet): Unit

  // Cancel a stage.
  def cancelTasks(stageId: Int, interruptThread: Boolean): Unit

  // Set the DAG scheduler for upcalls. This is guaranteed to be set before submitTasks is called.
  def setDAGScheduler(dagScheduler: DAGScheduler): Unit

  // Get the default level of parallelism to use in the cluster, as a hint for sizing jobs.
  def defaultParallelism(): Int

  /**
   * Update metrics for in-progress tasks and let the master know that the BlockManager is still
   * alive. Return true if the driver knows about the given block manager. Otherwise, return false,
   * indicating that the block manager should re-register.
   */
  def executorHeartbeatReceived(execId: String, taskMetrics: Array[(Long, TaskMetrics)],
    blockManagerId: BlockManagerId): Boolean
}

在SparkContext中TaskScheduler实例对象创建的源码如下：

  // Create and start the scheduler
  private[spark] var taskScheduler = SparkContext.createTaskScheduler(this, master)

接下来让我们进入createTaskScheduler方法，通过模式匹配可以看出创建任务调度器时存在多种运行模式，不同的运行模式都有与之相对应的SchedulerBackend：

Local模式：TaskSchedulerImpl + LocalBackend

　　Standalone模式：TaskSchedulerImpl + SparkDeploySchedulerBackend

　　Yarn-Cluster模式：YarnClusterScheduler + YarnClusterSchedulerBackend(以CoarseGrainedSchedulerBackend类型使用)

　　Yarn-Client模式：YarnClientClusterScheduler + YarnClientSchedulerBackend

Mesos模式：TaskSchedulerImpl + CoarseMesosSchedulerBackend(spark.mesos.coarse为true时)或MesosSchedulerBackend(默认)

 /**
   * Creates a task scheduler based on a given master URL. Extracted for testing.
   *
   * The master string is matched against a fixed set of literals and regular expressions; each
   * branch builds a scheduler and a backend, calls `scheduler.initialize(backend)` and returns
   * the scheduler.
   *
   * @param sc the SparkContext the scheduler is created for
   * @param master the master URL to dispatch on
   * @return the initialized scheduler for the matched master URL
   * @throws SparkException if the master URL matches no known format, if a local-cluster is
   *                        requested with less memory per worker than `sc.executorMemory`, or if
   *                        the YARN scheduler/backend classes cannot be loaded reflectively
   */
  private def createTaskScheduler(sc: SparkContext, master: String): TaskScheduler = {
    // Regular expression used for local[N] and local[*] master formats
    val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r
    // Regular expression for local[N, maxRetries], used in tests with failing tasks
    val LOCAL_N_FAILURES_REGEX = """local\[([0-9]+|\*)\s*,\s*([0-9]+)\]""".r
    // Regular expression for simulating a Spark cluster of [N, cores, memory] locally
    val LOCAL_CLUSTER_REGEX = """local-cluster\[\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*]""".r
    // Regular expression for connecting to Spark deploy clusters
    val SPARK_REGEX = """spark://(.*)""".r
    // Regular expression for connection to Mesos cluster by mesos:// or zk:// url
    val MESOS_REGEX = """(mesos|zk)://.*""".r
    // Regular expression for connection to Simr cluster
    val SIMR_REGEX = """simr://(.*)""".r

    // When running locally, don't try to re-execute tasks on failure.
    val MAX_LOCAL_TASK_FAILURES = 1

    master match {
      case "local" =>
        val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
        val backend = new LocalBackend(scheduler, 1)
        scheduler.initialize(backend)
        scheduler

      case LOCAL_N_REGEX(threads) =>
        def localCpuCount = Runtime.getRuntime.availableProcessors()
        // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads.
        // NOTE(review): the regex also accepts local[0], which yields threadCount == 0 and is not
        // rejected here - confirm LocalBackend copes with that.
        val threadCount = if (threads == "*") localCpuCount else threads.toInt
        val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
        val backend = new LocalBackend(scheduler, threadCount)
        scheduler.initialize(backend)
        scheduler

      case LOCAL_N_FAILURES_REGEX(threads, maxFailures) =>
        def localCpuCount = Runtime.getRuntime.availableProcessors()
        // local[*, M] means the number of cores on the computer with M failures
        // local[N, M] means exactly N threads with M failures
        val threadCount = if (threads == "*") localCpuCount else threads.toInt
        val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true)
        val backend = new LocalBackend(scheduler, threadCount)
        scheduler.initialize(backend)
        scheduler

      case SPARK_REGEX(sparkUrl) =>
        val scheduler = new TaskSchedulerImpl(sc)
        // The captured part is split on commas and each piece gets the spark:// scheme back.
        val masterUrls = sparkUrl.split(",").map("spark://" + _)
        val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls)
        scheduler.initialize(backend)
        scheduler

      case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
        // Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang.
        val memoryPerSlaveInt = memoryPerSlave.toInt
        if (sc.executorMemory > memoryPerSlaveInt) {
          throw new SparkException(
            "Asked to launch cluster with %d MB RAM / worker but requested %d MB/worker".format(
              memoryPerSlaveInt, sc.executorMemory))
        }

        val scheduler = new TaskSchedulerImpl(sc)
        val localCluster = new LocalSparkCluster(
          numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt)
        val masterUrls = localCluster.start()
        val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls)
        scheduler.initialize(backend)
        // Stop the local cluster from the backend's shutdown callback.
        backend.shutdownCallback = (backend: SparkDeploySchedulerBackend) => {
          localCluster.stop()
        }
        scheduler

      case "yarn-standalone" | "yarn-cluster" =>
        if (master == "yarn-standalone") {
          logWarning(
            "\"yarn-standalone\" is deprecated as of Spark 1.0. Use \"yarn-cluster\" instead.")
        }
        // The YARN scheduler and backend classes are looked up by name via reflection.
        val scheduler = try {
          val clazz = Class.forName("org.apache.spark.scheduler.cluster.YarnClusterScheduler")
          val cons = clazz.getConstructor(classOf[SparkContext])
          cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl]
        } catch {
          // TODO: Enumerate the exact reasons why it can fail
          // But irrespective of it, it means we cannot proceed !
          case e: Exception => {
            throw new SparkException("YARN mode not available ?", e)
          }
        }
        val backend = try {
          val clazz =
            Class.forName("org.apache.spark.scheduler.cluster.YarnClusterSchedulerBackend")
          val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext])
          cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend]
        } catch {
          case e: Exception => {
            throw new SparkException("YARN mode not available ?", e)
          }
        }
        scheduler.initialize(backend)
        scheduler

      case "yarn-client" =>
        // Same reflective construction as the yarn-cluster branch, with the client-mode classes.
        val scheduler = try {
          val clazz =
            Class.forName("org.apache.spark.scheduler.cluster.YarnClientClusterScheduler")
          val cons = clazz.getConstructor(classOf[SparkContext])
          cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl]
        } catch {
          case e: Exception => {
            throw new SparkException("YARN mode not available ?", e)
          }
        }

        val backend = try {
          val clazz =
            Class.forName("org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend")
          val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext])
          cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend]
        } catch {
          case e: Exception => {
            throw new SparkException("YARN mode not available ?", e)
          }
        }

        scheduler.initialize(backend)
        scheduler

      case mesosUrl @ MESOS_REGEX(_) =>
        MesosNativeLibrary.load()
        val scheduler = new TaskSchedulerImpl(sc)
        // spark.mesos.coarse (default false) selects between the two Mesos backends.
        val coarseGrained = sc.conf.getBoolean("spark.mesos.coarse", false)
        val url = mesosUrl.stripPrefix("mesos://") // strip scheme from raw Mesos URLs
        val backend = if (coarseGrained) {
          new CoarseMesosSchedulerBackend(scheduler, sc, url)
        } else {
          new MesosSchedulerBackend(scheduler, sc, url)
        }
        scheduler.initialize(backend)
        scheduler

      case SIMR_REGEX(simrUrl) =>
        val scheduler = new TaskSchedulerImpl(sc)
        val backend = new SimrSchedulerBackend(scheduler, sc, simrUrl)
        scheduler.initialize(backend)
        scheduler

      case _ =>
        throw new SparkException("Could not parse Master URL: '" + master + "'")
    }
  }
} // closes the enclosing definition, whose header is outside this excerpt

让我们以Spark的Standalone模式为例，首先会创建一个TaskSchedulerImpl的实例对象(scheduler)，接着会创建SparkDeploySchedulerBackend实例对象(backend)，最后scheduler会初始化backend。由此可见，TaskSchedulerImpl的实例对象(scheduler)的主要目的在于启动backend。

 // Branch for master URLs matched by SPARK_REGEX: a TaskSchedulerImpl paired with a
 // SparkDeploySchedulerBackend.
 case SPARK_REGEX(sparkUrl) =>
   val scheduler = new TaskSchedulerImpl(sc)
   // The captured part is split on commas and each piece gets the spark:// scheme back.
   val masterUrls = sparkUrl.split(",").map("spark://" + _)
   val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls)
   scheduler.initialize(backend)
   scheduler

TaskSchedulerImpl主要负责为任务(Task)分配资源，创建实例对象时会在主构造方法(primary constructor)中读取相应配置。例如，spark.task.maxFailures - 任务(Task)失败重试次数、spark.speculation.interval - 每隔多长时间检查一次任务(Task)的运行状态，用于推测执行、spark.starvation.timeout - TaskSet饥饿超时时间、spark.task.cpus - 每个任务(Task)所需的CPU个数等。

/** * Schedules tasks for multiple types of clusters by acting through a SchedulerBackend. * It can also work with a local setup by using a LocalBackend and setting isLocal to true. * It handles common logic, like determining a scheduling order across jobs, waking up to launch * speculative tasks, etc. * * Clients should first call initialize() and start(), then submit task sets through the * runTasks method. * * THREADING: SchedulerBackends and task-submitting clients can call this class from multiple * threads, so it needs locks in public API methods to maintain its state. In addition, some * SchedulerBackends synchronize on themselves when they want to send events here, and then * acquire a lock on us, so we need to make sure that we don't try to lock the backend while * we are holding a lock on ourselves. */private[spark] class TaskSchedulerImpl(    val sc: SparkContext,    val maxTaskFailures: Int,    isLocal: Boolean = false)  extends TaskScheduler with Logging{  def this(sc: SparkContext) = this(sc, sc.conf.getInt("spark.task.maxFailures", 4))  val conf = sc.conf  // How often to check for speculative tasks  val SPECULATION_INTERVAL = conf.getLong("spark.speculation.interval", 100)  // Threshold above which we warn user initial TaskSet may be starved  val STARVATION_TIMEOUT = conf.getLong("spark.starvation.timeout", 15000)  // CPUs to request per task  val CPUS_PER_TASK = conf.getInt("spark.task.cpus", 1)

SparkDeploySchedulerBackend作为CoarseGrainedSchedulerBackend的子类，是任务(Task)的部署模块，专门负责收集为Application分配的Worker的资源信息。SparkDeploySchedulerBackend其核心是为了启动CoarseGrainedExecutorBackend，通过创建AppClient可以向Standalone的Master注册Application，而CoarseGrainedExecutorBackend就是Worker节点上的Executor。

/**
 * Scheduler backend that extends CoarseGrainedSchedulerBackend and mixes in AppClientListener
 * and Logging.
 *
 * @param scheduler the TaskSchedulerImpl, forwarded to the CoarseGrainedSchedulerBackend
 *                  constructor
 * @param sc the SparkContext; its `env.actorSystem` is forwarded to the parent constructor
 * @param masters master URLs - presumably the spark:// URLs built by the caller; TODO confirm
 */
private[spark] class SparkDeploySchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    masters: Array[String])
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem)
  with AppClientListener
  with Logging

SparkDeploySchedulerBackend实例对象(backend)创建后会被当作参数传入TaskSchedulerImpl中的initialize方法。

scheduler.initialize(backend)

在initialize方法中会创建任务的调度策略，任务的调度策略有两种-先进先出调度(FIFO)和公平调度(FAIR)。

  /**
   * Stores the given backend, creates the root pool and builds the scheduling pools with the
   * builder that corresponds to the current scheduling mode.
   *
   * @param backend the SchedulerBackend this scheduler will use
   */
  def initialize(backend: SchedulerBackend) {
    this.backend = backend
    // temporarily set rootPool name to empty
    rootPool = new Pool("", schedulingMode, 0, 0)
    // NOTE(review): only FIFO and FAIR are handled; any other mode value would raise a
    // MatchError here - confirm no other value can reach this point.
    schedulableBuilder = schedulingMode match {
      case SchedulingMode.FIFO => new FIFOSchedulableBuilder(rootPool)
      case SchedulingMode.FAIR => new FairSchedulableBuilder(rootPool, conf)
    }
    schedulableBuilder.buildPools()
  }

创建TaskScheduler的实例对象之后，接下来会执行TaskSchedulerImpl的start方法：

  // start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's
  // constructor
  taskScheduler.start()

TaskSchedulerImpl的start方法首先会通过backend.start()执行SparkDeploySchedulerBackend的start方法，这两个start方法的源码依次如下：

  /**
   * Starts the backend and, when not running locally and `spark.speculation` is enabled,
   * schedules a recurring check for speculatable tasks every SPECULATION_INTERVAL milliseconds.
   */
  override def start() {
    backend.start()

    // && short-circuits, so the setting is only read when not running locally.
    val runSpeculation = !isLocal && conf.getBoolean("spark.speculation", false)
    if (runSpeculation) {
      logInfo("Starting speculative execution thread")
      import sc.env.actorSystem.dispatcher
      // The same duration serves as both the initial delay and the repeat interval.
      val period = SPECULATION_INTERVAL.milliseconds
      sc.env.actorSystem.scheduler.schedule(period, period) {
        Utils.tryOrExit { checkSpeculatableTasks() }
      }
    }
  }

  /**
   * Starts the parent backend, builds the Command and ApplicationDescription that describe how
   * to run CoarseGrainedExecutorBackend, then creates and starts an AppClient with them.
   */
  override def start() {
    super.start()

    // Splits an optional path-list setting into its entries; empty when the key is unset.
    def pathEntries(key: String): Seq[String] =
      sc.conf.getOption(key).toSeq.flatMap { cp => cp.split(java.io.File.pathSeparator) }

    // The endpoint for executors to talk to us
    val driverUrl = "akka.tcp://%s@%s:%s/user/%s".format(
      SparkEnv.driverActorSystemName,
      conf.get("spark.driver.host"),
      conf.get("spark.driver.port"),
      CoarseGrainedSchedulerBackend.ACTOR_NAME)
    // The {{...}} entries are passed through verbatim as placeholder strings.
    val args = Seq(driverUrl, "{{EXECUTOR_ID}}", "{{HOSTNAME}}", "{{CORES}}", "{{WORKER_URL}}")

    val extraJavaOpts = sc.conf.getOption("spark.executor.extraJavaOptions")
      .map(Utils.splitCommandString)
      .getOrElse(Seq.empty)
    val classPathEntries = pathEntries("spark.executor.extraClassPath")
    val libraryPathEntries = pathEntries("spark.executor.extraLibraryPath")

    // Start executors with a few necessary configs for registering with the scheduler
    val sparkJavaOpts = Utils.sparkJavaOpts(conf, SparkConf.isExecutorStartupConf)
    val javaOpts = sparkJavaOpts ++ extraJavaOpts

    val command = Command(
      "org.apache.spark.executor.CoarseGrainedExecutorBackend",
      args,
      sc.executorEnvs,
      classPathEntries,
      libraryPathEntries,
      javaOpts)
    val appDesc = new ApplicationDescription(
      sc.appName,
      maxCores,
      sc.executorMemory,
      command,
      sc.ui.appUIAddress,
      sc.eventLogger.map(_.logDir))

    client = new AppClient(sc.env.actorSystem, masters, appDesc, this, conf)
    client.start()
  }

SparkDeploySchedulerBackend的start方法首先会通过super.start()执行父类CoarseGrainedSchedulerBackend的start方法，此时会获取以"spark."开头的配置信息保存在properties中，然后创建DriverActor。

  /**
   * Collects every configuration entry whose key starts with "spark." and creates the
   * DriverActor with those properties, registered under
   * CoarseGrainedSchedulerBackend.ACTOR_NAME.
   */
  override def start() {
    val properties = new ArrayBuffer[(String, String)]
    properties ++= scheduler.sc.conf.getAll.filter { case (key, _) => key.startsWith("spark.") }

    // TODO (prashant) send conf instead of properties
    val driverProps = Props(new DriverActor(properties))
    driverActor = actorSystem.actorOf(driverProps, name = CoarseGrainedSchedulerBackend.ACTOR_NAME)
  }

接下来会构造用于启动CoarseGrainedExecutorBackend的Command，并将其封装到ApplicationDescription中。

    // Start executors with a few necessary configs for registering with the scheduler
    val sparkJavaOpts = Utils.sparkJavaOpts(conf, SparkConf.isExecutorStartupConf)
    val javaOpts = sparkJavaOpts ++ extraJavaOpts
    // The command names CoarseGrainedExecutorBackend as the class to run.
    val command = Command("org.apache.spark.executor.CoarseGrainedExecutorBackend",
      args, sc.executorEnvs, classPathEntries, libraryPathEntries, javaOpts)
    val appDesc = new ApplicationDescription(sc.appName, maxCores, sc.executorMemory, command,
      sc.ui.appUIAddress, sc.eventLogger.map(_.logDir))

CoarseGrainedExecutorBackend启动后，会首先根据driverUrl获取Remote Actor，并向CoarseGrainedSchedulerBackend的DriverActor发送注册Executor(RegisterExecutor)的消息。此外，它需要接收并处理的消息主要有RegisteredExecutor、LaunchTask、KillTask、StopExecutor等。

// Resolves the driver from driverUrl, sends it a RegisterExecutor message and subscribes this
// actor to RemotingLifecycleEvent notifications on the system event stream.
override def preStart() {
  logInfo(s"Connecting to driver: $driverUrl")
  driver = context.actorSelection(driverUrl)

  val registration = RegisterExecutor(executorId, hostPort, cores)
  driver ! registration

  val lifecycleEvents = classOf[RemotingLifecycleEvent]
  context.system.eventStream.subscribe(self, lifecycleEvents)
}

// Message handler for this executor backend:
//  - RegisteredExecutor: creates the Executor for this backend.
//  - RegisterExecutorFailed: logs the failure and exits the JVM with status 1.
//  - LaunchTask / KillTask: forwarded to the executor; exit with status 1 if there is no executor.
//  - DisassociatedEvent: logs the event and exits the JVM with status 1.
//  - StopExecutor: stops the executor (if one exists), this actor and the actor system.
override def receiveWithLogging = {
  case RegisteredExecutor =>
    logInfo("Successfully registered with driver")
    // Make this host instead of hostPort ?
    executor = new Executor(executorId, Utils.parseHostPort(hostPort)._1, sparkProperties,
      false)

  case RegisterExecutorFailed(message) =>
    logError("Slave registration failed: " + message)
    System.exit(1)

  case LaunchTask(data) =>
    if (executor == null) {
      logError("Received LaunchTask command but executor was null")
      System.exit(1)
    } else {
      val ser = SparkEnv.get.closureSerializer.newInstance()
      val taskDesc = ser.deserialize[TaskDescription](data.value)
      logInfo("Got assigned task " + taskDesc.taskId)
      executor.launchTask(this, taskDesc.taskId, taskDesc.name, taskDesc.serializedTask)
    }

  case KillTask(taskId, _, interruptThread) =>
    if (executor == null) {
      logError("Received KillTask command but executor was null")
      System.exit(1)
    } else {
      executor.killTask(taskId, interruptThread)
    }

  case x: DisassociatedEvent =>
    logError(s"Driver $x disassociated! Shutting down.")
    System.exit(1)

  case StopExecutor =>
    logInfo("Driver commanded a shutdown")
    // executor is only assigned in the RegisteredExecutor case, so it can still be null when a
    // stop arrives first; skip the stop call in that case and continue shutting down.
    if (executor != null) {
      executor.stop()
    }
    context.stop(self)
    context.system.shutdown()
}

最后会创建AppClient的实例对象，并调用其start方法创建ClientActor：

    // Create the AppClient from the master URLs and the application description, then start it.
    // `this` is passed as the fourth argument - presumably the AppClientListener; TODO confirm.
    client = new AppClient(sc.env.actorSystem, masters, appDesc, this, conf)
    client.start()

  /** Creates the ClientActor in the actor system and keeps its reference in `actor`. */
  def start() {
    // Just launch an actor; it will call back into the listener.
    val clientProps = Props(new ClientActor)
    actor = actorSystem.actorOf(clientProps)
  }

0 0