SparkContext Source Code Analysis


SparkContext startup does three main things: 1. start the web UI, 2. create the TaskScheduler, 3. create the DAGScheduler.
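To make the entry point concrete, here is a minimal driver program (a sketch; the object name, app name and master are chosen only for illustration) whose SparkContext constructor triggers exactly this startup sequence:

import org.apache.spark.{SparkConf, SparkContext}

object SparkContextStartupDemo {
  def main(args: Array[String]): Unit = {
    // Constructing the SparkContext runs the startup path traced below:
    // it starts the web UI, creates the TaskScheduler (together with its
    // SchedulerBackend), and creates the DAGScheduler.
    val conf = new SparkConf().setAppName("sc-startup-demo").setMaster("local[2]")
    val sc = new SparkContext(conf)
    sc.parallelize(1 to 10).count()   // exercises the DAGScheduler / TaskScheduler
    sc.stop()
  }
}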

Creating the web UI

    _ui =
      if (conf.getBoolean("spark.ui.enabled", true)) {
        Some(SparkUI.createLiveUI(this, _conf, listenerBus, _jobProgressListener,
          _env.securityManager, appName, startTime = startTime))
      } else {
        // For tests, do not enable the UI
        None
      }
    // Bind the UI before starting the task scheduler to communicate
    // the bound port to the cluster manager properly
    _ui.foreach(_.bind())
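As a side note, the spark.ui.enabled flag checked above is an ordinary configuration key; a small sketch (object name made up) of turning the UI off, in which case _ui stays None and no port is bound:

import org.apache.spark.{SparkConf, SparkContext}

object NoUiDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("no-ui-demo")
      .setMaster("local[*]")
      .set("spark.ui.enabled", "false")   // the else branch above is taken
    val sc = new SparkContext(conf)
    println(sc.uiWebUrl)                  // prints None: the UI was never started or bound
    sc.stop()
  }
}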

Creating the TaskScheduler and DAGScheduler

    // Create and start the scheduler
    val (sched, ts) = SparkContext.createTaskScheduler(this, master, deployMode)
    _schedulerBackend = sched
    _taskScheduler = ts
    _dagScheduler = new DAGScheduler(this)
    _heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet)

    // start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's
    // constructor
    _taskScheduler.start()

Stepping into the createTaskScheduler method, we can see how the various deploy modes are matched.

private def createTaskScheduler(
    sc: SparkContext,
    master: String,
    deployMode: String): (SchedulerBackend, TaskScheduler) = {
  import SparkMasterRegex._

  // When running locally, don't try to re-execute tasks on failure.
  val MAX_LOCAL_TASK_FAILURES = 1

  master match {
    case "local" =>
      val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
      val backend = new LocalSchedulerBackend(sc.getConf, scheduler, 1)
      scheduler.initialize(backend)
      (backend, scheduler)

    case LOCAL_N_REGEX(threads) =>
      def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
      // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads.
      val threadCount = if (threads == "*") localCpuCount else threads.toInt
      if (threadCount <= 0) {
        throw new SparkException(s"Asked to run locally with $threadCount threads")
      }
      val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
      val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount)
      scheduler.initialize(backend)
      (backend, scheduler)

    case LOCAL_N_FAILURES_REGEX(threads, maxFailures) =>
      def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
      // local[*, M] means the number of cores on the computer with M failures
      // local[N, M] means exactly N threads with M failures
      val threadCount = if (threads == "*") localCpuCount else threads.toInt
      val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true)
      val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount)
      scheduler.initialize(backend)
      (backend, scheduler)

    case SPARK_REGEX(sparkUrl) =>
      val scheduler = new TaskSchedulerImpl(sc)
      val masterUrls = sparkUrl.split(",").map("spark://" + _)
      val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls)
      scheduler.initialize(backend)
      (backend, scheduler)

    case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
      // Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang.
      val memoryPerSlaveInt = memoryPerSlave.toInt
      if (sc.executorMemory > memoryPerSlaveInt) {
        throw new SparkException(
          "Asked to launch cluster with %d MB RAM / worker but requested %d MB/worker".format(
            memoryPerSlaveInt, sc.executorMemory))
      }
      val scheduler = new TaskSchedulerImpl(sc)
      val localCluster = new LocalSparkCluster(
        numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt, sc.conf)
      val masterUrls = localCluster.start()
      val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls)
      scheduler.initialize(backend)
      backend.shutdownCallback = (backend: StandaloneSchedulerBackend) => {
        localCluster.stop()
      }
      (backend, scheduler)

    case masterUrl =>
      val cm = getClusterManager(masterUrl) match {
        case Some(clusterMgr) => clusterMgr
        case None => throw new SparkException("Could not parse Master URL: '" + master + "'")
      }
      try {
        val scheduler = cm.createTaskScheduler(sc, masterUrl)
        val backend = cm.createSchedulerBackend(sc, masterUrl, scheduler)
        cm.initialize(scheduler, backend)
        (backend, scheduler)
      } catch {
        case se: SparkException => throw se
        case NonFatal(e) =>
          throw new SparkException("External scheduler cannot be instantiated", e)
      }
  }
}
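To make the pattern match concrete, here is a sketch (object name and hostnames are made up) listing typical master strings and the branch each one would take in createTaskScheduler:

import org.apache.spark.SparkConf

object MasterUrlExamples {
  def main(args: Array[String]): Unit = {
    // Example master strings and the case each one matches above:
    val examples = Seq(
      "local",                          // case "local": one thread, MAX_LOCAL_TASK_FAILURES = 1
      "local[4]",                       // LOCAL_N_REGEX: exactly 4 threads
      "local[*]",                       // LOCAL_N_REGEX: one thread per available core
      "local[4,3]",                     // LOCAL_N_FAILURES_REGEX: 4 threads, up to 3 task failures
      "spark://host1:7077,host2:7077",  // SPARK_REGEX: standalone, comma-separated master list
      "local-cluster[2,1,1024]",        // LOCAL_CLUSTER_REGEX: 2 workers, 1 core and 1024 MB each
      "yarn"                            // default case: resolved via getClusterManager
    )
    examples.foreach(m => println(new SparkConf().setMaster(m).get("spark.master")))
  }
}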

Look at the SPARK_REGEX case, which corresponds to standalone mode. It creates a TaskSchedulerImpl and a StandaloneSchedulerBackend, then calls TaskSchedulerImpl's initialize method, passing the StandaloneSchedulerBackend in as an argument.

Next, back in SparkContext, the following call is made:

_taskScheduler.start()
Stepping into this start method, we can see that it in turn calls the StandaloneSchedulerBackend's start method:

  override def start() {
    backend.start()

    if (!isLocal && conf.getBoolean("spark.speculation", false)) {
      logInfo("Starting speculative execution thread")
      speculationScheduler.scheduleAtFixedRate(new Runnable {
        override def run(): Unit = Utils.tryOrStopSparkContext(sc) {
          checkSpeculatableTasks()
        }
      }, SPECULATION_INTERVAL_MS, SPECULATION_INTERVAL_MS, TimeUnit.MILLISECONDS)
    }
  }
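The spark.speculation flag read above is a regular configuration key; a hedged sketch of enabling it (object name and values are examples only; note the !isLocal guard, so the speculation thread only starts against a real cluster):

import org.apache.spark.SparkConf

object SpeculationConfSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("speculation-demo")
      .set("spark.speculation", "true")             // turns on the branch shown above
      .set("spark.speculation.interval", "200ms")   // how often stragglers are checked
    conf.getAll.foreach { case (k, v) => println(s"$k = $v") }
  }
}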

Let's step into StandaloneSchedulerBackend's start method:

  override def start() {
    super.start()
    launcherBackend.connect()

    // The endpoint for executors to talk to us
    val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName,
      RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt),
      CoarseGrainedSchedulerBackend.ENDPOINT_NAME)
    val args = Seq(
      "--driver-url", driverUrl,
      "--executor-id", "{{EXECUTOR_ID}}",
      "--hostname", "{{HOSTNAME}}",
      "--cores", "{{CORES}}",
      "--app-id", "{{APP_ID}}",
      "--worker-url", "{{WORKER_URL}}")
    val extraJavaOpts = sc.conf.getOption("spark.executor.extraJavaOptions")
      .map(Utils.splitCommandString).getOrElse(Seq.empty)
    val classPathEntries = sc.conf.getOption("spark.executor.extraClassPath")
      .map(_.split(java.io.File.pathSeparator).toSeq).getOrElse(Nil)
    val libraryPathEntries = sc.conf.getOption("spark.executor.extraLibraryPath")
      .map(_.split(java.io.File.pathSeparator).toSeq).getOrElse(Nil)

    // When testing, expose the parent class path to the child. This is processed by
    // compute-classpath.{cmd,sh} and makes all needed jars available to child processes
    // when the assembly is built with the "*-provided" profiles enabled.
    val testingClassPath =
      if (sys.props.contains("spark.testing")) {
        sys.props("java.class.path").split(java.io.File.pathSeparator).toSeq
      } else {
        Nil
      }

    // Start executors with a few necessary configs for registering with the scheduler
    val sparkJavaOpts = Utils.sparkJavaOpts(conf, SparkConf.isExecutorStartupConf)
    val javaOpts = sparkJavaOpts ++ extraJavaOpts
    val command = Command("org.apache.spark.executor.CoarseGrainedExecutorBackend",
      args, sc.executorEnvs, classPathEntries ++ testingClassPath, libraryPathEntries, javaOpts)
    val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")
    val coresPerExecutor = conf.getOption("spark.executor.cores").map(_.toInt)
    val appDesc = new ApplicationDescription(sc.appName, maxCores, sc.executorMemory,
      command, appUIAddress, sc.eventLogDir, sc.eventLogCodec, coresPerExecutor)
    // Create the AppClient and call its start method
    client = new AppClient(sc.env.rpcEnv, masters, appDesc, this, conf)
    client.start()
    launcherBackend.setState(SparkAppHandle.State.SUBMITTED)
    waitForRegistration()
    launcherBackend.setState(SparkAppHandle.State.RUNNING)
  }

As we can see, this method gathers all kinds of parameters: Java options, SparkConf settings, command-line arguments, the application UI address, CPU and memory settings, and so on. All of them are used to build an ApplicationDescription object:

    val appDesc = new ApplicationDescription(sc.appName, maxCores, sc.executorMemory, command,
      appUIAddress, sc.eventLogDir, sc.eventLogCodec, coresPerExecutor, initialExecutorLimit)
This case class describes the application's information:

private[spark] case class ApplicationDescription(
    name: String,
    maxCores: Option[Int],
    memoryPerExecutorMB: Int,
    command: Command,
    appUiUrl: String,
    eventLogDir: Option[URI] = None,
    // short name of compression codec used when writing event logs, if any (e.g. lzf)
    eventLogCodec: Option[String] = None,
    coresPerExecutor: Option[Int] = None,
    // number of executors this application wants to start with,
    // only used if dynamic allocation is enabled
    initialExecutorLimit: Option[Int] = None,
    user: String = System.getProperty("user.name", "<unknown>")) {

  override def toString: String = "ApplicationDescription(" + name + ")"
}
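Many of these fields are ultimately driven by user-facing configuration. The following sketch (object name and values are made up) lists the standard settings that feed them, based on the code shown earlier:

import org.apache.spark.SparkConf

object AppDescConfSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("my-app")                   // -> name
      .set("spark.cores.max", "8")            // -> maxCores (standalone mode)
      .set("spark.executor.memory", "2g")     // -> memoryPerExecutorMB, via sc.executorMemory
      .set("spark.executor.cores", "2")       // -> coresPerExecutor
      .set("spark.eventLog.enabled", "true")  // together with spark.eventLog.dir -> eventLogDir
    conf.getAll.foreach { case (k, v) => println(s"$k = $v") }
  }
}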

Once the ApplicationDescription has been created, an AppClient object is constructed and its start method is called, which sends the application information to the Master.

  def start() {
    // Just launch an rpcEndpoint; it will call back into the listener.
    endpoint.set(rpcEnv.setupEndpoint("AppClient", new ClientEndpoint(rpcEnv)))
  }
The start method registers a ClientEndpoint with the rpcEnv. Looking at the ClientEndpoint class, we can see that it extends ThreadSafeRpcEndpoint, so the rpcEnv will invoke its onStart method:

    override def onStart(): Unit = {
      try {
        registerWithMaster(1)
      } catch {
        case e: Exception =>
          logWarning("Failed to connect to master", e)
          markDisconnected()
          stop()
      }
    }
    private def registerWithMaster(nthRetry: Int) {
      // Register with all masters
      registerMasterFutures.set(tryRegisterAllMasters())
      registrationRetryTimer.set(registrationRetryThread.scheduleAtFixedRate(new Runnable {
        override def run(): Unit = {
          Utils.tryOrExit {
            if (registered.get) {
              registerMasterFutures.get.foreach(_.cancel(true))
              registerMasterThreadPool.shutdownNow()
            } else if (nthRetry >= REGISTRATION_RETRIES) {
              markDead("All masters are unresponsive! Giving up.")
            } else {
              registerMasterFutures.get.foreach(_.cancel(true))
              registerWithMaster(nthRetry + 1)
            }
          }
        }
      }, REGISTRATION_TIMEOUT_SECONDS, REGISTRATION_TIMEOUT_SECONDS, TimeUnit.SECONDS))
    }
    private def tryRegisterAllMasters(): Array[JFuture[_]] = {
      // Iterate over all master addresses
      for (masterAddress <- masterRpcAddresses) yield {
        registerMasterThreadPool.submit(new Runnable {
          override def run(): Unit = try {
            if (registered.get) {
              return
            }
            logInfo("Connecting to master " + masterAddress.toSparkURL + "...")
            // Obtain an RPC endpoint reference to the master
            val masterRef =
              rpcEnv.setupEndpointRef(Master.SYSTEM_NAME, masterAddress, Master.ENDPOINT_NAME)
            // Send the RegisterApplication message
            masterRef.send(RegisterApplication(appDescription, self))
          } catch {
            case ie: InterruptedException => // Cancelled
            case NonFatal(e) => logWarning(s"Failed to connect to master $masterAddress", e)
          }
        })
      }
    }
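The retry logic in registerWithMaster can be illustrated in isolation. Below is a self-contained sketch of the same pattern, "retry until registered or the attempts are exhausted", using a plain ScheduledExecutorService instead of Spark's internal thread pools (all names and constants here are made up for illustration):

import java.util.concurrent.{Executors, TimeUnit}
import java.util.concurrent.atomic.AtomicBoolean

object RetrySketch {
  private val registered = new AtomicBoolean(false)
  private val scheduler = Executors.newSingleThreadScheduledExecutor()
  private val MAX_RETRIES = 3
  private val TIMEOUT_SECONDS = 5L

  def registerWithMaster(nthRetry: Int): Unit = {
    tryRegister()                          // fire off the (asynchronous) attempt
    scheduler.schedule(new Runnable {
      override def run(): Unit = {
        if (registered.get) {
          scheduler.shutdownNow()          // success: stop retrying
        } else if (nthRetry >= MAX_RETRIES) {
          println("All masters are unresponsive! Giving up.")
          scheduler.shutdownNow()
        } else {
          registerWithMaster(nthRetry + 1) // timed out: try again
        }
      }
    }, TIMEOUT_SECONDS, TimeUnit.SECONDS)
  }

  private def tryRegister(): Unit = {
    // Placeholder for sending RegisterApplication to each master;
    // a real attempt would flip `registered` to true on success.
  }

  def main(args: Array[String]): Unit = registerWithMaster(1)
}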


Note that StandaloneSchedulerBackend's start method begins with a call to super.start(), which is CoarseGrainedSchedulerBackend's start method:

  override def start() {
    val properties = new ArrayBuffer[(String, String)]
    for ((key, value) <- scheduler.sc.conf.getAll) {
      if (key.startsWith("spark.")) {
        properties += ((key, value))
      }
    }

    // TODO (prashant) send conf instead of properties
    driverEndpoint = rpcEnv.setupEndpoint(ENDPOINT_NAME, createDriverEndpoint(properties))
  }
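The loop above simply forwards every spark.* entry of the driver's SparkConf to the DriverEndpoint. That filtering can be reproduced with the public SparkConf API alone; a small sketch (object name and keys are examples):

import org.apache.spark.SparkConf

object SparkPropsFilterSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(loadDefaults = false)
      .set("spark.executor.memory", "1g")
      .set("not.a.spark.key", "ignored")
    // Keep only the keys that start() above would forward to the DriverEndpoint
    val properties = conf.getAll.filter { case (key, _) => key.startsWith("spark.") }
    properties.foreach { case (key, value) => println(s"$key -> $value") }
  }
}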
  protected def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
    new DriverEndpoint(rpcEnv, properties)
  }
The DriverEndpoint class extends ThreadSafeRpcEndpoint:
class DriverEndpoint(override val rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
  extends ThreadSafeRpcEndpoint with Logging
so when setupEndpoint registers it, its onStart() method is invoked:

    override def onStart() {
      // Periodically revive offers to allow delay scheduling to work
      val reviveIntervalMs = conf.getTimeAsMs("spark.scheduler.revive.interval", "1s")
      reviveThread.scheduleAtFixedRate(new Runnable {
        override def run(): Unit = Utils.tryLogNonFatalError {
          Option(self).foreach(_.send(ReviveOffers))
        }
      }, 0, reviveIntervalMs, TimeUnit.MILLISECONDS)
    }
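Finally, the revive interval read in onStart is an ordinary configuration key with a 1s default; a minimal sketch (object name and value are examples only) of overriding it:

import org.apache.spark.SparkConf

object ReviveIntervalSketch {
  def main(args: Array[String]): Unit = {
    // The DriverEndpoint will then send itself ReviveOffers every 500 ms instead of every 1 s.
    val conf = new SparkConf()
      .setAppName("revive-demo")
      .set("spark.scheduler.revive.interval", "500ms")
    println(conf.get("spark.scheduler.revive.interval"))
  }
}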