spark 1.6.0 core源码分析7 Spark executor的运行

来源：互联网发布：安卓中控源码编辑：程序博客网时间：2024/06/05 18:20

源码位置：org.apache.spark.executor.CoarseGrainedExecutorBackend

/**
 * Bootstraps an executor backend process.
 *
 * Fetches the driver's Spark properties over a temporary RpcEnv, builds the executor-side
 * SparkEnv from them, registers the "Executor" endpoint (and a "WorkerWatcher" endpoint when
 * a worker URL is supplied), then blocks until the RpcEnv terminates.
 *
 * @param driverUrl     URL used to look up the driver endpoint
 * @param executorId    ID handed to the SparkEnv and to the executor backend
 * @param hostname      host of this executor; checked with Utils.checkHost
 * @param cores         number of cores handed to the SparkEnv and to the executor backend
 * @param appId         application ID, appended to the fetched properties as "spark.app.id"
 * @param workerUrl     when defined, a WorkerWatcher endpoint is set up for this URL
 * @param userClassPath class path entries forwarded to the executor backend
 */
private def run(
    driverUrl: String,
    executorId: String,
    hostname: String,
    cores: Int,
    appId: String,
    workerUrl: Option[String],
    userClassPath: Seq[URL]) {

  SignalLogger.register(log)

  SparkHadoopUtil.get.runAsSparkUser { () =>
    // Debug code
    Utils.checkHost(hostname)

    // Bootstrap to fetch the driver's Spark properties.
    val executorConf = new SparkConf // conf local to this executor process
    val port = executorConf.getInt("spark.executor.port", 0)
    // Client-mode RpcEnv used only for the property fetch; it is shut down right afterwards.
    val fetcher = RpcEnv.create(
      "driverPropsFetcher",
      hostname,
      port,
      executorConf,
      new SecurityManager(executorConf),
      clientMode = true)
    // Resolve a reference to the driver endpoint from its URL.
    val driver = fetcher.setupEndpointRefByURI(driverUrl)
    val props = driver.askWithRetry[Seq[(String, String)]](RetrieveSparkProps) ++
      Seq[(String, String)](("spark.app.id", appId))
    fetcher.shutdown()

    // Create SparkEnv using properties we fetched from the driver.
    val driverConf = new SparkConf() // conf populated from the driver's properties
    for ((key, value) <- props) {
      // this is required for SSL in standalone mode
      if (SparkConf.isExecutorStartupConf(key)) {
        driverConf.setIfMissing(key, value)
      } else {
        driverConf.set(key, value)
      }
    }
    if (driverConf.contains("spark.yarn.credentials.file")) {
      logInfo("Will periodically update credentials from: " +
        driverConf.get("spark.yarn.credentials.file"))
      SparkHadoopUtil.get.startExecutorDelegationTokenRenewer(driverConf)
    }

    // Create the executor-side SparkEnv (discussed after this listing).
    val env = SparkEnv.createExecutorEnv(
      driverConf, executorId, hostname, port, cores, isLocal = false)

    // SparkEnv will set spark.executor.port if the rpc env is listening for incoming
    // connections (e.g., if it's using akka). Otherwise, the executor is running in
    // client mode only, and does not accept incoming connections.
    val sparkHostPort = env.conf.getOption("spark.executor.port").map { port =>
        hostname + ":" + port
      }.orNull
    // Register the executor backend endpoint; its onStart mainly registers this executor
    // with the driver (discussed after this listing).
    env.rpcEnv.setupEndpoint("Executor", new CoarseGrainedExecutorBackend(
      env.rpcEnv, driverUrl, executorId, sparkHostPort, cores, userClassPath, env))
    workerUrl.foreach { url =>
      env.rpcEnv.setupEndpoint("WorkerWatcher", new WorkerWatcher(env.rpcEnv, url))
    }
    env.rpcEnv.awaitTermination()
    SparkHadoopUtil.get.stopExecutorDelegationTokenRenewer()
  }
}

先介绍createExecutorEnv，这个与driver端的几乎一样，之前已经介绍过了，这里就介绍一下与driver不同的地方
1、mapOutputTracker在Executor端是MapOutputTrackerWorker对象，mapOutputTracker.trackerEndpoint实际引用的是driver的ActorRef。
2、blockManagerMaster在内部保存的也是driver的ActorRef
3、outputCommitCoordinator.coordinatorRef实际包含的也是driver的ActorRef
现在介绍一下CoarseGrainedExecutorBackend的onStart方法，看它主动干了什么事。

发送RegisterExecutor消息到driver端，注册Executor。成功返回后再向自己发送RegisteredExecutor消息

/**
 * Looks up the driver endpoint asynchronously, asks it to register this executor, and
 * forwards the driver's response to this endpoint itself. If the exchange fails, the error
 * is logged and the JVM exits with status 1.
 */
override def onStart(): Unit = {
  logInfo("Connecting to driver: " + driverUrl)

  // The callback is a very fast action, so it is scheduled on ThreadUtils.sameThread.
  val registration = rpcEnv.asyncSetupEndpointRefByURI(driverUrl).flatMap { driverRef =>
    driver = Some(driverRef)
    driverRef.ask[RegisterExecutorResponse](
      RegisterExecutor(executorId, self, hostPort, cores, extractLogUrls))
  }(ThreadUtils.sameThread)

  // Handling the outcome is equally cheap, so ThreadUtils.sameThread is used again.
  registration.onComplete {
    case Failure(cause) =>
      logError(s"Cannot register with driver: $driverUrl", cause)
      System.exit(1)
    case Success(response) =>
      Utils.tryLogNonFatalError {
        // response must be a RegisterExecutorResponse
        Option(self).foreach(selfRef => selfRef.send(response))
      }
  }(ThreadUtils.sameThread)
}

// Handles an executor's registration request: a duplicate executor ID is rejected with
// RegisterExecutorFailed; otherwise the executor is recorded, the sender receives
// RegisteredExecutor, a SparkListenerExecutorAdded event is posted and offers are made.
case RegisterExecutor(executorId, executorRef, hostPort, cores, logUrls) =>
  if (executorDataMap.contains(executorId)) {
    context.reply(RegisterExecutorFailed("Duplicate executor ID: " + executorId))
  } else {
    // If the executor's rpc env is not listening for incoming connections, `hostPort`
    // will be null, and the client connection should be used to contact the executor.
    val executorAddress = if (executorRef.address != null) {
        executorRef.address
      } else {
        context.senderAddress
      }
    logInfo(s"Registered executor $executorRef ($executorAddress) with ID $executorId")
    // Update the address lookup and the running core / executor totals.
    addressToExecutorId(executorAddress) = executorId
    totalCoreCount.addAndGet(cores)
    totalRegisteredExecutors.addAndGet(1)
    val data = new ExecutorData(executorRef, executorRef.address, executorAddress.host,
      cores, cores, logUrls)
    // This must be synchronized because variables mutated
    // in this block are read when requesting executors
    CoarseGrainedSchedulerBackend.this.synchronized {
      executorDataMap.put(executorId, data)
      if (numPendingExecutors > 0) {
        numPendingExecutors -= 1
        logDebug(s"Decremented number of pending executors ($numPendingExecutors left)")
      }
    }
    // Note: some tests expect the reply to come after we put the executor in the map
    context.reply(RegisteredExecutor(executorAddress.host))
    listenerBus.post(
      SparkListenerExecutorAdded(System.currentTimeMillis(), executorId, data))
    // Called last, once the new executor has been recorded and acknowledged.
    makeOffers()
  }

看driver端接收到后如何处理？重点看最后的makeOffers。当有Executor注册上来之后，如果有等待执行的任务，这时就可以开始了。这个方法后续还会用到，且目前还没讲到任务调度的章节，后续再解释。这里只需要知道，Executor注册上来之后，会触发一次任务调度(如果有任务的话)

// Handles the RegisteredExecutor message carrying the hostname: logs the successful
// registration and creates the Executor instance stored in `executor`.
case RegisteredExecutor(hostname) =>
  logInfo("Successfully registered with driver")
  // isLocal = false is passed explicitly to the Executor constructor.
  executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false)

org.apache.spark.executor.Executor

/**
 * Runs a single task and reports its outcome through `execBackend.statusUpdate`.
 *
 * Steps, in order:
 *  1. Report RUNNING, then deserialize the task and update its file/jar dependencies.
 *  2. Run the task; afterwards free any managed memory it still holds (a leak is either
 *     thrown as a SparkException or logged, depending on
 *     "spark.unsafe.exceptionOnMemoryLeak" and on whether the task itself threw).
 *  3. Serialize the result and fill in the task metrics.
 *  4. Pick how the result is returned: dropped when larger than `maxResultSize`, stored via
 *     the block manager when it would not fit in a frame, otherwise sent directly.
 *  5. Report FINISHED, or FAILED / KILLED from the catch clauses.
 *
 * The task is always removed from `runningTasks` at the end. A fatal error is additionally
 * passed to SparkUncaughtExceptionHandler.
 */
override def run(): Unit = {
  val taskMemoryManager = new TaskMemoryManager(env.memoryManager, taskId)
  val deserializeStartTime = System.currentTimeMillis()
  Thread.currentThread.setContextClassLoader(replClassLoader)
  val ser = env.closureSerializer.newInstance()
  logInfo(s"Running $taskName (TID $taskId)")
  execBackend.statusUpdate(taskId, TaskState.RUNNING, EMPTY_BYTE_BUFFER)
  var taskStart: Long = 0
  startGCTime = computeTotalGcTime()

  try {
    val (taskFiles, taskJars, taskBytes) = Task.deserializeWithDependencies(serializedTask)
    updateDependencies(taskFiles, taskJars)
    task = ser.deserialize[Task[Any]](taskBytes, Thread.currentThread.getContextClassLoader)
    task.setTaskMemoryManager(taskMemoryManager)

    // If this task has been killed before we deserialized it, let's quit now. Otherwise,
    // continue executing the task.
    if (killed) {
      // Throw an exception rather than returning, because returning within a try{} block
      // causes a NonLocalReturnControl exception to be thrown. The NonLocalReturnControl
      // exception will be caught by the catch block, leading to an incorrect ExceptionFailure
      // for the task.
      throw new TaskKilledException
    }

    logDebug("Task " + taskId + "'s epoch is " + task.epoch)
    env.mapOutputTracker.updateEpoch(task.epoch)

    // Run the actual task and measure its runtime.
    taskStart = System.currentTimeMillis()
    // Stays true unless task.run returns normally; consulted by the leak check below.
    var threwException = true
    val (value, accumUpdates) = try {
      val res = task.run(
        taskAttemptId = taskId,
        attemptNumber = attemptNumber,
        metricsSystem = env.metricsSystem)
      threwException = false
      res
    } finally {
      val freedMemory = taskMemoryManager.cleanUpAllAllocatedMemory()
      if (freedMemory > 0) {
        val errMsg = s"Managed memory leak detected; size = $freedMemory bytes, TID = $taskId"
        // Only escalate the leak to an exception when the task itself did not throw.
        if (conf.getBoolean("spark.unsafe.exceptionOnMemoryLeak", false) && !threwException) {
          throw new SparkException(errMsg)
        } else {
          logError(errMsg)
        }
      }
    }
    val taskFinish = System.currentTimeMillis()

    // If the task has been killed, let's fail it.
    if (task.killed) {
      throw new TaskKilledException
    }

    val resultSer = env.serializer.newInstance()
    val beforeSerialization = System.currentTimeMillis()
    val valueBytes = resultSer.serialize(value)
    val afterSerialization = System.currentTimeMillis()

    for (m <- task.metrics) {
      // Deserialization happens in two parts: first, we deserialize a Task object, which
      // includes the Partition. Second, Task.run() deserializes the RDD and function to be run.
      m.setExecutorDeserializeTime(
        (taskStart - deserializeStartTime) + task.executorDeserializeTime)
      // We need to subtract Task.run()'s deserialization time to avoid double-counting
      m.setExecutorRunTime((taskFinish - taskStart) - task.executorDeserializeTime)
      m.setJvmGCTime(computeTotalGcTime() - startGCTime)
      m.setResultSerializationTime(afterSerialization - beforeSerialization)
      m.updateAccumulators()
    }

    val directResult = new DirectTaskResult(valueBytes, accumUpdates, task.metrics.orNull)
    val serializedDirectResult = ser.serialize(directResult)
    val resultSize = serializedDirectResult.limit

    // directSend = sending directly back to the driver
    val serializedResult: ByteBuffer = {
      if (maxResultSize > 0 && resultSize > maxResultSize) {
        logWarning(s"Finished $taskName (TID $taskId). Result is larger than maxResultSize " +
          s"(${Utils.bytesToString(resultSize)} > ${Utils.bytesToString(maxResultSize)}), " +
          s"dropping it.")
        ser.serialize(new IndirectTaskResult[Any](TaskResultBlockId(taskId), resultSize))
      } else if (resultSize >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
        val blockId = TaskResultBlockId(taskId)
        env.blockManager.putBytes(
          blockId, serializedDirectResult, StorageLevel.MEMORY_AND_DISK_SER)
        logInfo(
          s"Finished $taskName (TID $taskId). $resultSize bytes result sent via BlockManager)")
        ser.serialize(new IndirectTaskResult[Any](blockId, resultSize))
      } else {
        logInfo(s"Finished $taskName (TID $taskId). $resultSize bytes result sent to driver")
        serializedDirectResult
      }
    }

    execBackend.statusUpdate(taskId, TaskState.FINISHED, serializedResult)

  } catch {
    case ffe: FetchFailedException =>
      val reason = ffe.toTaskEndReason
      execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason))

    case _: TaskKilledException | _: InterruptedException if task.killed =>
      logInfo(s"Executor killed $taskName (TID $taskId)")
      execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled))

    case cDE: CommitDeniedException =>
      val reason = cDE.toTaskEndReason
      execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason))

    case t: Throwable =>
      // Attempt to exit cleanly by informing the driver of our failure.
      // If anything goes wrong (or this was a fatal exception), we will delegate to
      // the default uncaught exception handler, which will terminate the Executor.
      logError(s"Exception in $taskName (TID $taskId)", t)

      // Option(task) guards against `task` still being null at this point.
      val metrics: Option[TaskMetrics] = Option(task).flatMap { task =>
        task.metrics.map { m =>
          m.setExecutorRunTime(System.currentTimeMillis() - taskStart)
          m.setJvmGCTime(computeTotalGcTime() - startGCTime)
          m.updateAccumulators()
          m
        }
      }
      val serializedTaskEndReason = {
        try {
          ser.serialize(new ExceptionFailure(t, metrics))
        } catch {
          case _: NotSerializableException =>
            // t is not serializable so just send the stacktrace
            ser.serialize(new ExceptionFailure(t, metrics, false))
        }
      }
      execBackend.statusUpdate(taskId, TaskState.FAILED, serializedTaskEndReason)

      // Don't forcibly exit unless the exception was inherently fatal, to avoid
      // stopping other tasks unnecessarily.
      if (Utils.isFatalError(t)) {
        SparkUncaughtExceptionHandler.uncaughtException(t)
      }

  } finally {
    runningTasks.remove(taskId)
  }
}
} // closes the enclosing definition, whose opening lies outside this excerpt

0 0