Spark2.2 Worker原理剖析图解及源码剖析

来源：互联网 ｜ 发布：电脑版淘宝微淘哪里看 ｜ 编辑：程序博客网 ｜ 时间：2024/05/17 05:50

Worker原理剖析图解

（图：Worker 原理剖析示意图——原文图片在转载时丢失，此处仅剩占位文字）

LaunchExecutor()源码剖析

（图：LaunchExecutor() 源码流程示意图——原文图片在转载时丢失，此处仅剩占位文字）

LaunchExecutor

    /**
     * Handles a `LaunchExecutor` request.
     *
     * A request whose `masterUrl` differs from `activeMasterUrl` is logged and ignored.
     * Otherwise the handler creates the executor's working directory, resolves (or creates)
     * the application's local directories, registers and starts an `ExecutorRunner`, adds the
     * requested cores/memory to the worker's usage counters, and reports the runner's state
     * to the master.
     *
     * Any `Exception` raised along the way is logged; a runner that was already registered
     * is killed and unregistered, and `ExecutorState.FAILED` is reported to the master.
     */
    case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
      if (masterUrl != activeMasterUrl) {
        logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
      } else {
        // Key under which this executor is tracked in `executors`; also the relative path
        // of its working directory below `workDir`.
        val fullId = appId + "/" + execId
        try {
          logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

          // Create the executor's working directory and fail fast if that is not possible.
          val executorDir = new File(workDir, fullId)
          if (!executorDir.mkdirs()) {
            throw new IOException("Failed to create directory " + executorDir)
          }

          // Local directories for the application. Reuse the ones already recorded for this
          // appId; otherwise create one "executor"-prefixed directory under every local root
          // directory, skipping roots where creation fails with an IOException.
          val appLocalDirs = appDirectories.getOrElse(appId, {
            val localRootDirs = Utils.getOrCreateLocalRootDirs(conf)
            val dirs = localRootDirs.flatMap { dir =>
              try {
                val appDir = Utils.createDirectory(dir, namePrefix = "executor")
                Utils.chmod700(appDir)
                Some(appDir.getAbsolutePath())
              } catch {
                case e: IOException =>
                  logWarning(s"${e.getMessage}. Ignoring this directory.")
                  None
              }
            }.toSeq
            // At least one usable local directory is required.
            if (dirs.isEmpty) {
              throw new IOException("No subfolder can be created in " +
                s"${localRootDirs.mkString(",")}.")
            }
            dirs
          })
          appDirectories(appId) = appLocalDirs

          // Create the runner that will launch and supervise the executor process.
          val manager = new ExecutorRunner(
            appId,
            execId,
            appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
            cores_,
            memory_,
            self,
            workerId,
            host,
            webUi.boundPort,
            publicAddress,
            sparkHome,
            executorDir,
            workerUri,
            conf,
            appLocalDirs, ExecutorState.RUNNING)
          // Register before starting so the failure path below can find and kill it.
          executors(fullId) = manager
          manager.start()

          // Account for the resources now occupied by this executor.
          coresUsed += cores_
          memoryUsed += memory_

          // Tell the master which state the executor is in.
          sendToMaster(ExecutorStateChanged(appId, execId, manager.state, None, None))
        } catch {
          case e: Exception =>
            logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
            // Kill and unregister the runner if it was registered before the failure.
            executors.get(fullId).foreach { runner =>
              runner.kill()
              executors -= fullId
            }
            sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
              Some(e.toString), None))
        }
      }

ExecutorRunner的start方法

  /**
   * Starts this runner.
   *
   * Spawns a dedicated thread that runs `fetchAndRunExecutor()`, then registers a shutdown
   * hook that kills the executor process via `killProcess`.
   */
  private[worker] def start(): Unit = {
    // Run the executor launch on its own thread.
    workerThread = new Thread("ExecutorRunner for " + fullId) {
      override def run(): Unit = {
        fetchAndRunExecutor()
      }
    }
    workerThread.start()

    shutdownHook = ShutdownHookManager.addShutdownHook { () =>
      // The hook may fire while the state is still RUNNING (for example before
      // fetchAndRunExecutor has had a chance to update it); report FAILED in that case.
      if (state == ExecutorState.RUNNING) {
        state = ExecutorState.FAILED
      }
      // Kill the executor process and notify the worker.
      killProcess(Some("Worker shutting down"))
    }
  }

fetchAndRunExecutor

  /**
   * Launches the executor described by the application description and waits for it to exit.
   *
   * Builds the launch command, starts it as a child process in `executorDir`, redirects the
   * process's stdout and stderr to the files `stdout` and `stderr` in that directory, and
   * blocks until the process terminates.
   *
   * When the process exits, `state` becomes `ExecutorState.EXITED` and an
   * `ExecutorStateChanged` message carrying the exit code is sent to the worker. If the
   * thread is interrupted, `state` becomes `KILLED`; on any other `Exception` it becomes
   * `FAILED`. In both of those cases `killProcess` is invoked.
   */
  private def fetchAndRunExecutor(): Unit = {
    try {
      // Build the process launcher for the executor command.
      val builder = CommandUtils.buildProcessBuilder(appDesc.command, new SecurityManager(conf),
        memory, sparkHome.getAbsolutePath, substituteVariables)
      val command = builder.command()
      // Quote every argument so the logged command line is unambiguous.
      val formattedCommand = command.asScala.mkString("\"", "\" \"", "\"")
      logInfo(s"Launch command: $formattedCommand")

      builder.directory(executorDir)
      builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
      builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")

      // Base URL of the log page for this executor; goes through the reverse proxy path
      // when "spark.ui.reverseProxy" is enabled, otherwise points at the worker web UI.
      val baseUrl =
        if (conf.getBoolean("spark.ui.reverseProxy", false)) {
          s"/proxy/$workerId/logPage/?appId=$appId&executorId=$execId&logType="
        } else {
          s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
        }
      builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
      builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

      // Start the executor process.
      process = builder.start()
      val header = "Spark Executor Command: %s\n%s\n\n".format(
        formattedCommand, "=" * 40)

      // Redirect the process's stdout and stderr to files; the stderr file is first
      // seeded with a header containing the launch command.
      val stdout = new File(executorDir, "stdout")
      stdoutAppender = FileAppender(process.getInputStream, stdout, conf)

      val stderr = new File(executorDir, "stderr")
      Files.write(header, stderr, StandardCharsets.UTF_8)
      stderrAppender = FileAppender(process.getErrorStream, stderr, conf)

      // Block until the process exits, with either a zero or a non-zero exit code.
      val exitCode = process.waitFor()
      state = ExecutorState.EXITED
      val message = "Command exited with code " + exitCode
      // Report the state change to the worker.
      worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode)))
    } catch {
      case _: InterruptedException =>
        logInfo("Runner thread for executor " + fullId + " interrupted")
        state = ExecutorState.KILLED
        killProcess(None)
      case e: Exception =>
        logError("Error running executor", e)
        state = ExecutorState.FAILED
        killProcess(Some(e.toString))
    }
  }

killProcess

  /**
   * Kills the executor process (if one was started), waits for it to exit, and notifies
   * the worker of the executor's current state.
   *
   * @param message the message describing why the executor is being killed, if any; it is
   *                forwarded unchanged in the `ExecutorStateChanged` notification
   */
  private def killProcess(message: Option[String]): Unit = {
    // Exit code of the terminated process; None when no process exists or when
    // termination did not yield an exit code.
    val exitCode: Option[Int] =
      if (process != null) {
        logInfo("Killing process!")
        // Stop the output appenders before terminating the process.
        if (stdoutAppender != null) {
          stdoutAppender.stop()
        }
        if (stderrAppender != null) {
          stderrAppender.stop()
        }
        val terminated = Utils.terminateProcess(process, EXECUTOR_TERMINATE_TIMEOUT_MS)
        if (terminated.isEmpty) {
          logWarning("Failed to terminate process: " + process +
            ". This process will likely be orphaned.")
        }
        terminated
      } else {
        None
      }

    try {
      worker.send(ExecutorStateChanged(appId, execId, state, message, exitCode))
    } catch {
      // Sending is best effort here: an IllegalStateException is logged, not propagated.
      case e: IllegalStateException => logWarning(e.getMessage(), e)
    }
  }

ExecutorStateChanged

/** On receiving an `ExecutorStateChanged` message, delegate to `handleExecutorStateChanged`. */
case stateChange: ExecutorStateChanged =>
  handleExecutorStateChanged(stateChange)

handleExecutorStateChanged

  /**
   * Processes an executor state change on the worker.
   *
   * The message is always forwarded to the master. If the new state is a finished state
   * (per `ExecutorState.isFinished`, which also covers abnormal terminations, not only
   * successful completion), a known executor is moved from `executors` to
   * `finishedExecutors` and its cores and memory are released; then
   * `maybeCleanupApplication` is called for the owning application.
   *
   * @param executorStateChanged the state-change message to process
   */
  private[worker] def handleExecutorStateChanged(executorStateChanged: ExecutorStateChanged):
    Unit = {
    // Forward the message to the master.
    sendToMaster(executorStateChanged)
    val state = executorStateChanged.state
    if (ExecutorState.isFinished(state)) {
      val appId = executorStateChanged.appId
      val fullId = appId + "/" + executorStateChanged.execId
      val message = executorStateChanged.message
      val exitStatus = executorStateChanged.exitStatus

      // Optional " message ..." / " exitStatus ..." suffix shared by both log lines below.
      // Kept as a def so it is only built when a log line is actually evaluated.
      def details: String =
        message.map(" message " + _).getOrElse("") +
          exitStatus.map(" exitStatus " + _).getOrElse("")

      executors.get(fullId) match {
        case Some(executor) =>
          logInfo("Executor " + fullId + " finished with state " + state + details)
          // Move the executor to the finished set and release its resources.
          executors -= fullId
          finishedExecutors(fullId) = executor
          trimFinishedExecutorsIfNecessary()
          coresUsed -= executor.cores
          memoryUsed -= executor.memory
        case None =>
          logInfo("Unknown Executor " + fullId + " finished with state " + state + details)
      }
      // Presumably cleans up the application's resources once the whole application is
      // done; the helper is defined outside this snippet.
      maybeCleanupApplication(appId)
    }
  }

Master 端对 ExecutorStateChanged 消息的处理

    /**
     * Handles an executor state change reported to the master.
     *
     * Looks up the executor through its application, records the new state, forwards the
     * update to the application's driver and, for finished states, removes the executor
     * from the application and its worker. After repeated abnormal exits the whole
     * application may be removed. Scheduling is re-run afterwards. Updates for unknown
     * executors are logged and ignored.
     */
    case ExecutorStateChanged(appId, execId, state, message, exitStatus) =>
      // 1. Find the application for appId, then the executor entry it holds for execId.
      val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId))
      execOption match {
        // 2. The executor is known.
        case Some(exec) =>
          val appInfo = idToApp(appId)
          // 2.1 Record the new state, remembering the previous one for validation.
          val oldState = exec.state
          exec.state = state

          // 2.2 A transition to RUNNING is only legal from LAUNCHING; it also resets the
          //     application's retry counter.
          if (state == ExecutorState.RUNNING) {
            assert(oldState == ExecutorState.LAUNCHING,
              s"executor $execId state transfer from $oldState to RUNNING is illegal")
            appInfo.resetRetryCount()
          }

          // 2.3 Forward the executor's current state to the driver.
          exec.application.driver.send(ExecutorUpdated(execId, state, message, exitStatus, false))

          // 2.4 The executor reached a finished state.
          if (ExecutorState.isFinished(state)) {
            // Remove this executor from the worker and the application.
            logInfo(s"Removing executor ${exec.fullId} because it is $state")
            // If the application has already finished, keep its executor list untouched so
            // its information can still be displayed.
            if (!appInfo.isFinished) {
              appInfo.removeExecutor(exec)
            }
            exec.worker.removeExecutor(exec)

            val normalExit = exitStatus == Some(0)
            // Only retry a limited number of times so we do not loop forever. The retry
            // counter is incremented only for abnormal exits (short-circuit evaluation),
            // so the order of the conditions below matters.
            if (!normalExit
              && appInfo.incrementRetryCount() >= MAX_EXECUTOR_RETRIES
              && MAX_EXECUTOR_RETRIES >= 0) { // < 0 disables this application-killing path
              val execs = appInfo.executors.values
              // Give up on the application only if none of its executors is still running.
              if (!execs.exists(_.state == ExecutorState.RUNNING)) {
                logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " +
                  s"${appInfo.retryCount} times; removing it")
                removeApplication(appInfo, ApplicationState.FAILED)
              }
            }
          }
          // 3. Re-run scheduling now that the executor's state has changed.
          schedule()
        case None =>
          logWarning(s"Got status update for unknown executor $appId/$execId")
      }

阅读全文

0 0