Spark2.2 Worker原理剖析图解及源码剖析
来源:互联网 发布:电脑版淘宝微淘哪里看 编辑:程序博客网 时间:2024/05/17 05:50
Worker原理剖析图解
LaunchExecutor()源码剖析
LaunchExecutor
/*
 * Handles a request to launch an executor on this worker.
 * Requests whose master URL differs from the active master are logged and ignored.
 */
case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
  if (masterUrl == activeMasterUrl) {
    // Key under which the runner is tracked in `executors`, also the relative work-dir path.
    val executorKey = appId + "/" + execId

    // Creates one "executor"-prefixed directory under every local root dir, skipping roots
    // where creation fails with an IOException. Throws if no directory could be created.
    def createAppLocalDirs() = {
      val localRootDirs = Utils.getOrCreateLocalRootDirs(conf)
      val created = localRootDirs.flatMap { root =>
        try {
          val appDir = Utils.createDirectory(root, namePrefix = "executor")
          Utils.chmod700(appDir)
          Some(appDir.getAbsolutePath())
        } catch {
          case e: IOException =>
            logWarning(s"${e.getMessage}. Ignoring this directory.")
            None
        }
      }.toSeq
      if (created.isEmpty) {
        throw new IOException("No subfolder can be created in " +
          s"${localRootDirs.mkString(",")}.")
      }
      created
    }

    try {
      logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

      // Create the executor's working directory and fail the launch if that is not possible.
      val executorDir = new File(workDir, executorKey)
      if (!executorDir.mkdirs()) {
        throw new IOException("Failed to create directory " + executorDir)
      }

      // Reuse the local dirs already recorded for this application, or create them now.
      // They are handed to the ExecutorRunner below and remembered per application.
      val appLocalDirs = appDirectories.getOrElse(appId, createAppLocalDirs())
      appDirectories(appId) = appLocalDirs

      // Build the runner that owns the executor process.
      val manager = new ExecutorRunner(
        appId,
        execId,
        appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
        cores_,
        memory_,
        self,
        workerId,
        host,
        webUi.boundPort,
        publicAddress,
        sparkHome,
        executorDir,
        workerUri,
        conf,
        appLocalDirs,
        ExecutorState.RUNNING)
      executors(executorKey) = manager

      // Start the runner, then account for the resources it occupies.
      manager.start()
      coresUsed += cores_
      memoryUsed += memory_

      // Report the runner's current state to the master.
      sendToMaster(ExecutorStateChanged(appId, execId, manager.state, None, None))
    } catch {
      case e: Exception =>
        logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
        // If the runner was already registered, kill it and drop it from the table.
        executors.get(executorKey).foreach { runner =>
          runner.kill()
          executors -= executorKey
        }
        sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
          Some(e.toString), None))
    }
  } else {
    logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
  }
ExecutorRunner的start方法
/**
 * Starts this runner: spawns a thread that runs `fetchAndRunExecutor()` and registers a
 * shutdown hook that kills the executor process.
 */
private[worker] def start(): Unit = {
  // Run the executor on its own thread.
  workerThread = new Thread("ExecutorRunner for " + fullId) {
    override def run(): Unit = {
      fetchAndRunExecutor()
    }
  }
  workerThread.start()

  shutdownHook = ShutdownHookManager.addShutdownHook { () =>
    // The hook may fire before `fetchAndRunExecutor` has been called, in which case the
    // state is still RUNNING; mark it FAILED before reporting.
    if (state == ExecutorState.RUNNING) {
      state = ExecutorState.FAILED
    }
    // Kill the executor process and notify the worker of the resulting state.
    killProcess(Some("Worker shutting down"))
  }
}
fetchAndRunExecutor
/**
 * Builds the launch command for the executor described by `appDesc`, starts it as a child
 * process, redirects its output to files in `executorDir`, and blocks until it exits.
 *
 * On exit the state becomes EXITED and the worker is notified with the exit code. If the
 * thread is interrupted the state becomes KILLED; on any other exception it becomes FAILED.
 * In both failure cases `killProcess` is invoked, which also notifies the worker.
 */
private def fetchAndRunExecutor(): Unit = {
  try {
    // Create the ProcessBuilder used to launch the process.
    val builder = CommandUtils.buildProcessBuilder(appDesc.command, new SecurityManager(conf),
      memory, sparkHome.getAbsolutePath, substituteVariables)
    val command = builder.command()
    // Quote every argument so the logged command line is unambiguous.
    val formattedCommand = command.asScala.mkString("\"", "\" \"", "\"")
    logInfo(s"Launch command: $formattedCommand")

    builder.directory(executorDir)
    builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
    builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")

    // Add the web UI log URLs; a relative proxy path is used when reverse proxying is enabled.
    val baseUrl =
      if (conf.getBoolean("spark.ui.reverseProxy", false)) {
        s"/proxy/$workerId/logPage/?appId=$appId&executorId=$execId&logType="
      } else {
        s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
      }
    builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
    builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

    // Launch the process.
    process = builder.start()
    val header = "Spark Executor Command: %s\n%s\n\n".format(
      formattedCommand, "=" * 40)

    // Redirect its stdout and stderr to files; stderr starts with the command header.
    val stdout = new File(executorDir, "stdout")
    stdoutAppender = FileAppender(process.getInputStream, stdout, conf)

    val stderr = new File(executorDir, "stderr")
    Files.write(header, stderr, StandardCharsets.UTF_8)
    stderrAppender = FileAppender(process.getErrorStream, stderr, conf)

    // Wait for it to exit; the executor may exit with code 0 (when the driver tells it to
    // shut down) or with a non-zero exit code.
    val exitCode = process.waitFor()
    state = ExecutorState.EXITED
    val message = "Command exited with code " + exitCode
    // Tell the worker that the executor's state has changed.
    worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode)))
  } catch {
    case _: InterruptedException =>
      logInfo("Runner thread for executor " + fullId + " interrupted")
      state = ExecutorState.KILLED
      killProcess(None)
    case e: Exception =>
      logError("Error running executor", e)
      state = ExecutorState.FAILED
      killProcess(Some(e.toString))
  }
}
killProcess
/**
 * Kills the executor process (if one was started) and notifies the worker of the current
 * state.
 *
 * When a process exists, both output appenders are stopped and the process is handed to
 * `Utils.terminateProcess` together with `EXECUTOR_TERMINATE_TIMEOUT_MS`. A warning is
 * logged when no exit code comes back. The worker is notified in every case; an
 * `IllegalStateException` raised while sending is logged and not propagated.
 *
 * @param message the message describing why the executor failed, if any
 */
private def killProcess(message: Option[String]): Unit = {
  var exitCode: Option[Int] = None
  if (process != null) {
    logInfo("Killing process!")
    if (stdoutAppender != null) {
      stdoutAppender.stop()
    }
    if (stderrAppender != null) {
      stderrAppender.stop()
    }
    exitCode = Utils.terminateProcess(process, EXECUTOR_TERMINATE_TIMEOUT_MS)
    if (exitCode.isEmpty) {
      logWarning("Failed to terminate process: " + process +
        ". This process will likely be orphaned.")
    }
  }
  try {
    worker.send(ExecutorStateChanged(appId, execId, state, message, exitCode))
  } catch {
    case e: IllegalStateException => logWarning(e.getMessage(), e)
  }
}
ExecutorStateChanged
// On receiving an ExecutorStateChanged message, delegate to handleExecutorStateChanged().
case stateChange @ ExecutorStateChanged(_, _, _, _, _) =>
  handleExecutorStateChanged(stateChange)
handleExecutorStateChanged
/**
 * Processes an executor state change on the worker.
 *
 * The message is always forwarded to the master. If the new state satisfies
 * `ExecutorState.isFinished`, the executor is moved from `executors` to `finishedExecutors`,
 * its cores and memory are released, and application cleanup is attempted.
 *
 * @param executorStateChanged the state-change message to process
 */
private[worker] def handleExecutorStateChanged(
    executorStateChanged: ExecutorStateChanged): Unit = {
  // Forward the update to the master first.
  sendToMaster(executorStateChanged)

  val state = executorStateChanged.state
  if (ExecutorState.isFinished(state)) {
    val appId = executorStateChanged.appId
    val fullId = appId + "/" + executorStateChanged.execId
    // Shared tail of the log line: state plus optional message and exit status.
    val outcome = " finished with state " + state +
      executorStateChanged.message.map(" message " + _).getOrElse("") +
      executorStateChanged.exitStatus.map(" exitStatus " + _).getOrElse("")

    executors.get(fullId) match {
      case None =>
        logInfo("Unknown Executor " + fullId + outcome)
      case Some(executor) =>
        logInfo("Executor " + fullId + outcome)
        // Move the runner to the finished table and release its resources.
        executors -= fullId
        finishedExecutors(fullId) = executor
        trimFinishedExecutorsIfNecessary()
        coresUsed -= executor.cores
        memoryUsed -= executor.memory
    }

    // Give the application-level cleanup a chance to run for this application.
    maybeCleanupApplication(appId)
  }
}
Master对ExecutorStateChanged消息的处理
/*
 * Handles an executor state change reported to the Master.
 */
case ExecutorStateChanged(appId, execId, state, message, exitStatus) =>
  // 1. Look up the application, then the executor through the application's executor cache.
  val located = for {
    app <- idToApp.get(appId)
    desc <- app.executors.get(execId)
  } yield (app, desc)

  located match {
    case None =>
      logWarning(s"Got status update for unknown executor $appId/$execId")

    // 2. The executor is known.
    case Some((appInfo, exec)) =>
      // 2.1 Record the new state, remembering the previous one for the check below.
      val oldState = exec.state
      exec.state = state

      // 2.2 A transition to RUNNING must come from LAUNCHING; it resets the retry counter.
      if (state == ExecutorState.RUNNING) {
        assert(oldState == ExecutorState.LAUNCHING,
          s"executor $execId state transfer from $oldState to RUNNING is illegal")
        appInfo.resetRetryCount()
      }

      // 2.3 Pass the executor's current state on to the driver.
      exec.application.driver.send(ExecutorUpdated(execId, state, message, exitStatus, false))

      // 2.4 The executor has reached a finished state.
      if (ExecutorState.isFinished(state)) {
        // Remove this executor from the worker and the application.
        logInfo(s"Removing executor ${exec.fullId} because it is $state")
        // An application that has already finished keeps its executors so its information
        // can still be displayed.
        if (!appInfo.isFinished) {
          appInfo.removeExecutor(exec)
        }
        exec.worker.removeExecutor(exec)

        val abnormalExit = exitStatus != Some(0)
        // Only retry a limited number of times so we do not loop forever. The operand order
        // matters: incrementRetryCount() is only evaluated for abnormal exits.
        val retriesExhausted =
          abnormalExit &&
            appInfo.incrementRetryCount() >= MAX_EXECUTOR_RETRIES &&
            MAX_EXECUTOR_RETRIES >= 0 // < 0 disables this application-killing path
        if (retriesExhausted) {
          val anyRunning = appInfo.executors.values.exists(_.state == ExecutorState.RUNNING)
          if (!anyRunning) {
            logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " +
              s"${appInfo.retryCount} times; removing it")
            removeApplication(appInfo, ApplicationState.FAILED)
          }
        }
      }

      // 3. Re-run scheduling.
      schedule()
  }
阅读全文
0 0
- Spark2.2 Worker原理剖析图解及源码剖析
- Spark2.2 Worker、Driver和Executor向Master注册原理剖析图解及源码
- Spark2.2 SparkContext原理剖析图及源码
- Spark2.2 Executor原理剖析及源码分析
- Spark2.2 广播变量broadcast原理及源码剖析
- Spark2.2 job触发流程原理剖析与源码分析
- Spark2.2 TaskScheduler原理剖析与源码分析
- Spark2.2 内核架构深层剖析图解
- Spark2.2 宽窄依赖剖析图解
- Spark Worker原理和源码剖析解密
- Worker原理剖析与源码分析
- Spark2.2内核剖析
- Spark2.2 DAGScheduler源码分析[stage划分算法源码剖析]
- Spark2.2 任务调度机制schedule()源码剖析
- 第32课:Spark Worker原理和源码剖析解密:Worker工作流程图、Worker启动Driver源码解密、Worker启动Executor源码解密等
- PageHelper分页插件源码及原理剖析
- PageHelper分页插件源码及原理剖析
- Spark2.2 基于Yarn的两种提交模式剖析图解
- 听说云服务器_也可以竞价了_1折起售_怎么买更划算??
- CentOS 7 python Opencv2.4.13安装记录 Install opencv2.4.13 python in centos 7
- 淘宝抢购倒计时(时、分、秒)
- c# Http Get Post Put Delete类整理
- 热修复Tinker的使用 + TinkerPatch补丁管理平台 gradle接入
- Spark2.2 Worker原理剖析图解及源码剖析
- paperweekly-迁移学习
- CORBA概述
- Spring-ApplicationContext
- 移动端H5页面截图【含 domtoimage、html2canvas 】
- Idea
- springBoot 动态数据源以及Mybatis多数据源
- 在其他数都出现偶数次的数组中找到出现奇数次的数
- 怎样理解阻塞非阻塞与同步异步的区别