Spark源码分析-worker

来源:互联网 编辑:程序博客网 时间:2024/05/16 06:23

Worker接收LaunchDriver()的源码

case LaunchDriver(driverId, driverDesc) => {      logInfo(s"Asked to launch driver $driverId")      //1.新建了一个DrievrRunner实例对象      val driver = new DriverRunner(        conf,        driverId,        workDir,        sparkHome,        driverDesc.copy(command = Worker.maybeUpdateSSLSettings(driverDesc.command, conf)),        self,        akkaUrl)      //2.将driver加入内存      drivers(driverId) = driver      //3. 启动driver      driver.start()      //4.将worker内存,cpu使用情况进行更新      coresUsed += driverDesc.cores      memoryUsed += driverDesc.mem    }

start()源码

def start() = {    //3.1创建了一个新线程    new Thread("DriverRunner for " + driverId) {      override def run() {        try {          //3.2.创建一个工作目录          val driverDir = createWorkingDirectory()          //3.3下载用户上传的的Jar文件          val localJarFilename = downloadUserJar(driverDir)          //          def substituteVariables(argument: String): String = argument match {            case "{{WORKER_URL}}" => workerUrl            case "{{USER_JAR}}" => localJarFilename            case other => other          }          // TODO: If we add ability to submit multiple jars they should also be added here          //3.4 构建ProcessBuilder,传入了driver的启动命令,需要的内存大小等信息          val builder = CommandUtils.buildProcessBuilder(driverDesc.command, driverDesc.mem,            sparkHome.getAbsolutePath, substituteVariables)          //3.5启动driver          launchDriver(builder, driverDir, driverDesc.supervise)        }        catch {          case e: Exception => finalException = Some(e)        }        //3.6 判断driver退出的状态        val state =          if (killed) {            DriverState.KILLED          } else if (finalException.isDefined) {            DriverState.ERROR          } else {            finalExitCode match {              case Some(0) => DriverState.FINISHED              case _ => DriverState.FAILED            }          }        finalState = Some(state)        //3.7将driver状态改变信息发送给worker        worker ! DriverStateChanged(driverId, state, finalException)      }    }.start()  }

createWorkingDirectory()源码

private def createWorkingDirectory(): File = {    //3.2.1创建了一个本地路径    val driverDir = new File(workDir, driverId)    //如果路径不存在或者路径创建失败,抛出异常    if (!driverDir.exists() && !driverDir.mkdirs()) {      throw new IOException("Failed to create directory " + driverDir)    }    driverDir  }

downloadUserJar()源码

private def downloadUserJar(driverDir: File): String = {    //3.3.1 用hadoop的Path得到jar文件的路径    val jarPath = new Path(driverDesc.jarUrl)    //3.3.2 得到hadoop的conf配置    val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)    //3.3.3 得到了HDFS的FileSystem    val jarFileSystem = jarPath.getFileSystem(hadoopConf)    //3.3.4 创建本地目录    val destPath = new File(driverDir.getAbsolutePath, jarPath.getName)    val jarFileName = jarPath.getName    val localJarFile = new File(driverDir, jarFileName)    val localJarFilename = localJarFile.getAbsolutePath    //3.3.5 判断本地文件是否存在,如果不存在,用FileUtil将HDFS上的jar拷贝到本地    if (!localJarFile.exists()) { // May already exist if running multiple workers on one node      logInfo(s"Copying user jar $jarPath to $destPath")      FileUtil.copy(jarFileSystem, jarPath, destPath, false, hadoopConf)    }    //3.3.6 再次进行判断,用来确认,如果还不存在,说明copy失败,抛出异常。    if (!localJarFile.exists()) { // Verify copy succeeded      throw new Exception(s"Did not see expected jar $jarFileName in $driverDir")    }    localJarFilename  }

launchDriver()源码

private def launchDriver(builder: ProcessBuilder, baseDir: File, supervise: Boolean) {    builder.directory(baseDir)    def initialize(process: Process) = {      // Redirect stdout and stderr to files      //3.5.1 重定向stdout和stderr到文件中      val stdout = new File(baseDir, "stdout")      CommandUtils.redirectStream(process.getInputStream, stdout)      val stderr = new File(baseDir, "stderr")      val header = "Launch Command: %s\n%s\n\n".format(        builder.command.mkString("\"", "\" \"", "\""), "=" * 40)      Files.append(header, stderr, UTF_8)      CommandUtils.redirectStream(process.getErrorStream, stderr)    }    // 3.5.2 通过ProcessBuilder启动driver进程    runCommandWithRetry(ProcessBuilderLike(builder), initialize, supervise)  }

runCommandWithRetry()源码

private[deploy] def runCommandWithRetry(command: ProcessBuilderLike, initialize: Process => Unit,    supervise: Boolean) {    // Time to wait between submission retries.    var waitSeconds = 1    // A run of this many seconds resets the exponential back-off.    val successfulRunDuration = 5    var keepTrying = !killed    while (keepTrying) {      logInfo("Launch Command: " + command.command.mkString("\"", "\" \"", "\""))      synchronized {        if (killed) { return }        process = Some(command.start())        initialize(process.get)      }      val processStart = clock.getTimeMillis()      //启动driver进程      val exitCode = process.get.waitFor()      if (clock.getTimeMillis() - processStart > successfulRunDuration * 1000) {        waitSeconds = 1      }      if (supervise && exitCode != 0 && !killed) {        logInfo(s"Command exited with status $exitCode, re-launching after $waitSeconds s.")        sleeper.sleep(waitSeconds)        waitSeconds = waitSeconds * 2 // exponential back-off      }      keepTrying = supervise && exitCode != 0 && !killed      finalExitCode = Some(exitCode)    }  }}

Worker接收DriverStateChanged()源码

case DriverStateChanged(driverId, state, exception) => {      // 1.先对driver的结束状态进行判断      state match {        case DriverState.ERROR =>          logWarning(s"Driver $driverId failed with unrecoverable exception: ${exception.get}")        case DriverState.FAILED =>          logWarning(s"Driver $driverId exited with failure")        case DriverState.FINISHED =>          logInfo(s"Driver $driverId exited successfully")        case DriverState.KILLED =>          logInfo(s"Driver $driverId was killed by user")        case _ =>          logDebug(s"Driver $driverId changed state to $state")      }      // 2.再向master发送DriverStateChanged信息      master ! DriverStateChanged(driverId, state, exception)      // 3.将driver从内存中移除      val driver = drivers.remove(driverId).get      finishedDrivers(driverId) = driver      // 4.更新内存信息      memoryUsed -= driver.driverDesc.mem      coresUsed -= driver.driverDesc.cores    }

LaunchExecutor()源码

// Handler for the LaunchExecutor message sent by the master.
case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
  // Ignore requests that do not come from the currently active master.
  if (masterUrl != activeMasterUrl) {
    logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
  } else {
    try {
      logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

      // Create the executor's working directory
      val executorDir = new File(workDir, appId + "/" + execId)
      // Fail the launch if the directory cannot be created.
      if (!executorDir.mkdirs()) {
        throw new IOException("Failed to create directory " + executorDir)
      }

      // Create local dirs for the executor. These are passed to the executor via the
      // SPARK_LOCAL_DIRS environment variable, and deleted by the Worker when the
      // application finishes.
      val appLocalDirs = appDirectories.get(appId).getOrElse {
        Utils.getOrCreateLocalRootDirs(conf).map { dir =>
          Utils.createDirectory(dir).getAbsolutePath()
        }.toSeq
      }
      appDirectories(appId) = appLocalDirs

      // Build an ExecutorRunner that will fetch and run the executor process.
      val manager = new ExecutorRunner(
        appId,
        execId,
        appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
        cores_,
        memory_,
        self,
        workerId,
        host,
        webUi.boundPort,
        publicAddress,
        sparkHome,
        executorDir,
        akkaUrl,
        conf,
        appLocalDirs, ExecutorState.LOADING)

      // Cache the runner, start it and account for its resources.
      executors(appId + "/" + execId) = manager
      manager.start()
      coresUsed += cores_
      memoryUsed += memory_
      master ! ExecutorStateChanged(appId, execId, manager.state, None, None)
    } catch {
      case e: Exception => {
        logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
        // Tear down a partially-registered executor before reporting failure.
        if (executors.contains(appId + "/" + execId)) {
          executors(appId + "/" + execId).kill()
          executors -= appId + "/" + execId
        }
        master ! ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
          Some(e.toString), None)
      }
    }
  }

manager.start()中start()方法源码

def start() {    //创建一个线程    workerThread = new Thread("ExecutorRunner for " + fullId) {      override def run() { fetchAndRunExecutor() }    }    workerThread.start()    // Shutdown hook that kills actors on shutdown.    shutdownHook = new Thread() {      override def run() {        killProcess(Some("Worker shutting down"))      }    }    Runtime.getRuntime.addShutdownHook(shutdownHook)  }

fetchAndRunExecutor()源码

/**
 * Build and start the executor process, redirect its output to log files,
 * wait for it to exit and report the terminal state to the worker.
 * Runs on `workerThread`.
 */
def fetchAndRunExecutor() {
  try {
    // Launch the process
    val builder = CommandUtils.buildProcessBuilder(appDesc.command, memory,
      sparkHome.getAbsolutePath, substituteVariables)
    val command = builder.command()
    logInfo("Launch command: " + command.mkString("\"", "\" \"", "\""))

    // Run out of the executor's working dir and hand it its local dirs.
    builder.directory(executorDir)
    builder.environment.put("SPARK_LOCAL_DIRS", appLocalDirs.mkString(","))
    // In case we are running this from within the Spark Shell, avoid creating a "scala"
    // parent process for the executor command
    builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")

    // Add webUI log urls
    val baseUrl =
      s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
    builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
    builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

    // Start the executor process.
    process = builder.start()
    val header = "Spark Executor Command: %s\n%s\n\n".format(
      command.mkString("\"", "\" \"", "\""), "=" * 40)

    // Redirect its stdout and stderr to files
    val stdout = new File(executorDir, "stdout")
    stdoutAppender = FileAppender(process.getInputStream, stdout, conf)

    val stderr = new File(executorDir, "stderr")
    Files.write(header, stderr, UTF_8)
    stderrAppender = FileAppender(process.getErrorStream, stderr, conf)

    // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
    // or with nonzero exit code
    val exitCode = process.waitFor()
    state = ExecutorState.EXITED
    val message = "Command exited with code " + exitCode
    // Report the terminal state back to the worker.
    worker ! ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode))
  } catch {
    case interrupted: InterruptedException => {
      logInfo("Runner thread for executor " + fullId + " interrupted")
      state = ExecutorState.KILLED
      killProcess(None)
    }
    case e: Exception => {
      logError("Error running executor", e)
      state = ExecutorState.FAILED
      killProcess(Some(e.toString))
    }
  }
}

Worker接收到ExecutorStateChanged()的源码

case ExecutorStateChanged(appId, execId, state, message, exitStatus) =>      //向master发送ExecutorStateChanged的消息      master ! ExecutorStateChanged(appId, execId, state, message, exitStatus)      val fullId = appId + "/" + execId      //判断executor的状态是否是finished      if (ExecutorState.isFinished(state)) {        executors.get(fullId) match {          case Some(executor) =>            logInfo("Executor " + fullId + " finished with state " + state +              message.map(" message " + _).getOrElse("") +              exitStatus.map(" exitStatus " + _).getOrElse(""))            //将executor从内存缓存中移除            executors -= fullId            finishedExecutors(fullId) = executor            //释放executor占用的内存和cpu资源            coresUsed -= executor.cores            memoryUsed -= executor.memory          case None =>            logInfo("Unknown Executor " + fullId + " finished with state " + state +              message.map(" message " + _).getOrElse("") +              exitStatus.map(" exitStatus " + _).getOrElse(""))        }        maybeCleanupApplication(appId)      }
原创粉丝点击