3.Master注册机制源码分析和状态改变机制源码分析

来源:互联网 发布:川普 知乎 编辑:程序博客网 时间:2024/06/06 04:28

Master注册机制的原理图如下。简单来说 , 注册就是将Application信息 , Driver信息和所有的Worker信息加入Master的缓存队列中



1. Application的注册在Master.scala中其实只有一段代码 : 
  1. /**
  2. * Handles an application registration request (RegisterApplication).
  3. */
  4. case RegisterApplication(description) => {
  5. // A master in STANDBY state (i.e. not the active master) deliberately does nothing for this request
  6. if (state == RecoveryState.STANDBY) {
  7. // ignore, don't send response
  8. } else {
  9. logInfo("Registering app " + description.name)
  10. // Build the application object from the received description and the sender's actor reference
  11. val app = createApplication(description, sender)
  12. // Register the application object
  13. registerApplication(app)
  14. logInfo("Registered app " + description.name + " with ID " + app.id)
  15. // Persist the application through the persistence engine
  16. persistenceEngine.addApplication(app)
  17. // Reply to the sender (not to the master) with RegisteredApplication; per the original note the sender is the ClientActor of the AppClient used by SparkDeploySchedulerBackend
  18. sender ! RegisteredApplication(app.id, masterUrl)
  19. // Kick off resource scheduling
  20. schedule()
  21. }
  22. }

创建Application代码如下 :
  1. def createApplication(desc: ApplicationDescription, driver: ActorRef): ApplicationInfo = { // Builds an ApplicationInfo for `desc`, stamped with the current time
  2. val now = System.currentTimeMillis()
  3. val date = new Date(now) // the same instant as `now`, as a Date
  4. new ApplicationInfo(now, newApplicationId(date), desc, date, driver, defaultCores) // the application id comes from newApplicationId(date)
  5. }

注册Application对象如下 :
  1. /**
  2. * Registers an application by adding it to the master's in-memory bookkeeping collections.
  3. */
  4. def registerApplication(app: ApplicationInfo): Unit = {
  5. val appAddress = app.driver.path.address
  6. if (addressToApp.contains(appAddress)) {
  7. logInfo("Attempted to re-register application at same address: " + appAddress)
  8. return
  9. }
  10. // Record the application in the bookkeeping collections (metrics source, apps, and the id/actor/address lookups)
  11. applicationMetricsSystem.registerSource(app.appSource)
  12. apps += app
  13. idToApp(app.id) = app
  14. actorToApp(app.driver) = app
  15. addressToApp(appAddress) = app
  16. // Queue the application for scheduling; the original note says waitingApps is an ArrayBuffer (its declaration is not shown here)
  17. waitingApps += app
  18. }

这其中关于ApplicationDescription的代码如下 :
  1. private[spark] class ApplicationDescription(
  2. val name: String, // application name
  3. val maxCores: Option[Int], // maximum number of CPU cores the application may use
  4. val memoryPerSlave: Int, // memory the application needs on each node
  5. val command: Command, // command
  6. var appUiUrl: String, // UI URL of the node the application runs on
  7. val eventLogDir: Option[String] = None,
  8. // short name of compression codec used when writing event logs, if any (e.g. lzf)
  9. val eventLogCodec: Option[String] = None)

2.Worker的注册原理和Application相似 , 代码如下 : 
  1. case RegisterWorker(id, workerHost, workerPort, cores, memory, workerUiPort, publicAddress) => // Handles a worker registration request
  2. {
  3. logInfo("Registering worker %s:%d with %d cores, %s RAM".format(
  4. workerHost, workerPort, cores, Utils.megabytesToString(memory)))
  5. // Check the master's state
  6. if (state == RecoveryState.STANDBY) {
  7. // ignore, don't send response
  8. } else if (idToWorker.contains(id)) {
  9. sender ! RegisterWorkerFailed("Duplicate worker ID")
  10. } else {
  11. // Wrap the worker's details (including the sender's actor reference) in a WorkerInfo object
  12. val worker = new WorkerInfo(id, workerHost, workerPort, cores, memory,
  13. sender, workerUiPort, publicAddress)
  14. // Register via registerWorker; on success persist the worker and reply to the sender with RegisteredWorker
  15. if (registerWorker(worker)) {
  16. // Persist the worker
  17. persistenceEngine.addWorker(worker)
  18. // Reply to the sender (not to the master) with the registration confirmation
  19. sender ! RegisteredWorker(masterUrl, masterWebUiUrl)
  20. // Kick off resource scheduling
  21. schedule()
  22. } else {
  23. // Registration failed: log a warning and reply to the sender with RegisterWorkerFailed
  24. val workerAddress = worker.actor.path.address
  25. logWarning("Worker registration failed. Attempted to re-register worker at same " +
  26. "address: " + workerAddress)
  27. sender ! RegisterWorkerFailed("Attempted to re-register worker at same address: "
  28. + workerAddress)
  29. }
  30. }
  31. }
注册worker的相关信息源码如下 : 
  1. /**
  2. * Registers a worker; returns true if it was added, false if a worker not in UNKNOWN state is already recorded at the same address.
  3. */
  4. def registerWorker(worker: WorkerInfo): Boolean = {
  5. // There may be one or more refs to dead workers on this same node (w/ different ID's),
  6. // remove them.
  7. // Drop workers with the same host and port that are in DEAD state from the workers collection
  8. workers.filter { w =>
  9. (w.host == worker.host && w.port == worker.port) && (w.state == WorkerState.DEAD)
  10. }.foreach { w =>
  11. workers -= w
  12. }
  13. // Address of the worker's actor
  14. val workerAddress = worker.actor.path.address
  15. // Is a worker already recorded under this address?
  16. if (addressToWorker.contains(workerAddress)) {
  17. // Look up the worker already recorded under this address in addressToWorker; call it oldWorker
  18. val oldWorker = addressToWorker(workerAddress)
  19. // An oldWorker in UNKNOWN state is removed so that the new worker can be accepted
  20. if (oldWorker.state == WorkerState.UNKNOWN) {
  21. // A worker registering from UNKNOWN implies that the worker was restarted during recovery.
  22. // The old worker must thus be dead, so we will remove it and accept the new worker.
  23. removeWorker(oldWorker)
  24. } else {
  25. logInfo("Attempted to re-register worker at same address: " + workerAddress)
  26. return false
  27. }
  28. }
  29. // Add the new worker to the workers collection (a HashSet according to the original note)
  30. workers += worker
  31. // Index the worker by its id
  32. idToWorker(worker.id) = worker
  33. // Index the worker by its address
  34. addressToWorker(workerAddress) = worker
  35. true
  36. }

3.Driver的注册源码也相似 , 代码如下 :
    
  1. /**
  2. * Handles a driver submission request (RequestSubmitDriver).
  3. */
  4. case RequestSubmitDriver(description) => {
  5. // Check the master's state: submissions are only accepted while it is ALIVE
  6. if (state != RecoveryState.ALIVE) {
  7. val msg = s"Can only accept driver submissions in ALIVE state. Current state: $state."
  8. sender ! SubmitDriverResponse(false, None, msg)
  9. } else {
  10. logInfo("Driver submitted " + description.command.mainClass)
  11. // Create the driver from the DriverDescription
  12. val driver = createDriver(description)
  13. // Persist the driver
  14. persistenceEngine.addDriver(driver)
  15. // Add the driver to the collection of drivers waiting to be scheduled
  16. waitingDrivers += driver
  17. // Add the driver to the drivers collection
  18. drivers.add(driver)
  19. // Kick off scheduling
  20. schedule()
  21. // TODO: It might be good to instead have the submission client poll the master to determine
  22. // the current status of the driver. For now it's simply "fire and forget".
  23. sender ! SubmitDriverResponse(true, Some(driver.id),
  24. s"Driver successfully submitted as ${driver.id}")
  25. }
  26. }
 关于DriverDescription.scala的部分代码如下 :
  1. private[spark] class DriverDescription(
  2. val jarUrl: String, // jar URL (the original note describes it as the name of the jar package)
  3. val mem: Int, // memory required by the driver
  4. val cores: Int, // number of CPU cores required by the driver
  5. val supervise: Boolean, // whether the driver is supervised (original note: monitored by the master)
  6. val command: Command) // the associated command
  7. extends Serializable {
  8. def copy(
  9. jarUrl: String = jarUrl,
  10. mem: Int = mem,
  11. cores: Int = cores,
  12. supervise: Boolean = supervise,
  13. command: Command = command): DriverDescription =
  14. new DriverDescription(jarUrl, mem, cores, supervise, command)
  15. override def toString: String = s"DriverDescription (${command.mainClass})"
  16. }

同时再看一下Driver状态改变的代码 : 
  1. /**
  2. * Handles a driver state change (DriverStateChanged).
  3. */
  4. case DriverStateChanged(driverId, state, exception) => {
  5. state match {
  6. // ERROR, FINISHED, KILLED and FAILED all lead to the driver being removed via removeDriver
  7. case DriverState.ERROR | DriverState.FINISHED | DriverState.KILLED | DriverState.FAILED =>
  8. removeDriver(driverId, state, exception)
  9. case _ => // any other state is treated as unexpected and raises an exception
  10. throw new Exception(s"Received unexpected state update for driver $driverId: $state")
  11. }
  12. }
 重点查看Driver移除的代码 :
  1. /**
  2. * Removes a driver from the active set and records its final state and exception.
  3. */
  4. def removeDriver(driverId: String, finalState: DriverState, exception: Option[Exception]) {
  5. // Look the driver up by id in the drivers collection and match on the result
  6. drivers.find(d => d.id == driverId) match {
  7. case Some(driver) =>
  8. logInfo(s"Removing driver: $driverId")
  9. // Remove it from the drivers collection
  10. drivers -= driver
  11. if (completedDrivers.size >= RETAINED_DRIVERS) { // keep completedDrivers bounded: at the limit, trim entries from the front
  12. val toRemove = math.max(RETAINED_DRIVERS / 10, 1)
  13. completedDrivers.trimStart(toRemove)
  14. }
  15. // Add the driver to the completed-drivers collection
  16. completedDrivers += driver
  17. // Remove the driver from the persistence engine
  18. persistenceEngine.removeDriver(driver)
  19. // Record the driver's final state and exception
  20. driver.state = finalState
  21. driver.exception = exception
  22. driver.worker.foreach(w => w.removeDriver(driver))
  23. schedule()
  24. case None =>
  25. logWarning(s"Asked to remove unknown driver: $driverId")
  26. }
  27. }

关于Executor的状态改变如下 :
  1. /**
  2. * Handles an executor state change (ExecutorStateChanged).
  3. */
  4. case ExecutorStateChanged(appId, execId, state, message, exitStatus) => {
  5. // Find the application for appId, then look the executor up in that application's executors
  6. val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId))
  7. execOption match {
  8. // Executor found
  9. case Some(exec) => {
  10. // Update the executor's state, and reset the application's retry count when the new state is RUNNING
  11. val appInfo = idToApp(appId)
  12. exec.state = state
  13. if (state == ExecutorState.RUNNING) { appInfo.resetRetryCount() }
  14. // Forward the change to the application's driver as an ExecutorUpdated message
  15. exec.application.driver ! ExecutorUpdated(execId, state, message, exitStatus)
  16. // If the executor has finished
  17. if (ExecutorState.isFinished(state)) {
  18. // Remove this executor from the worker and app
  19. logInfo(s"Removing executor ${exec.fullId} because it is $state")
  20. // Remove the executor from the application
  21. appInfo.removeExecutor(exec)
  22. // Remove the executor from its worker
  23. exec.worker.removeExecutor(exec)
  24. // Only an exit status of Some(0) counts as a normal exit
  25. val normalExit = exitStatus == Some(0)
  26. // Only retry certain number of times so we don't go into an infinite loop.
  27. if (!normalExit) {
  28. // Has the application's retry count reached the maximum? (the original note gives the default as 10; the constant is not shown here)
  29. if (appInfo.incrementRetryCount() < ApplicationState.MAX_NUM_RETRY) {
  30. // Below the maximum: schedule again
  31. schedule()
  32. } else {
  33. // Maximum reached: if no executor of the application is still RUNNING, treat the application as failed and remove it
  34. val execs = appInfo.executors.values
  35. if (!execs.exists(_.state == ExecutorState.RUNNING)) {
  36. logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " +
  37. s"${appInfo.retryCount} times; removing it")
  38. // Remove the application the executor belongs to
  39. removeApplication(appInfo, ApplicationState.FAILED)
  40. }
  41. }
  42. }
  43. }
  44. }
  45. case None =>
  46. logWarning(s"Got status update for unknown executor $appId/$execId")
  47. }
  48. }


阅读全文
0 0
原创粉丝点击