2.Master主备机制切换源码分析

来源:互联网 发布:林忆莲 野风 知乎 编辑:程序博客网 时间:2024/05/19 20:38

先看下原理图:


从Master的 completeRecovery方法开始分析 , 代码如下:
  1. /**
  2. * Finishes master recovery: cleans up workers/apps still in UNKNOWN state, relaunches or removes drivers that have no worker, then marks the master ALIVE and reschedules.
  3. */
  4. def completeRecovery() {
  5. // Ensure "only-once" recovery semantics using a short synchronization period.
  6. synchronized {
  7. if (state != RecoveryState.RECOVERING) { return }
  8. state = RecoveryState.COMPLETING_RECOVERY
  9. }
  10. // Kill off any workers and apps that didn't respond to us.
  11. // Workers and applications whose state is still UNKNOWN are filtered out, and each one is
  12. // handed to removeWorker / finishApplication respectively for cleanup.
  13. // Those routines drop the entry from the master's in-memory maps, update the related executors/drivers, and remove it from the persistence engine.
  14. workers.filter(_.state == WorkerState.UNKNOWN).foreach(removeWorker)
  15. apps.filter(_.state == ApplicationState.UNKNOWN).foreach(finishApplication)
  16. // Reschedule drivers which were not claimed by any workers
  17. // A driver with no worker is relaunched if its description has supervise set; otherwise it is removed with state ERROR.
  18. drivers.filter(_.worker.isEmpty).foreach { d =>
  19. logWarning(s"Driver ${d.id} was not found after master recovery")
  20. if (d.desc.supervise) {
  21. logWarning(s"Re-launching ${d.id}")
  22. relaunchDriver(d)
  23. } else {
  24. removeDriver(d.id, DriverState.ERROR, None)
  25. logWarning(s"Did not re-launch ${d.id} because it was not supervised")
  26. }
  27. }
  28. // Switch the master's state to ALIVE.
  29. state = RecoveryState.ALIVE
  30. // Trigger scheduling again via schedule() now that the state is ALIVE.
  31. schedule()
  32. logInfo("Recovery complete - resuming operations!")
  33. }

然后是对worker信息移除 , 代码如下:
  1. /**
  2. * Removes a worker from the master's bookkeeping; completeRecovery calls this for workers still in UNKNOWN state.
  3. */
  4. def removeWorker(worker: WorkerInfo) {
  5. logInfo("Removing worker " + worker.id + " on " + worker.host + ":" + worker.port)
  6. // Mark the worker as DEAD.
  7. worker.setState(WorkerState.DEAD)
  8. // Drop the worker from idToWorker, which is keyed by worker id (NOTE(review): presumably a HashMap of all known workers — confirm).
  9. idToWorker -= worker.id
  10. // Likewise drop it from addressToWorker, which is keyed by the address of the worker's actor.
  11. addressToWorker -= worker.actor.path.address
  12. // For each executor on this worker: tell the owning application's driver the executor is LOST, then remove it from that application.
  13. for (exec <- worker.executors.values) {
  14. logInfo("Telling app of lost executor: " + exec.id)
  15. exec.application.driver ! ExecutorUpdated(
  16. exec.id, ExecutorState.LOST, Some("worker lost"), None)
  17. exec.application.removeExecutor(exec)
  18. }
  19. // For each driver on this worker: relaunch it if its description has supervise set, otherwise remove it with state ERROR.
  20. for (driver <- worker.drivers.values) {
  21. if (driver.desc.supervise) {
  22. logInfo(s"Re-launching ${driver.id}")
  23. relaunchDriver(driver)
  24. } else {
  25. logInfo(s"Not re-launching ${driver.id} because it was not supervised")
  26. removeDriver(driver.id, DriverState.ERROR, None)
  27. }
  28. }
  29. // Finally, remove the worker from the persistence engine.
  30. persistenceEngine.removeWorker(worker)
  31. }

接着是Application信息移除 , 代码如下:
  1. /**
  2. * Finishes an application by removing it with final state FINISHED; completeRecovery calls this for apps still in UNKNOWN state.
  3. */
  4. def finishApplication(app: ApplicationInfo) {
  5. // Single statement: delegate to removeApplication with ApplicationState.FINISHED as the final state.
  6. removeApplication(app, ApplicationState.FINISHED)
  7. }
  8. /**
  9. * Removes an application from the master's bookkeeping, records it in completedApps, kills its executors and marks it with the given final state.
  10. */
  11. def removeApplication(app: ApplicationInfo, state: ApplicationState.Value) {
  12. // Do nothing unless the app is still tracked in apps.
  13. if (apps.contains(app)) {
  14. logInfo("Removing app " + app.id)
  15. // Remove the app from the master's lookup structures (by app, id, driver actor and driver address).
  16. apps -= app
  17. idToApp -= app.id
  18. actorToApp -= app.driver
  19. addressToApp -= app.driver.path.address
  20. if (completedApps.size >= RETAINED_APPLICATIONS) { // history is full: evict from the front of completedApps
  21. val toRemove = math.max(RETAINED_APPLICATIONS / 10, 1)
  22. completedApps.take(toRemove).foreach( a => {
  23. appIdToUI.remove(a.id).foreach { ui => webUi.detachSparkUI(ui) }
  24. applicationMetricsSystem.removeSource(a.appSource)
  25. })
  26. completedApps.trimStart(toRemove)
  27. }
  28. completedApps += app // Remember it in our history
  29. waitingApps -= app
  30. // If application events are logged, use them to rebuild the UI
  31. rebuildSparkUI(app)
  32. // For each executor of the app: detach it from its worker, send KillExecutor to that worker's actor, and mark it KILLED.
  33. for (exec <- app.executors.values) {
  34. exec.worker.removeExecutor(exec)
  35. exec.worker.actor ! KillExecutor(masterUrl, exec.application.id, exec.id)
  36. exec.state = ExecutorState.KILLED
  37. }
  38. // Record the final state on the app; the driver is sent ApplicationRemoved only when that state is not FINISHED.
  39. app.markFinished(state)
  40. if (state != ApplicationState.FINISHED) {
  41. app.driver ! ApplicationRemoved(state.toString)
  42. }
  43. // Remove the app from the persistence engine.
  44. persistenceEngine.removeApplication(app)
  45. // Trigger scheduling again via schedule().
  46. schedule()
  47. // Tell all workers that the application has finished, so they can clean up any app state.
  48. // Each worker's actor is sent ApplicationFinished(app.id).
  49. workers.foreach { w =>
  50. w.actor ! ApplicationFinished(app.id)
  51. }
  52. }
  53. }

上面两段代码中用到了WorkerInfo和ApplicationInfo , 需要详细了解一下这两个类的定义 , 源码如下:
  1. private[spark] class WorkerInfo(
  2. val id: String, // key under which the master tracks this worker in idToWorker (see removeWorker)
  3. val host: String, // logged together with port when the worker is removed
  4. val port: Int,
  5. val cores: Int,
  6. val memory: Int, // NOTE(review): unit is not visible here — presumably MB; confirm
  7. val actor: ActorRef, // actor the master sends messages to; its path address is the key in addressToWorker
  8. val webUiPort: Int,
  9. val publicAddress: String)
  1. private[spark] class ApplicationInfo(
  2. val startTime: Long,
  3. val id: String, // key under which the master tracks this app in idToApp (see removeApplication)
  4. val desc: ApplicationDescription,
  5. val submitDate: Date,
  6. val driver: ActorRef, // the application's driver actor; the master sends it ExecutorUpdated / ApplicationRemoved
  7. defaultCores: Int)

最后就是Driver信息的移除和被监控(supervise)的Driver重新启动的代码 , 其实在上面的第一段代码completeRecovery中已经贴出:
  1. // Reschedule drivers which were not claimed by any workers
  2. // A driver with no worker is relaunched if its description has supervise set; otherwise it is removed with state ERROR.
  3. drivers.filter(_.worker.isEmpty).foreach { d =>
  4. logWarning(s"Driver ${d.id} was not found after master recovery")
  5. if (d.desc.supervise) {
  6. logWarning(s"Re-launching ${d.id}")
  7. relaunchDriver(d)
  8. } else {
  9. removeDriver(d.id, DriverState.ERROR, None)
  10. logWarning(s"Did not re-launch ${d.id} because it was not supervised")
  11. }
  12. }
  13. // Switch the master's state to ALIVE.
  14. state = RecoveryState.ALIVE
  15. // Trigger scheduling again via schedule() now that the state is ALIVE.
  16. schedule()
  17. logInfo("Recovery complete - resuming operations!")



阅读全文
0 0