Master的注册机制和状态管理
来源:互联网 发布:熟悉办公软件 编辑:程序博客网 时间:2024/05/22 06:20
master接受注册的对象主要就是:worker,driver,application
注意: executor不会注册给master,executor是注册给driver中的schedulerbackbend的
worker
启动后主动向master注册,所以如果在生产环境下加入新的worker到已经正在运行的Spark集群上,此时不需要重新启动Spark集群就能够使用新加入的worker以提升处理能力
【源码】Worker .scala
private[deploy] object Worker extends Logging { val SYSTEM_NAME = "sparkWorker" val ENDPOINT_NAME = "Worker" def main(argStrings: Array[String]) { SignalLogger.register(log) val conf = new SparkConf val args = new WorkerArguments(argStrings, conf) val rpcEnv = startRpcEnvAndEndpoint(args.host, args.port, args.webUiPort, args.cores, args.memory, args.masters, args.workDir, conf = conf) rpcEnv.awaitTermination() } def startRpcEnvAndEndpoint( host: String, port: Int, webUiPort: Int, cores: Int, memory: Int, masterUrls: Array[String], workDir: String, workerNumber: Option[Int] = None, conf: SparkConf = new SparkConf): RpcEnv = { // The LocalSparkCluster runs multiple local sparkWorkerX RPC Environments val systemName = SYSTEM_NAME + workerNumber.map(_.toString).getOrElse("") val securityMgr = new SecurityManager(conf) val rpcEnv = RpcEnv.create(systemName, host, port, conf, securityMgr) val masterAddresses = masterUrls.map(RpcAddress.fromSparkURL(_)) rpcEnv.setupEndpoint(ENDPOINT_NAME, new Worker(rpcEnv, webUiPort, cores, memory, masterAddresses, systemName, ENDPOINT_NAME, workDir, conf, securityMgr)) rpcEnv } ...}override def onStart() { assert(!registered) logInfo("Starting Spark worker %s:%d with %d cores, %s RAM".format( host, port, cores, Utils.megabytesToString(memory))) logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}") logInfo("Spark home: " + sparkHome) createWorkDir() shuffleService.startIfEnabled() webUi = new WorkerWebUI(this, workDir, webUiPort) webUi.bind() registerWithMaster() metricsSystem.registerSource(workerSource) metricsSystem.start() // Attach the worker metrics servlet handler to the web ui after the metrics system is started. metricsSystem.getServletHandlers.foreach(webUi.attachHandler) }private def registerWithMaster() { // onDisconnected may be triggered multiple times, so don't attempt registration // if there are outstanding registration attempts scheduled. registrationRetryTimer match { case None => registered = false registerMasterFutures = tryRegisterAllMasters() connectionAttemptCount = 0 registrationRetryTimer = Some(forwordMessageScheduler.scheduleAtFixedRate( new Runnable { override def run(): Unit = Utils.tryLogNonFatalError { Option(self).foreach(_.send(ReregisterWithMaster)) } }, INITIAL_REGISTRATION_RETRY_INTERVAL_SECONDS, INITIAL_REGISTRATION_RETRY_INTERVAL_SECONDS, TimeUnit.SECONDS)) case Some(_) => logInfo("Not spawning another attempt to register with the master, since there is an" + " attempt scheduled already.") } }private def tryRegisterAllMasters(): Array[JFuture[_]] = { masterRpcAddresses.map { masterAddress => registerMasterThreadPool.submit(new Runnable { override def run(): Unit = { try { logInfo("Connecting to master " + masterAddress + "...") val masterEndpoint = rpcEnv.setupEndpointRef(Master.SYSTEM_NAME, masterAddress, Master.ENDPOINT_NAME) registerWithMaster(masterEndpoint) } catch { case ie: InterruptedException => // Cancelled case NonFatal(e) => logWarning(s"Failed to connect to master $masterAddress", e) } } }) } }private def registerWithMaster(masterEndpoint: RpcEndpointRef): Unit = { masterEndpoint.ask[RegisterWorkerResponse](RegisterWorker( workerId, host, port, self, cores, memory, webUi.boundPort, publicAddress)) .onComplete { // This is a very fast action so we can use "ThreadUtils.sameThread" case Success(msg) => Utils.tryLogNonFatalError { handleRegisterResponse(msg) } case Failure(e) => logError(s"Cannot register with master: ${masterEndpoint.address}", e) System.exit(1) }(ThreadUtils.sameThread) } private def handleRegisterResponse(msg: RegisterWorkerResponse): Unit = synchronized { msg match { case RegisteredWorker(masterRef, masterWebUiUrl) => logInfo("Successfully registered with master " + masterRef.address.toSparkURL) registered = true changeMaster(masterRef, masterWebUiUrl) forwordMessageScheduler.scheduleAtFixedRate(new Runnable { override def run(): Unit = Utils.tryLogNonFatalError { self.send(SendHeartbeat) } }, 0, HEARTBEAT_MILLIS, TimeUnit.MILLISECONDS) if (CLEANUP_ENABLED) { logInfo( s"Worker cleanup enabled; old application directories will be deleted in: $workDir") forwordMessageScheduler.scheduleAtFixedRate(new Runnable { override def run(): Unit = Utils.tryLogNonFatalError { self.send(WorkDirCleanup) } }, CLEANUP_INTERVAL_MILLIS, CLEANUP_INTERVAL_MILLIS, TimeUnit.MILLISECONDS) } case RegisterWorkerFailed(message) => if (!registered) { logError("Worker registration failed: " + message) System.exit(1) } case MasterInStandby => // Ignore. Master not yet ready. } }
【源码】Master.scala
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case RegisterWorker( id, workerHost, workerPort, workerRef, cores, memory, workerUiPort, publicAddress) => { logInfo("Registering worker %s:%d with %d cores, %s RAM".format( workerHost, workerPort, cores, Utils.megabytesToString(memory))) if (state == RecoveryState.STANDBY) { context.reply(MasterInStandby) } else if (idToWorker.contains(id)) { context.reply(RegisterWorkerFailed("Duplicate worker ID")) } else { val worker = new WorkerInfo(id, workerHost, workerPort, cores, memory, workerRef, workerUiPort, publicAddress) if (registerWorker(worker)) { persistenceEngine.addWorker(worker) context.reply(RegisteredWorker(self, masterWebUiUrl)) schedule() } else { val workerAddress = worker.endpoint.address logWarning("Worker registration failed. Attempted to re-register worker at same " + "address: " + workerAddress) context.reply(RegisterWorkerFailed("Attempted to re-register worker at same address: " + workerAddress)) } } } ... }...private def registerWorker(worker: WorkerInfo): Boolean = { // There may be one or more refs to dead workers on this same node (w/ different ID's), // remove them. workers.filter { w => (w.host == worker.host && w.port == worker.port) && (w.state == WorkerState.DEAD) }.foreach { w => workers -= w } val workerAddress = worker.endpoint.address if (addressToWorker.contains(workerAddress)) { val oldWorker = addressToWorker(workerAddress) if (oldWorker.state == WorkerState.UNKNOWN) { // A worker registering from UNKNOWN implies that the worker was restarted during recovery. // The old worker must thus be dead, so we will remove it and accept the new worker. removeWorker(oldWorker) } else { logInfo("Attempted to re-register worker at same address: " + workerAddress) return false } } workers += worker idToWorker(worker.id) = worker addressToWorker(workerAddress) = worker true }
master在接收到worker注册的请求后,首先会判断一下当前的master是否是standby的模式,如果是的话就不处理;然后会判断当前master的内存数据结构idToWorker中是否已经有该worker的注册信息,如果有的话此时不会重复注册,创建workerInfo对象来保存注册的worker信息,然后调用registerWorker来执行具体的注册过程,如果worker的状态是否是dead的状态则直接过滤掉,对于unknown装的内容调用removeWorker进行清理(包括清理worker下的executors和driver)
Master对Driver和Executor状态变化的处理
- 对Driver状态变化的处理,remove掉
- Executor挂掉的时候,系统会尝试一定次数的重启(最多重复10次)。如果还不行,就remove掉。
【源码】Master.scala
override def receive: PartialFunction[Any, Unit] = { ... case ExecutorStateChanged(appId, execId, state, message, exitStatus) => { val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId)) execOption match { case Some(exec) => { val appInfo = idToApp(appId) val oldState = exec.state exec.state = state if (state == ExecutorState.RUNNING) { assert(oldState == ExecutorState.LAUNCHING, s"executor $execId state transfer from $oldState to RUNNING is illegal") appInfo.resetRetryCount() } exec.application.driver.send(ExecutorUpdated(execId, state, message, exitStatus)) if (ExecutorState.isFinished(state)) { // Remove this executor from the worker and app logInfo(s"Removing executor ${exec.fullId} because it is $state") // If an application has already finished, preserve its // state to display its information properly on the UI if (!appInfo.isFinished) { appInfo.removeExecutor(exec) } exec.worker.removeExecutor(exec) val normalExit = exitStatus == Some(0) // Only retry certain number of times so we don't go into an infinite loop. if (!normalExit) { //重试10次 if (appInfo.incrementRetryCount() < ApplicationState.MAX_NUM_RETRY) { schedule() } else { val execs = appInfo.executors.values if (!execs.exists(_.state == ExecutorState.RUNNING)) { logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " + s"${appInfo.retryCount} times; removing it") removeApplication(appInfo, ApplicationState.FAILED) } } } } } case None => logWarning(s"Got status update for unknown executor $appId/$execId") } } case DriverStateChanged(driverId, state, exception) => { state match { case DriverState.ERROR | DriverState.FINISHED | DriverState.KILLED | DriverState.FAILED => removeDriver(driverId, state, exception) case _ => throw new Exception(s"Received unexpected state update for driver $driverId: $state") } } ...}
。。。完善中。。。
0 0
- master的注册机制和状态管理
- Master的注册机制和状态管理
- day30:Master的注册机制和状态管理解密
- Master的注册机制和状态管理解密
- Master的注册机制和状态管理详解
- 第30课 Master的注册机制和状态管理解密
- 第30课:Master的注册机制和状态管理解密 课堂笔记
- 大数据IMF传奇行动绝密课程第30课:Master的注册机制和状态管理解密
- Spark系列(六)Master注册机制和状态改变机制
- 0003.spark2.0源码分析(3)--master注册机制与状态管理
- 3.Master注册机制源码分析和状态改变机制源码分析
- Spark Master的注册机制
- spark master注册机制和主备切换源码
- Spark的Master分析2(Master注册机制原理分析)
- Spark的Master分析3(Master状态改变机制分析)
- UITableView的重用和注册机制
- platform设备和驱动的注册机制
- 域名管理与注册机制
- 【洛谷P2907】 【USACO08OPEN】农场周围的道路 水模拟分治
- TensorFlow学习笔记--1.0 版本的一个小坑记录
- POJ3669 Meteor Shower(bfs)
- ubuntu16.04下安装MySQL-pthon,遇到:EnvironmentError: mysql_config not found问题
- 动态加载apk的类
- Master的注册机制和状态管理
- 黄旗山
- 回调机制的一些理解
- STM32F030用IAR+JLINK在线调试无法下载FLASH问题
- Spring Web MVC框架(八) 配置Spring Web MVC
- PAT BASIC 1017 A除以B
- JDBC入门到放弃
- [经济]股票的基本知识和部分专业术语的解释
- 判断apk的安装