spark 1.6.0 core源码分析2 master启动流程
来源:互联网 发布:利用淘宝店做淘宝客 编辑:程序博客网 时间:2024/05/17 03:38
源码位置:org.apache.spark.deploy.master.Master.scala
def main(argStrings: Array[String]) { SignalLogger.register(log) val conf = new SparkConf val args = new MasterArguments(argStrings, conf) val (rpcEnv, _, _) = startRpcEnvAndEndpoint(args.host, args.port, args.webUiPort, conf) rpcEnv.awaitTermination() } /** * Start the Master and return a three tuple of: * (1) The Master RpcEnv * (2) The web UI bound port * (3) The REST server bound port, if any */ def startRpcEnvAndEndpoint( host: String, port: Int, webUiPort: Int, conf: SparkConf): (RpcEnv, Int, Option[Int]) = { val securityMgr = new SecurityManager(conf) val rpcEnv =<strong> RpcEnv.create</strong>(SYSTEM_NAME, host, port, conf, securityMgr) val masterEndpoint = rpcEnv.setupEndpoint(ENDPOINT_NAME, <strong>new Master(rpcEnv, rpcEnv.address, webUiPort, securityMgr, conf)</strong>)//启动8080端口服务 val portsResponse = masterEndpoint.askWithRetry[BoundPortsResponse](BoundPortsRequest) (rpcEnv, portsResponse.webUIPort, portsResponse.restPort) }
private def getRpcEnvFactory(conf: SparkConf): RpcEnvFactory = { val rpcEnvNames = Map( "akka" -> "org.apache.spark.rpc.akka.AkkaRpcEnvFactory", "netty" -> "org.apache.spark.rpc.netty.NettyRpcEnvFactory") val rpcEnvName = conf.get("spark.rpc", "netty") val rpcEnvFactoryClassName = rpcEnvNames.getOrElse(rpcEnvName.toLowerCase, rpcEnvName) Utils.classForName(rpcEnvFactoryClassName).newInstance().asInstanceOf[RpcEnvFactory] } def create( name: String, host: String, port: Int, conf: SparkConf, securityManager: SecurityManager, clientMode: Boolean = false): RpcEnv = { // Using Reflection to create the RpcEnv to avoid to depend on Akka directly val config = RpcEnvConfig(conf, name, host, port, securityManager, clientMode) getRpcEnvFactory(conf).create(config) }
解析spark相关的环境变量及方法参数,创建RpcEnvFactory用于与其它节点的交互,以前版本是akka,spark 1.6.0使用netty,可通过spark-default.xml 配置
spark.rpc 设置使用akka还是netty,启动一个netty NettyRpcEndpointRef
private[netty] class NettyRpcEnvFactory extends RpcEnvFactory with Logging { def create(config: RpcEnvConfig): RpcEnv = { val sparkConf = config.conf // Use JavaSerializerInstance in multiple threads is safe. However, if we plan to support // KryoSerializer in future, we have to use ThreadLocal to store SerializerInstance val javaSerializerInstance = new JavaSerializer(sparkConf).newInstance().asInstanceOf[JavaSerializerInstance] val nettyEnv = new NettyRpcEnv(sparkConf, javaSerializerInstance, config.host, config.securityManager) if (!config.clientMode) { val startNettyRpcEnv: Int => (NettyRpcEnv, Int) = { actualPort => <strong>nettyEnv.startServer(actualPort) //启动netty服务 7077</strong> (nettyEnv, nettyEnv.address.port) } try { Utils.startServiceOnPort(config.port, startNettyRpcEnv, sparkConf, config.name)._1 } catch { case NonFatal(e) => nettyEnv.shutdown() throw e } } nettyEnv }}//start netty 7077端口服务
def startServer(port: Int): Unit = { val bootstraps: java.util.List[TransportServerBootstrap] = if (securityManager.isAuthenticationEnabled()) { java.util.Arrays.asList(new SaslServerBootstrap(transportConf, securityManager)) } else { java.util.Collections.emptyList() } server = transportContext.createServer(host, port, bootstraps) dispatcher.registerRpcEndpoint( RpcEndpointVerifier.NAME, new RpcEndpointVerifier(this, dispatcher)) }
既然创建了NettyRpcEnvFactory,就执行了master的onStart方法。
override def onStart(): Unit = { logInfo("Starting Spark master at " + masterUrl) logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}") webUi = new MasterWebUI(this, webUiPort) webUi.bind() masterWebUiUrl = "http://" + masterPublicAddress + ":" + webUi.boundPort //这里会启一个定时调度,检查timeout的worker进程。如果有worker超时,则将状态置为DEAD,并清理一些内存中关于该worker的信息。如果该worker中有Executor进程,则向driver发送ExecutorUpdated消息,表明该Executor也已经不可用了。如果该worker中有Driver进程,且配置driver是可以relaunch的,则重新调度在可用的worker节点上启动,不然的话就删除该Driver的内存信息。只有在该worker超时很多次之后,才真正删除,之前其实只是让该worker不被选中执行任务而已。 checkForWorkerTimeOutTask = forwardMessageThread.scheduleAtFixedRate(new Runnable { override def run(): Unit = Utils.tryLogNonFatalError { self.send(CheckForWorkerTimeOut) } }, 0, WORKER_TIMEOUT_MS, TimeUnit.MILLISECONDS) if (restServerEnabled) { val port = conf.getInt("spark.master.rest.port", 6066) restServer = Some(new StandaloneRestServer(address.host, port, conf, self, masterUrl)) } restServerBoundPort = restServer.map(_.start()) masterMetricsSystem.registerSource(masterSource) masterMetricsSystem.start() applicationMetricsSystem.start() // Attach the master and app metrics servlet handler to the web ui after the metrics systems are // started. masterMetricsSystem.getServletHandlers.foreach(webUi.attachHandler) applicationMetricsSystem.getServletHandlers.foreach(webUi.attachHandler) val serializer = new JavaSerializer(conf) //master ha 过程 val (persistenceEngine_, leaderElectionAgent_) = RECOVERY_MODE match { case "ZOOKEEPER" => logInfo("Persisting recovery state to ZooKeeper") val zkFactory = new ZooKeeperRecoveryModeFactory(conf, serializer) (zkFactory.createPersistenceEngine(), zkFactory.createLeaderElectionAgent(this)) case "FILESYSTEM" => val fsFactory = new FileSystemRecoveryModeFactory(conf, serializer) (fsFactory.createPersistenceEngine(), fsFactory.createLeaderElectionAgent(this)) case "CUSTOM" => val clazz = Utils.classForName(conf.get("spark.deploy.recoveryMode.factory")) val factory = clazz.getConstructor(classOf[SparkConf], classOf[Serializer]) .newInstance(conf, serializer) .asInstanceOf[StandaloneRecoveryModeFactory] (factory.createPersistenceEngine(), factory.createLeaderElectionAgent(this)) case _ => (new BlackHolePersistenceEngine(), new MonarchyLeaderAgent(this)) } persistenceEngine = persistenceEngine_ leaderElectionAgent = leaderElectionAgent_ }master主动处理的流程大致就完了,之后就接受其他的请求来被动处理。
worker节点的注册方法
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case RegisterWorker( id, workerHost, workerPort, workerRef, cores, memory, workerUiPort, publicAddress) => { logInfo("Registering worker %s:%d with %d cores, %s RAM".format( workerHost, workerPort, cores, Utils.megabytesToString(memory))) if (state == RecoveryState.STANDBY) { context.reply(MasterInStandby) } else if (idToWorker.contains(id)) { //注册过 context.reply(RegisterWorkerFailed("Duplicate worker ID")) } else { val worker = new WorkerInfo(id, workerHost, workerPort, cores, memory, workerRef, workerUiPort, publicAddress) if (registerWorker(worker)) {//将worker信息加入master内存中 persistenceEngine.addWorker(worker) context.reply(RegisteredWorker(self, masterWebUiUrl)) schedule()//调度 } else { val workerAddress = worker.endpoint.address logWarning("Worker registration failed. Attempted to re-register worker at same " + "address: " + workerAddress) context.reply(RegisterWorkerFailed("Attempted to re-register worker at same address: " + workerAddress)) } } }
/** * Schedule the currently available resources among waiting apps. This method will be called * every time a new app joins or resource availability changes. */ private def schedule(): Unit = { if (state != RecoveryState.ALIVE) { return } // Drivers take strict precedence over executors val shuffledWorkers = Random.shuffle(workers) // Randomization helps balance drivers for (worker <- shuffledWorkers if worker.state == WorkerState.ALIVE) { for (driver <- waitingDrivers) { if (worker.memoryFree >= driver.desc.mem && worker.coresFree >= driver.desc.cores) { launchDriver(worker, driver)//向worker发送LaunchDriver消息 waitingDrivers -= driver } } } startExecutorsOnWorkers()//向worker发送LaunchExecutor消息,并向driver发送ExecutorAdded消息 } private def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc): Unit = { logInfo("Launching executor " + exec.fullId + " on worker " + worker.id) worker.addExecutor(exec) worker.endpoint.send(LaunchExecutor(masterUrl, exec.application.id, exec.id, exec.application.desc, exec.cores, exec.memory)) exec.application.driver.send( ExecutorAdded(exec.id, worker.id, worker.hostPort, exec.cores, exec.memory)) }<pre name="code" class="java">
case RequestSubmitDriver(description)//请求提交Driver消息,记录Driver的信息并调度
case RequestKillDriver(driverId)
case RequestDriverStatus(driverId)
case RegisterApplication(description)//提交Application,记录Application的信息并调度
case ExecutorStateChanged(appId, execId, state, message, exitStatus)
case DriverStateChanged(driverId, state, exception)
case Heartbeat(workerId)//心跳,用于worker节点的保活
case ExecutorStateChanged(appId, execId, state, message, exitStatus)
case DriverStateChanged(driverId, state, exception)
case MasterChangeAcknowledged(appId)
case WorkerSchedulerStateResponse
case AttachCompletedRebuildUI
方法较多,大多数都是看其名知其意
- spark 1.6.0 core源码分析2 master启动流程
- spark core源码分析2 master启动流程
- spark core源码分析2 master启动流程
- spark core源码分析2 master启动流程
- spark 1.6.0 core源码分析4 worker启动流程
- spark 1.6.0 core源码分析3 Master HA
- Spark集群启动之Master、Worker启动流程源码分析
- spark源码分析Master与Worker启动流程篇
- Spark源码分析-master启动
- spark core源码分析4 worker启动流程
- spark core源码分析4 worker启动流程
- spark core源码分析3 Master HA
- spark core源码分析3 Master HA
- HBase 0.1.0版本源码分析--Master启动流程
- spark 1.6.0 core源码分析5 spark提交框架
- spark 1.6.0 core源码分析1 集群启动及任务提交过程
- Spark On YARN启动流程源码分析
- Spark HiveThriftServer2启动流程源码分析
- java集成支付宝支付接口
- PASSION之ASSEMBLY(四)
- 如何解决IMEI缺少最后一位的问题
- BroadCastReceiver的基本使用方法
- L2-005. 集合相似度
- spark 1.6.0 core源码分析2 master启动流程
- IOS图像拉伸解决方案
- GreenDAO—Android ORM框架(一)
- Android编译错误:You have tried to change the API from what has been previously approved.
- 算法导论第六章6.5
- MySQL 一键生成wiki、protobuf等
- 无限轮播图 三种实现方式
- 使用ivy管理jar包
- 如何在mac上安装gradle