spark源码之sparkEnv(2)blockManager

来源:互联网 发布:matlab画网络拓扑图 编辑:程序博客网 时间:2024/06/11 20:32

1 sparkEnv中初始化blockManager

首先,根据是在驱动还是在executor上,选择端口。

   // Choose the port config entry by role (driver vs. executor) and resolve it in one lookup.
   val blockManagerPort = conf.get(
     if (isDriver) DRIVER_BLOCK_MANAGER_PORT else BLOCK_MANAGER_PORT)

有了端口,就可以建立基于netty的rpc系统,传输大文件

    // Netty-based block transfer service. The last three arguments are named to make the
    // mapping onto the constructor parameters explicit: the advertised address becomes the
    // service's hostName and the block manager port becomes its starting port.
    val blockTransferService = new NettyBlockTransferService(
      conf,
      securityManager,
      bindAddress,
      hostName = advertiseAddress,
      _port = blockManagerPort,
      numCores = numUsableCores)

无论是在驱动程序还是在executor上,都会先创建blockManagerMaster,再根据blockManagerMaster 和blockTransferService 创建blockManager;两者的区别只在于registerOrLookupEndpoint是注册端点(驱动端)还是查找端点的引用(executor端)。

    // Handle used to talk to the block manager master endpoint. registerOrLookupEndpoint is
    // given the well-known endpoint name and an expression constructing a
    // BlockManagerMasterEndpoint; the result is wrapped in a BlockManagerMaster.
    val blockManagerMaster = new BlockManagerMaster(registerOrLookupEndpoint(
      BlockManagerMaster.DRIVER_ENDPOINT_NAME,
      new BlockManagerMasterEndpoint(rpcEnv, isLocal, conf, listenerBus)),
      conf, isDriver)

    // NB: blockManager is not valid until initialize() is called later.
    val blockManager = new BlockManager(executorId, rpcEnv, blockManagerMaster,
      serializerManager, conf, memoryManager, mapOutputTracker, shuffleManager,
      blockTransferService, securityManager, numUsableCores)

在master上,registerOrLookupEndpoint构造一个BlockManagerMasterEndpoint,并将该点注册在rpcEnv中。

BlockManagerMasterEndpoint 负责通过rpc方式去管理所有节点的 BlockManager。

而在executor上创建manager时,会通过setupEndpointRef 方法获取 BlockManagerMasterEndpoint的引用(BlockManagerMasterEndpointRef),同时也会启动自己的 BlockManager。

这样就建立了基于rpc的分布式存储系统。

2 blockTransferService

由于分布式存储,block可能存放在不同的节点。如果需要传输这些block就需要建立通信。

/**
 * A BlockTransferService implemented on top of the Netty transport classes
 * (TransportContext / TransportServer / TransportClientFactory).
 *
 * @param conf            Spark configuration
 * @param securityManager consulted to decide whether authentication bootstraps are installed
 * @param bindAddress     address handed to transportContext.createServer
 * @param hostName        host name of this service; appears in the "Server created on" log line
 * @param _port           starting port handed to Utils.startServiceOnPort
 * @param numCores        core count handed to SparkTransportConf.fromSparkConf
 */
private[spark] class NettyBlockTransferService(
    conf: SparkConf,
    securityManager: SecurityManager,
    bindAddress: String,
    override val hostName: String,
    _port: Int,
    numCores: Int)
  extends BlockTransferService {

  // Serializer passed to the RPC handler.
  private val serializer = new JavaSerializer(conf)
  // Whether authentication is enabled according to the security manager.
  private val authEnabled = securityManager.isAuthenticationEnabled()
  // Transport configuration built for the "shuffle" module with the given number of cores.
  private val transportConf = SparkTransportConf.fromSparkConf(conf, "shuffle", numCores)

  // Network I/O context; like the three fields below it is only set by init().
  private[this] var transportContext: TransportContext = _
  private[this] var server: TransportServer = _
  private[this] var clientFactory: TransportClientFactory = _
  private[this] var appId: String = _

  /**
   * Initializes the network server and the client factory.
   *
   * @param blockDataManager passed to the NettyBlockRpcServer that handles incoming requests
   */
  override def init(blockDataManager: BlockDataManager): Unit = {
    val rpcHandler = new NettyBlockRpcServer(conf.getAppId, serializer, blockDataManager)

    // Authentication bootstraps exist only when authentication is enabled. The server-side
    // bootstrap is built before the client-side one.
    val serverBootstrap: Option[TransportServerBootstrap] =
      if (authEnabled) {
        Some(new AuthServerBootstrap(transportConf, securityManager))
      } else {
        None
      }
    val clientBootstrap: Option[TransportClientBootstrap] =
      if (authEnabled) {
        Some(new AuthClientBootstrap(transportConf, conf.getAppId, securityManager))
      } else {
        None
      }

    transportContext = new TransportContext(transportConf, rpcHandler)
    clientFactory = transportContext.createClientFactory(clientBootstrap.toSeq.asJava)
    server = createServer(serverBootstrap.toList)
    appId = conf.getAppId
    logInfo(s"Server created on ${hostName}:${server.getPort}")
  }

  /**
   * Creates the transport server on bindAddress, starting from _port.
   *
   * @param bootstraps server bootstraps to install (empty when authentication is disabled)
   * @return the server returned by Utils.startServiceOnPort
   */
  private def createServer(bootstraps: List[TransportServerBootstrap]): TransportServer = {
    // startServiceOnPort expects a function from a candidate port to (service, actual port).
    def startService(port: Int): (TransportServer, Int) = {
      val server = transportContext.createServer(bindAddress, port, bootstraps.asJava)
      (server, server.getPort)
    }

    Utils.startServiceOnPort(_port, startService, conf, getClass.getName)._1
  }

blockTransferService 的 fetchBlocks方法会到真正存储数据的其他节点上fetch数据:先获得一个client,然后调用OneForOneBlockFetcher获取block;当最大重试次数大于0时,会通过RetryingBlockFetcher来发起请求。

  /**
   * Starts fetching the given blocks from the transfer service at host:port.
   *
   * Results are delivered through `listener`. If starting the fetch throws an Exception, it is
   * logged and reported to the listener as a failure for every requested block id.
   *
   * @param host         remote host to connect to
   * @param port         remote port to connect to
   * @param execId       id of the remote executor; passed to OneForOneBlockFetcher
   * @param blockIds     ids of the blocks to fetch
   * @param listener     callback notified of fetch results
   * @param shuffleFiles passed through to OneForOneBlockFetcher
   */
  override def fetchBlocks(
      host: String,
      port: Int,
      execId: String,
      blockIds: Array[String],
      listener: BlockFetchingListener,
      shuffleFiles: Array[File]): Unit = {
    logTrace(s"Fetch blocks from $host:$port (executor id $execId)")
    try {
      // One fetch attempt: create a client for the remote end and start a one-for-one fetch.
      val blockFetchStarter = new RetryingBlockFetcher.BlockFetchStarter {
        override def createAndStart(
            blockIds: Array[String],
            listener: BlockFetchingListener): Unit = {
          val client = clientFactory.createClient(host, port)
          new OneForOneBlockFetcher(client, appId, execId, blockIds.toArray, listener,
            transportConf, shuffleFiles).start()
        }
      }

      val maxRetries = transportConf.maxIORetries()
      if (maxRetries > 0) {
        // Note this Fetcher will correctly handle maxRetries == 0; we avoid it just in case there's
        // a bug in this code. We should remove the if statement once we're sure of the stability.
        new RetryingBlockFetcher(transportConf, blockFetchStarter, blockIds, listener).start()
      } else {
        blockFetchStarter.createAndStart(blockIds, listener)
      }
    } catch {
      case e: Exception =>
        logError("Exception while beginning fetchBlocks", e)
        blockIds.foreach(listener.onBlockFetchFailure(_, e))
    }
  }

  /** Port reported by the underlying transport server. */
  override def port: Int = server.getPort

3 BlockManagerMaster

在master上,registerOrLookupEndpoint构造一个BlockManagerMasterEndpoint,将该点注册在rpcEnv中。返回一个EndpointRef。

然后其他executor就可以向Master注册自己的blockManager:通过driverEndpoint.askSync向驱动端发送RegisterBlockManager消息,并同步等待驱动端返回的BlockManagerId:

  /**
   * Registers a BlockManager with the driver endpoint and returns the id the driver answers with.
   *
   * @param blockManagerId    id proposed by the registering BlockManager
   * @param maxOnHeapMemSize  on-heap memory size reported in the registration message
   * @param maxOffHeapMemSize off-heap memory size reported in the registration message
   * @param slaveEndpoint     endpoint reference of the registering BlockManager
   * @return the BlockManagerId contained in the driver's reply
   */
  def registerBlockManager(
      blockManagerId: BlockManagerId,
      maxOnHeapMemSize: Long,
      maxOffHeapMemSize: Long,
      slaveEndpoint: RpcEndpointRef): BlockManagerId = {
    logInfo(s"Registering BlockManager $blockManagerId")
    val registration =
      RegisterBlockManager(blockManagerId, maxOnHeapMemSize, maxOffHeapMemSize, slaveEndpoint)
    // askSync blocks until the driver endpoint has replied.
    val idFromDriver = driverEndpoint.askSync[BlockManagerId](registration)
    logInfo(s"Registered BlockManager $idFromDriver")
    idFromDriver
  }

除此之外,Master还负责管理各个节点上的block。

比如移除executor:

  /**
   * Sends a RemoveExecutor message for the given executor via `tell`, then logs the removal.
   *
   * @param execId id of the executor to remove
   */
  def removeExecutor(execId: String): Unit = {
    tell(RemoveExecutor(execId))
    logInfo(s"Removed $execId successfully in removeExecutor")
  }

从节点上清除指定的block:需要把该block的Id通过RemoveBlock消息发送给驱动端

  /**
   * Asks the driver endpoint to remove the given block and waits for its reply.
   * The Boolean reply is discarded.
   *
   * @param blockId id of the block to remove
   */
  def removeBlock(blockId: BlockId): Unit = {
    driverEndpoint.askSync[Boolean](RemoveBlock(blockId))
  }

清除所有属于同一个RDD的blocks

  /**
   * Asks the driver endpoint to remove all blocks belonging to the given RDD.
   *
   * @param rddId    id of the RDD whose blocks should be removed
   * @param blocking when true, wait for the removal future via `timeout.awaitResult`
   */
  def removeRdd(rddId: Int, blocking: Boolean): Unit = {
    val future = driverEndpoint.askSync[Future[Seq[Int]]](RemoveRdd(rddId))
    // `failed.foreach` replaces the deprecated `onFailure`; the callback still runs on the
    // same-thread execution context and still logs only Exceptions.
    future.failed.foreach {
      case e: Exception =>
        logWarning(s"Failed to remove RDD $rddId - ${e.getMessage}", e)
      case _ => ()
    }(ThreadUtils.sameThread)
    if (blocking) {
      timeout.awaitResult(future)
    }
  }

……

4 BlockManager

建立一个SlaveEndpoint。

  // Endpoint of this BlockManager, registered in rpcEnv under a name made unique by
  // appending the next value from BlockManager.ID_GENERATOR.
  private val slaveEndpoint = {
    val endpointName = "BlockManagerEndpoint" + BlockManager.ID_GENERATOR.next
    val endpoint = new BlockManagerSlaveEndpoint(rpcEnv, this, mapOutputTracker)
    rpcEnv.setupEndpoint(endpointName, endpoint)
  }

加上blockmanagerId

  // Id of this BlockManager. Declared with the default initializer (`_`), so it holds no
  // usable value until it is assigned later.
  var blockManagerId: BlockManagerId = _

BlockManager的实例对象只有在调用initialize之后才能正常工作。
主要是BlockTransferService(网络通信),ShuffleClient的初始化

/**
 * Initializes this BlockManager for the given application.
 *
 * In order: initializes the block transfer service and the shuffle client, instantiates the
 * configured block replication policy, registers with the master, derives the shuffle server
 * id, and registers with the external shuffle service when that is enabled and this is not
 * the driver.
 *
 * @param appId application id handed to the shuffle client
 */
def initialize(appId: String): Unit = {
  blockTransferService.init(this)
  shuffleClient.init(appId)

  // The policy class is looked up by name from the configuration and created reflectively.
  blockReplicationPolicy = {
    val priorityClass = conf.get(
      "spark.storage.replication.policy", classOf[RandomBlockReplicationPolicy].getName)
    val clazz = Utils.classForName(priorityClass)
    val ret = clazz.newInstance.asInstanceOf[BlockReplicationPolicy]
    logInfo(s"Using $priorityClass for block replication policy")
    ret
  }

  val id =
    BlockManagerId(executorId, blockTransferService.hostName, blockTransferService.port, None)

  // Register with the master, handing over slaveEndpoint.
  val idFromMaster = master.registerBlockManager(
    id,
    maxOnHeapMemory,
    maxOffHeapMemory,
    slaveEndpoint)

  // Use the id returned by the master, falling back to the local one if it is null.
  blockManagerId = Option(idFromMaster).getOrElse(id)

  shuffleServerId = if (externalShuffleServiceEnabled) {
    logInfo(s"external shuffle service port = $externalShuffleServicePort")
    BlockManagerId(executorId, blockTransferService.hostName, externalShuffleServicePort)
  } else {
    blockManagerId
  }

  // Register Executors' configuration with the local shuffle service, if one should exist.
  if (externalShuffleServiceEnabled && !blockManagerId.isDriver) {
    registerWithExternalShuffleServer()
  }

  logInfo(s"Initialized BlockManager: $blockManagerId")
}

blockstore

block的存储分为两个层次:内存(MemoryStore)和磁盘(DiskStore):

  // Memory-level block store; it is constructed with a reference to this BlockManager.
  private[spark] val memoryStore =
    new MemoryStore(conf, blockInfoManager, serializerManager, memoryManager, this)
  // Disk-level block store, built on the disk block manager.
  private[spark] val diskStore = new DiskStore(conf, diskBlockManager, securityManager)
  // Hand the memory store to the memory manager; this must come after memoryStore is created.
  memoryManager.setMemoryStore(memoryStore)