Spark Source Code Study Notes 4 - SparkEnv


Following 3-LiveListenerBus, let's look at the SparkEnv initialized in the SparkContext constructor. The initialization code is as follows:

// Create the Spark execution environment (cache, map output tracker, etc)
_env = createSparkEnv(_conf, isLocal, listenerBus)
SparkEnv.set(_env)

// This function allows components created by SparkEnv to be mocked in unit tests:
private[spark] def createSparkEnv(
    conf: SparkConf,
    isLocal: Boolean,
    listenerBus: LiveListenerBus): SparkEnv = {
  SparkEnv.createDriverEnv(conf, isLocal, listenerBus, SparkContext.numDriverCores(master))
}

From this we can see that createSparkEnv in SparkContext ends up calling createDriverEnv of the SparkEnv class (in fact a member function of its companion object). Let's take a look at the SparkEnv.scala source code:

package org.apache.spark

......

/**
 * :: DeveloperApi ::
 * Holds all the runtime environment objects for a running Spark instance (either master or worker),
 * including the serializer, RpcEnv, block manager, map output tracker, etc. Currently
 * Spark code finds the SparkEnv through a global variable, so all the threads can access the same
 * SparkEnv. It can be accessed by SparkEnv.get (e.g. after creating a SparkContext).
 *
 * NOTE: This is not intended for external use. This is exposed for Shark and may be made private
 *       in a future release.
 */
@DeveloperApi
class SparkEnv (
    val executorId: String,
    private[spark] val rpcEnv: RpcEnv,
    val serializer: Serializer,
    val closureSerializer: Serializer,
    val serializerManager: SerializerManager,
    val mapOutputTracker: MapOutputTracker,
    val shuffleManager: ShuffleManager,
    val broadcastManager: BroadcastManager,
    val blockManager: BlockManager,
    val securityManager: SecurityManager,
    val metricsSystem: MetricsSystem,
    val memoryManager: MemoryManager,
    val outputCommitCoordinator: OutputCommitCoordinator,
    val conf: SparkConf) extends Logging {

  private[spark] var isStopped = false
  private val pythonWorkers = mutable.HashMap[(String, Map[String, String]), PythonWorkerFactory]()

  // A general, soft-reference map for metadata needed during HadoopRDD split computation
  // (e.g., HadoopFileRDD uses this to cache JobConfs and InputFormats).
  private[spark] val hadoopJobMetadata = new MapMaker().softValues().makeMap[String, Any]()

  private[spark] var driverTmpDir: Option[String] = None

  private[spark] def stop() {
    if (!isStopped) {
      isStopped = true
      pythonWorkers.values.foreach(_.stop())
      mapOutputTracker.stop()
      shuffleManager.stop()
      broadcastManager.stop()
      blockManager.stop()
      blockManager.master.stop()
      metricsSystem.stop()
      outputCommitCoordinator.stop()
      rpcEnv.shutdown()
      rpcEnv.awaitTermination()

      // If we only stop sc, but the driver process still run as a services then we need to delete
      // the tmp dir, if not, it will create too many tmp dirs.
      // We only need to delete the tmp dir create by driver
      driverTmpDir match {
        case Some(path) =>
          try {
            Utils.deleteRecursively(new File(path))
          } catch {
            case e: Exception =>
              logWarning(s"Exception while deleting Spark temp dir: $path", e)
          }
        case None => // We just need to delete tmp dir created by driver, so do nothing on executor
      }
    }
  }

  private[spark]
  def createPythonWorker(pythonExec: String, envVars: Map[String, String]): java.net.Socket = {
    synchronized {
      val key = (pythonExec, envVars)
      pythonWorkers.getOrElseUpdate(key, new PythonWorkerFactory(pythonExec, envVars)).create()
    }
  }

  private[spark]
  def destroyPythonWorker(pythonExec: String, envVars: Map[String, String], worker: Socket) {
    synchronized {
      val key = (pythonExec, envVars)
      pythonWorkers.get(key).foreach(_.stopWorker(worker))
    }
  }

  private[spark]
  def releasePythonWorker(pythonExec: String, envVars: Map[String, String], worker: Socket) {
    synchronized {
      val key = (pythonExec, envVars)
      pythonWorkers.get(key).foreach(_.releaseWorker(worker))
    }
  }
}

object SparkEnv extends Logging {
  @volatile private var env: SparkEnv = _

  private[spark] val driverSystemName = "sparkDriver"
  private[spark] val executorSystemName = "sparkExecutor"

  def set(e: SparkEnv) {
    env = e
  }

  /**
   * Returns the SparkEnv.
   */
  def get: SparkEnv = {
    env
  }

  /**
   * Create a SparkEnv for the driver.
   */
  private[spark] def createDriverEnv(
      conf: SparkConf,
      isLocal: Boolean,
      listenerBus: LiveListenerBus,
      numCores: Int,
      mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = {
    assert(conf.contains(DRIVER_HOST_ADDRESS),
      s"${DRIVER_HOST_ADDRESS.key} is not set on the driver!")
    assert(conf.contains("spark.driver.port"), "spark.driver.port is not set on the driver!")
    val bindAddress = conf.get(DRIVER_BIND_ADDRESS)
    val advertiseAddress = conf.get(DRIVER_HOST_ADDRESS)
    val port = conf.get("spark.driver.port").toInt
    val ioEncryptionKey = if (conf.get(IO_ENCRYPTION_ENABLED)) {
      Some(CryptoStreamUtils.createKey(conf))
    } else {
      None
    }
    create(
      conf,
      SparkContext.DRIVER_IDENTIFIER,
      bindAddress,
      advertiseAddress,
      port,
      isLocal,
      numCores,
      ioEncryptionKey,
      listenerBus = listenerBus,
      mockOutputCommitCoordinator = mockOutputCommitCoordinator
    )
  }

  /**
   * Create a SparkEnv for an executor.
   * In coarse-grained mode, the executor provides an RpcEnv that is already instantiated.
   */
  private[spark] def createExecutorEnv(
      conf: SparkConf,
      executorId: String,
      hostname: String,
      port: Int,
      numCores: Int,
      ioEncryptionKey: Option[Array[Byte]],
      isLocal: Boolean): SparkEnv = {
    val env = create(
      conf,
      executorId,
      hostname,
      hostname,
      port,
      isLocal,
      numCores,
      ioEncryptionKey
    )
    SparkEnv.set(env)
    env
  }

  /**
   * Helper method to create a SparkEnv for a driver or an executor.
   */
  private def create(
      conf: SparkConf,
      executorId: String,
      bindAddress: String,
      advertiseAddress: String,
      port: Int,
      isLocal: Boolean,
      numUsableCores: Int,
      ioEncryptionKey: Option[Array[Byte]],
      listenerBus: LiveListenerBus = null,
      mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = {
    ......
  }

  /**
   * Return a map representation of jvm information, Spark properties, system properties, and
   * class paths. Map keys define the category, and map values represent the corresponding
   * attributes as a sequence of KV pairs. This is used mainly for SparkListenerEnvironmentUpdate.
   */
  private[spark]
  def environmentDetails(
      conf: SparkConf,
      schedulingMode: String,
      addedJars: Seq[String],
      addedFiles: Seq[String]): Map[String, Seq[(String, String)]] = {

    import Properties._
    val jvmInformation = Seq(
      ("Java Version", s"$javaVersion ($javaVendor)"),
      ("Java Home", javaHome),
      ("Scala Version", versionString)
    ).sorted

    // Spark properties
    // This includes the scheduling mode whether or not it is configured (used by SparkUI)
    val schedulerMode =
      if (!conf.contains("spark.scheduler.mode")) {
        Seq(("spark.scheduler.mode", schedulingMode))
      } else {
        Seq[(String, String)]()
      }
    val sparkProperties = (conf.getAll ++ schedulerMode).sorted

    // System properties that are not java classpaths
    val systemProperties = Utils.getSystemProperties.toSeq
    val otherProperties = systemProperties.filter { case (k, _) =>
      k != "java.class.path" && !k.startsWith("spark.")
    }.sorted

    // Class paths including all added jars and files
    val classPathEntries = javaClassPath
      .split(File.pathSeparator)
      .filterNot(_.isEmpty)
      .map((_, "System Classpath"))
    val addedJarsAndFiles = (addedJars ++ addedFiles).map((_, "Added By User"))
    val classPaths = (addedJarsAndFiles ++ classPathEntries).sorted

    Map[String, Seq[(String, String)]](
      "JVM Information" -> jvmInformation,
      "Spark Properties" -> sparkProperties,
      "System Properties" -> otherProperties,
      "Classpath Entries" -> classPaths)
  }
}

From the doc comment of the SparkEnv class we can see that SparkEnv holds nearly all the runtime environment objects of a running Spark instance (either master or worker), including the serializer, RpcEnv, block manager, map output tracker and so on. All of these runtime objects are passed in directly through the constructor (invoked by the companion object). The SparkEnv class also has a stop function, which shuts down each runtime component in turn. The remaining functions create/destroy/release Python workers; what exactly they are used for is not clear yet (they are presumably related to a pool of Python worker processes, to be figured out as we go).

The SparkEnv companion object holds the private variable env: SparkEnv together with its set and get accessors, plus the private[spark] functions createDriverEnv and createExecutorEnv that create a SparkEnv for the driver and for an executor respectively. Both of them end up calling the private function create:
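As a quick illustration of the global accessor, here is a minimal user-side sketch (not part of the source above; SparkEnv is a developer API, so this is for demonstration only, and the app/master names are hypothetical):

import org.apache.spark.{SparkConf, SparkContext, SparkEnv}

object SparkEnvPeek {
  def main(args: Array[String]): Unit = {
    // Constructing the SparkContext triggers createSparkEnv -> SparkEnv.createDriverEnv,
    // and the resulting instance is published through SparkEnv.set.
    val sc = new SparkContext(new SparkConf().setAppName("sparkenv-demo").setMaster("local[2]"))
    val env = SparkEnv.get                       // same instance the SparkContext just stored
    println(env.executorId)                      // SparkContext.DRIVER_IDENTIFIER on the driver side
    println(env.conf.get("spark.driver.port"))   // the port the driver RpcEnv actually bound to
    sc.stop()
  }
}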

/**
 * Helper method to create a SparkEnv for a driver or an executor.
 */
private def create(
    conf: SparkConf,
    executorId: String,
    bindAddress: String,
    advertiseAddress: String,
    port: Int,
    isLocal: Boolean,
    numUsableCores: Int,
    ioEncryptionKey: Option[Array[Byte]],
    listenerBus: LiveListenerBus = null,
    mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = {

  val isDriver = executorId == SparkContext.DRIVER_IDENTIFIER

  // Listener bus is only used on the driver
  if (isDriver) {
    assert(listenerBus != null, "Attempted to create driver SparkEnv with null listener bus!")
  }

  val securityManager = new SecurityManager(conf, ioEncryptionKey)
  ioEncryptionKey.foreach { _ =>
    if (!securityManager.isSaslEncryptionEnabled()) {
      logWarning("I/O encryption enabled without RPC encryption: keys will be visible on the " +
        "wire.")
    }
  }

  val systemName = if (isDriver) driverSystemName else executorSystemName
  val rpcEnv = RpcEnv.create(systemName, bindAddress, advertiseAddress, port, conf,
    securityManager, clientMode = !isDriver)

  // Figure out which port RpcEnv actually bound to in case the original port is 0 or occupied.
  // In the non-driver case, the RPC env's address may be null since it may not be listening
  // for incoming connections.
  if (isDriver) {
    conf.set("spark.driver.port", rpcEnv.address.port.toString)
  } else if (rpcEnv.address != null) {
    conf.set("spark.executor.port", rpcEnv.address.port.toString)
    logInfo(s"Setting spark.executor.port to: ${rpcEnv.address.port.toString}")
  }

  // Create an instance of the class with the given name, possibly initializing it with our conf
  def instantiateClass[T](className: String): T = {
    val cls = Utils.classForName(className)
    // Look for a constructor taking a SparkConf and a boolean isDriver, then one taking just
    // SparkConf, then one taking no arguments
    try {
      cls.getConstructor(classOf[SparkConf], java.lang.Boolean.TYPE)
        .newInstance(conf, new java.lang.Boolean(isDriver))
        .asInstanceOf[T]
    } catch {
      case _: NoSuchMethodException =>
        try {
          cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[T]
        } catch {
          case _: NoSuchMethodException =>
            cls.getConstructor().newInstance().asInstanceOf[T]
        }
    }
  }

  // Create an instance of the class named by the given SparkConf property, or defaultClassName
  // if the property is not set, possibly initializing it with our conf
  def instantiateClassFromConf[T](propertyName: String, defaultClassName: String): T = {
    instantiateClass[T](conf.get(propertyName, defaultClassName))
  }

  val serializer = instantiateClassFromConf[Serializer](
    "spark.serializer", "org.apache.spark.serializer.JavaSerializer")
  logDebug(s"Using serializer: ${serializer.getClass}")

  val serializerManager = new SerializerManager(serializer, conf, ioEncryptionKey)

  val closureSerializer = new JavaSerializer(conf)

  def registerOrLookupEndpoint(
      name: String, endpointCreator: => RpcEndpoint):
    RpcEndpointRef = {
    if (isDriver) {
      logInfo("Registering " + name)
      rpcEnv.setupEndpoint(name, endpointCreator)
    } else {
      RpcUtils.makeDriverRef(name, conf, rpcEnv)
    }
  }

  val broadcastManager = new BroadcastManager(isDriver, conf, securityManager)

  val mapOutputTracker = if (isDriver) {
    new MapOutputTrackerMaster(conf, broadcastManager, isLocal)
  } else {
    new MapOutputTrackerWorker(conf)
  }

  // Have to assign trackerEndpoint after initialization as MapOutputTrackerEndpoint
  // requires the MapOutputTracker itself
  mapOutputTracker.trackerEndpoint = registerOrLookupEndpoint(MapOutputTracker.ENDPOINT_NAME,
    new MapOutputTrackerMasterEndpoint(
      rpcEnv, mapOutputTracker.asInstanceOf[MapOutputTrackerMaster], conf))

  // Let the user specify short names for shuffle managers
  val shortShuffleMgrNames = Map(
    "sort" -> classOf[org.apache.spark.shuffle.sort.SortShuffleManager].getName,
    "tungsten-sort" -> classOf[org.apache.spark.shuffle.sort.SortShuffleManager].getName)
  val shuffleMgrName = conf.get("spark.shuffle.manager", "sort")
  val shuffleMgrClass = shortShuffleMgrNames.getOrElse(shuffleMgrName.toLowerCase, shuffleMgrName)
  val shuffleManager = instantiateClass[ShuffleManager](shuffleMgrClass)

  val useLegacyMemoryManager = conf.getBoolean("spark.memory.useLegacyMode", false)
  val memoryManager: MemoryManager =
    if (useLegacyMemoryManager) {
      new StaticMemoryManager(conf, numUsableCores)
    } else {
      UnifiedMemoryManager(conf, numUsableCores)
    }

  val blockManagerPort = if (isDriver) {
    conf.get(DRIVER_BLOCK_MANAGER_PORT)
  } else {
    conf.get(BLOCK_MANAGER_PORT)
  }

  val blockTransferService =
    new NettyBlockTransferService(conf, securityManager, bindAddress, advertiseAddress,
      blockManagerPort, numUsableCores)

  val blockManagerMaster = new BlockManagerMaster(registerOrLookupEndpoint(
    BlockManagerMaster.DRIVER_ENDPOINT_NAME,
    new BlockManagerMasterEndpoint(rpcEnv, isLocal, conf, listenerBus)),
    conf, isDriver)

  // NB: blockManager is not valid until initialize() is called later.
  val blockManager = new BlockManager(executorId, rpcEnv, blockManagerMaster,
    serializerManager, conf, memoryManager, mapOutputTracker, shuffleManager,
    blockTransferService, securityManager, numUsableCores)

  val metricsSystem = if (isDriver) {
    // Don't start metrics system right now for Driver.
    // We need to wait for the task scheduler to give us an app ID.
    // Then we can start the metrics system.
    MetricsSystem.createMetricsSystem("driver", conf, securityManager)
  } else {
    // We need to set the executor ID before the MetricsSystem is created because sources and
    // sinks specified in the metrics configuration file will want to incorporate this executor's
    // ID into the metrics they report.
    conf.set("spark.executor.id", executorId)
    val ms = MetricsSystem.createMetricsSystem("executor", conf, securityManager)
    ms.start()
    ms
  }

  val outputCommitCoordinator = mockOutputCommitCoordinator.getOrElse {
    new OutputCommitCoordinator(conf, isDriver)
  }
  val outputCommitCoordinatorRef = registerOrLookupEndpoint("OutputCommitCoordinator",
    new OutputCommitCoordinatorEndpoint(rpcEnv, outputCommitCoordinator))
  outputCommitCoordinator.coordinatorRef = Some(outputCommitCoordinatorRef)

  val envInstance = new SparkEnv(
    executorId,
    rpcEnv,
    serializer,
    closureSerializer,
    serializerManager,
    mapOutputTracker,
    shuffleManager,
    broadcastManager,
    blockManager,
    securityManager,
    metricsSystem,
    memoryManager,
    outputCommitCoordinator,
    conf)

  // Add a reference to tmp dir created by driver, we will delete this tmp dir when stop() is
  // called, and we only need to do it for driver. Because driver may run as a service, and if we
  // don't delete this tmp dir when sc is stopped, then will create too many tmp dirs.
  if (isDriver) {
    val sparkFilesDir = Utils.createTempDir(Utils.getLocalDir(conf), "userFiles").getAbsolutePath
    envInstance.driverTmpDir = Some(sparkFilesDir)
  }

  envInstance
}

The create function above uses the parameters passed to it (the SparkConf and so on) to prepare all of the runtime environment objects held by SparkEnv. (To be continued tomorrow...)

Inside create, a SecurityManager is created first; it is mainly responsible for security management:

val securityManager = new SecurityManager(conf, ioEncryptionKey)

Then the RpcEnv is created. It wraps the concrete RPC implementation and provides Spark's RPC functionality. RPC is one of the cores of Spark and deserves a dedicated study later.

val rpcEnv = RpcEnv.create(systemName, bindAddress, advertiseAddress, port, conf,
  securityManager, clientMode = !isDriver)
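For reference, on the driver side the bindAddress / advertiseAddress / port arguments come from standard configuration keys. A minimal user-side sketch with hypothetical values (in local mode SparkContext fills in sensible defaults, and create() writes the actually bound port back into spark.driver.port as shown above):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.driver.bindAddress", "0.0.0.0")       // local address the driver RpcEnv binds to (hypothetical value)
  .set("spark.driver.host", "driver.example.com")   // address advertised to executors (hypothetical value)
  .set("spark.driver.port", "0")                    // 0 lets the RpcEnv pick any free port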

The Serializer is created via Java reflection, and it is then passed as a parameter to construct the SerializerManager:

val serializer = instantiateClassFromConf[Serializer](
  "spark.serializer", "org.apache.spark.serializer.JavaSerializer")

val serializerManager = new SerializerManager(serializer, conf, ioEncryptionKey)

SerializerManager is the SparkEnv component that governs serialization, compression and encryption for the other Spark components, including the automatic selection of which Serializer to use for shuffles. Its doc comment reads:

/**
 * Component which configures serialization, compression and encryption for various Spark
 * components, including automatic selection of which [[Serializer]] to use for shuffles.
 */
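Because the serializer is instantiated purely from a class name, switching it is just a configuration change made before the SparkContext (and hence SparkEnv) is created. A minimal user-side sketch (the property names are the standard Spark ones; the buffer setting is optional tuning):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") // picked up by instantiateClassFromConf
  .set("spark.kryoserializer.buffer.max", "64m")                         // optional Kryo buffer tuning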

Next the BroadcastManager is created. Following the factory pattern, the BroadcastManager manages a factory of type BroadcastFactory and uses it to obtain Broadcast objects; a Broadcast is used to share data across nodes and will be studied in more depth later.

package org.apache.spark.broadcast

......

private[spark] class BroadcastManager(
    val isDriver: Boolean,
    conf: SparkConf,
    securityManager: SecurityManager)
  extends Logging {

  private var initialized = false
  private var broadcastFactory: BroadcastFactory = null

  initialize()

  // Called by SparkContext or Executor before using Broadcast
  private def initialize() {
    synchronized {
      if (!initialized) {
        broadcastFactory = new TorrentBroadcastFactory
        broadcastFactory.initialize(isDriver, conf, securityManager)
        initialized = true
      }
    }
  }

  def stop() {
    broadcastFactory.stop()
  }

  private val nextBroadcastId = new AtomicLong(0)

  def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean): Broadcast[T] = {
    broadcastFactory.newBroadcast[T](value_, isLocal, nextBroadcastId.getAndIncrement())
  }

  def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean) {
    broadcastFactory.unbroadcast(id, removeFromDriver, blocking)
  }
}
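To see where BroadcastManager fits, here is a minimal user-side sketch (illustrative only): SparkContext.broadcast ultimately goes through the BroadcastManager held by SparkEnv, which delegates to the factory described next.

import org.apache.spark.{SparkConf, SparkContext}

object BroadcastDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("broadcast-demo").setMaster("local[2]"))
    val lookup = sc.broadcast(Map("a" -> 1, "b" -> 2))   // driver-side call into newBroadcast
    val resolved = sc.parallelize(Seq("a", "b", "a"))
      .map(k => lookup.value.getOrElse(k, 0))            // tasks read the shared value
      .collect()
    println(resolved.mkString(","))                      // 1,2,1
    lookup.destroy()                                     // remove the broadcast's blocks
    sc.stop()
  }
}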

Spark abstracts the factory responsible for producing Broadcast objects into the following trait:

package org.apache.spark.broadcast

......

/**
 * An interface for all the broadcast implementations in Spark (to allow
 * multiple broadcast implementations). SparkContext uses a user-specified
 * BroadcastFactory implementation to instantiate a particular broadcast for the
 * entire Spark job.
 */
private[spark] trait BroadcastFactory {

  def initialize(isDriver: Boolean, conf: SparkConf, securityMgr: SecurityManager): Unit

  /**
   * Creates a new broadcast variable.
   *
   * @param value value to broadcast
   * @param isLocal whether we are in local mode (single JVM process)
   * @param id unique id representing this broadcast variable
   */
  def newBroadcast[T: ClassTag](value: T, isLocal: Boolean, id: Long): Broadcast[T]

  def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean): Unit

  def stop(): Unit
}

The BroadcastFactory that Spark currently uses is TorrentBroadcastFactory, which is built on a BitTorrent-like mechanism; this will also be studied in more depth later.

package org.apache.spark.broadcast

......

/**
 * A BitTorrent-like implementation of [[org.apache.spark.broadcast.Broadcast]].
 *
 * The mechanism is as follows:
 *
 * The driver divides the serialized object into small chunks and
 * stores those chunks in the BlockManager of the driver.
 *
 * On each executor, the executor first attempts to fetch the object from its BlockManager. If
 * it does not exist, it then uses remote fetches to fetch the small chunks from the driver and/or
 * other executors if available. Once it gets the chunks, it puts the chunks in its own
 * BlockManager, ready for other executors to fetch from.
 *
 * This prevents the driver from being the bottleneck in sending out multiple copies of the
 * broadcast data (one per executor).
 *
 * When initialized, TorrentBroadcast objects read SparkEnv.get.conf.
 *
 * @param obj object to broadcast
 * @param id A unique identifier for the broadcast variable.
 */
private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long)
  extends Broadcast[T](id) with Logging with Serializable

(To be continued tomorrow...)

Next the MapOutputTracker is created. It tracks the locations of a stage's map output (for example, a stage obtains the map output information of the ShuffleMapStage it depends on through the MapOutputTracker). Different MapOutputTracker subclasses are created for the driver and for executors:

val mapOutputTracker = if (isDriver) {
  new MapOutputTrackerMaster(conf, broadcastManager, isLocal)
} else {
  new MapOutputTrackerWorker(conf)
}

A brief overview of the main classes in MapOutputTracker.scala is shown below; the related interfaces and how they work together (MapOutputTrackerMaster, RpcEndpoint, MapOutputTrackerMasterEndpoint) will be explored later.

package org.apache.spark

......

private[spark] sealed trait MapOutputTrackerMessage
private[spark] case class GetMapOutputStatuses(shuffleId: Int)
  extends MapOutputTrackerMessage
private[spark] case object StopMapOutputTracker extends MapOutputTrackerMessage

private[spark] case class GetMapOutputMessage(shuffleId: Int, context: RpcCallContext)

/** RpcEndpoint class for MapOutputTrackerMaster */
private[spark] class MapOutputTrackerMasterEndpoint(
    override val rpcEnv: RpcEnv, tracker: MapOutputTrackerMaster, conf: SparkConf)
  extends RpcEndpoint with Logging {
  ......
}

/**
 * Class that keeps track of the location of the map output of
 * a stage. This is abstract because different versions of MapOutputTracker
 * (driver and executor) use different HashMap to store its metadata.
 */
private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging {
  ......
}

/**
 * MapOutputTracker for the driver.
 */
private[spark] class MapOutputTrackerMaster(conf: SparkConf,
    broadcastManager: BroadcastManager, isLocal: Boolean)
  extends MapOutputTracker(conf) {
  ......
}

/**
 * MapOutputTracker for the executors, which fetches map output information from the driver's
 * MapOutputTrackerMaster.
 */
private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTracker(conf) {
  ......
}

private[spark] object MapOutputTracker extends Logging {
  ......
}
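As a rough mental model only (a deliberately simplified sketch, not Spark's actual implementation): the driver-side master records, per shuffle, where each map task's output lives, and executor-side trackers ask it for those locations when reduce tasks need to fetch shuffle blocks. All names below are made up for illustration.

import scala.collection.concurrent.TrieMap

// Toy stand-in for Spark's MapStatus: where one map task's output can be fetched from.
final case class ToyMapOutputLocation(executorId: String, host: String, sizeBytes: Long)

class ToyMapOutputTracker {
  // shuffleId -> one slot per map task
  private val outputs = TrieMap.empty[Int, Array[ToyMapOutputLocation]]

  def registerShuffle(shuffleId: Int, numMaps: Int): Unit =
    outputs.putIfAbsent(shuffleId, new Array[ToyMapOutputLocation](numMaps))

  def registerMapOutput(shuffleId: Int, mapId: Int, loc: ToyMapOutputLocation): Unit =
    outputs(shuffleId)(mapId) = loc

  // What a reducer needs: the locations of all map outputs for its shuffle.
  def getMapOutputs(shuffleId: Int): Seq[ToyMapOutputLocation] =
    outputs.get(shuffleId).map(_.toSeq.filter(_ != null)).getOrElse(Seq.empty)
}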

Next the MemoryManager is created:

val useLegacyMemoryManager = conf.getBoolean("spark.memory.useLegacyMode", false)
val memoryManager: MemoryManager =
  if (useLegacyMemoryManager) {
    new StaticMemoryManager(conf, numUsableCores)
  } else {
    UnifiedMemoryManager(conf, numUsableCores)
  }
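For reference, the branch above is controlled by spark.memory.useLegacyMode; with the default UnifiedMemoryManager, the execution/storage split is tuned through the standard memory properties. A minimal user-side sketch (the values shown are the documented defaults for this Spark generation):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.memory.useLegacyMode", "false")   // false (default): UnifiedMemoryManager; true: StaticMemoryManager
  .set("spark.memory.fraction", "0.6")          // share of usable heap given to execution + storage
  .set("spark.memory.storageFraction", "0.5")   // portion of that share shielded from eviction for storage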

A brief description of the abstract class MemoryManager:

/**
 * An abstract memory manager that enforces how memory is shared between execution and storage.
 *
 * In this context, execution memory refers to that used for computation in shuffles, joins,
 * sorts and aggregations, while storage memory refers to that used for caching and propagating
 * internal data across the cluster. There exists one MemoryManager per JVM.
 */
private[spark] abstract class MemoryManager(
    conf: SparkConf,
    numCores: Int,
    onHeapStorageMemory: Long,
    onHeapExecutionMemory: Long) extends Logging {
  ......
}

Next the BlockManager is created, which involves the classes NettyBlockTransferService, BlockManagerMaster and BlockManagerMasterEndpoint. These classes need deeper study later.

val blockTransferService =
  new NettyBlockTransferService(conf, securityManager, bindAddress, advertiseAddress,
    blockManagerPort, numUsableCores)

val blockManagerMaster = new BlockManagerMaster(registerOrLookupEndpoint(
  BlockManagerMaster.DRIVER_ENDPOINT_NAME,
  new BlockManagerMasterEndpoint(rpcEnv, isLocal, conf, listenerBus)),
  conf, isDriver)

// NB: blockManager is not valid until initialize() is called later.
val blockManager = new BlockManager(executorId, rpcEnv, blockManagerMaster,
  serializerManager, conf, memoryManager, mapOutputTracker, shuffleManager,
  blockTransferService, securityManager, numUsableCores)

A brief description of BlockManager:

package org.apache.spark.storage

......

/**
 * Manager running on every node (driver and executors) which provides interfaces for putting and
 * retrieving blocks both locally and remotely into various stores (memory, disk, and off-heap).
 *
 * Note that [[initialize()]] must be called before the BlockManager is usable.
 */
private[spark] class BlockManager(
    executorId: String,
    rpcEnv: RpcEnv,
    val master: BlockManagerMaster,
    val serializerManager: SerializerManager,
    val conf: SparkConf,
    memoryManager: MemoryManager,
    mapOutputTracker: MapOutputTracker,
    shuffleManager: ShuffleManager,
    val blockTransferService: BlockTransferService,
    securityManager: SecurityManager,
    numUsableCores: Int)
  extends BlockDataManager with BlockEvictionHandler with Logging {
  ......
}
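Finally, as a user-side illustration of where the BlockManager shows up (a minimal sketch, illustrative only): caching an RDD stores each partition as a block through the BlockManager, and subsequent actions read those blocks back, locally or via the block transfer service.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object CacheDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("cache-demo").setMaster("local[2]"))
    val data = sc.parallelize(1 to 1000000).persist(StorageLevel.MEMORY_AND_DISK)
    println(data.count())   // first action computes the partitions and stores them as blocks
    println(data.count())   // second action is served from the cached blocks
    data.unpersist()        // ask the BlockManagers to drop the blocks
    sc.stop()
  }
}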