
来源:互联网 发布:洗数据 编辑:程序博客网 时间:2024/05/29 03:09

1 SparkConf


Spark Driver用于提交用户应用程序,实际可以看作Spark的客户端。




/**
 * Zero-argument auxiliary constructor: delegates to the constructor that takes a
 * `SparkConf`, passing a newly created one.
 */
def this() = {
  this(new SparkConf())
}


  /**
   * Auxiliary constructor taking the master and application name next to a base configuration.
   *
   * The three arguments are combined by `SparkContext.updatedConf` and the resulting
   * configuration is handed to the constructor that takes a `SparkConf`.
   *
   * @param master  passed to `SparkContext.updatedConf` as the master
   * @param appName passed to `SparkContext.updatedConf` as the application name
   * @param conf    base configuration passed to `SparkContext.updatedConf`
   */
  def this(master: String, appName: String, conf: SparkConf) = {
    this(SparkContext.updatedConf(conf, master, appName))
  }


  /**
   * Auxiliary constructor that assembles the configuration from individual settings.
   *
   * A new `SparkConf` is created, combined with the arguments by `SparkContext.updatedConf`,
   * and handed to the constructor that takes a `SparkConf`.
   *
   * @param master      cluster URL
   * @param appName     name displayed in the cluster web UI
   * @param sparkHome   location of Spark on the nodes (defaults to null)
   * @param jars        collection of jars submitted to the cluster
   * @param environment environment variables for the worker nodes
   */
  def this(
      master: String,
      appName: String,
      sparkHome: String = null,
      jars: Seq[String] = Nil,
      environment: Map[String, String] = Map()) = {
    this(SparkContext.updatedConf(new SparkConf(), master, appName, sparkHome, jars, environment))
  }




class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Serializable {  import SparkConf._  /** 默认构造函数调用系统默认参数 */  def this() = this(true)  private val settings = new ConcurrentHashMap[String, String]()  @transient private lazy val reader: ConfigReader = {    val _reader = new ConfigReader(new SparkConfigProvider(settings))    _reader.bindEnv(new ConfigProvider {      override def get(key: String): Option[String] = Option(getenv(key))    })    _reader  }  if (loadDefaults) {    loadFromSystemProperties(false)  }



  /**
   * Copies every entry of `Utils.getSystemProperties` whose key starts with "spark."
   * into this configuration by calling `set`.
   *
   * @param silent forwarded unchanged to `set` for each copied entry
   * @return this instance, so calls can be chained
   */
  private[spark] def loadFromSystemProperties(silent: Boolean): SparkConf = {
    // Load any spark.* system properties
    for ((key, value) <- Utils.getSystemProperties if key.startsWith("spark.")) {
      set(key, value, silent)
    }
    this
  }


settings.put(key, value)

2 sparkEnv



Driver 端主要通过 SparkEnv 的 createDriverEnv 方法创建驱动程序的执行环境:

/**
 * Creates the SparkEnv for the driver.
 *
 * Asserts that the driver host address and "spark.driver.port" are present in `conf`,
 * reads the bind address, advertise address and port from it, creates an I/O encryption
 * key when IO_ENCRYPTION_ENABLED is set, and delegates to `create` using
 * `SparkContext.DRIVER_IDENTIFIER` as the executor id.
 *
 * @param conf                        configuration to read the driver settings from
 * @param isLocal                     forwarded to `create`
 * @param listenerBus                 listener bus; per the original note it only exists on the driver
 * @param numCores                    forwarded to `create`
 * @param mockOutputCommitCoordinator optional coordinator forwarded to `create`
 * @return the SparkEnv built by `create`
 */
private[spark] def createDriverEnv(
    conf: SparkConf,
    isLocal: Boolean,
    listenerBus: LiveListenerBus,
    numCores: Int,
    mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = {
  assert(conf.contains(DRIVER_HOST_ADDRESS),
    s"${DRIVER_HOST_ADDRESS.key} is not set on the driver!")
  assert(conf.contains("spark.driver.port"), "spark.driver.port is not set on the driver!")
  val bindAddress = conf.get(DRIVER_BIND_ADDRESS)
  val advertiseAddress = conf.get(DRIVER_HOST_ADDRESS)
  val port = conf.get("spark.driver.port").toInt
  // Only create a key when I/O encryption is switched on in the configuration.
  val ioEncryptionKey = if (conf.get(IO_ENCRYPTION_ENABLED)) {
    Some(CryptoStreamUtils.createKey(conf))
  } else {
    None
  }
  create(
    conf,
    SparkContext.DRIVER_IDENTIFIER,
    bindAddress,
    advertiseAddress,
    Option(port),
    isLocal,
    numCores,
    ioEncryptionKey,
    listenerBus = listenerBus,
    mockOutputCommitCoordinator = mockOutputCommitCoordinator
  )
}

SparkEnv 在 driver 和 executor 上都会初始化;启动时会根据传入的 executorId(见下面的 isDriver)判断自己运行在 driver program 上还是 executor 上,从而启动不同的组件。

val isDriver = executorId == SparkContext.DRIVER_IDENTIFIER//据此判断是驱动还是executor


private[spark] def createExecutorEnv(      conf: SparkConf,      executorId: String,      hostname: String,      numCores: Int,      ioEncryptionKey: Option[Array[Byte]],      isLocal: Boolean): SparkEnv 


val securityManager = new SecurityManager(conf, ioEncryptionKey)



    val rpcEnv = RpcEnv.create(systemName, bindAddress, advertiseAddress, port.getOrElse(-1), conf,      securityManager, clientMode = !isDriver)

3 spark的RPC通信机制

3.1 RpcEndpoint

// An RpcEnv is constructed through the factory's `create` method.
private[spark] trait RpcEnvFactory {

  def create(config: RpcEnvConfig): RpcEnv
}

private[spark] trait RpcEndpoint {

  // The RpcEnv this endpoint is registered to.
  val rpcEnv: RpcEnv

  // Returns the RpcEndpointRef that corresponds to this RpcEndpoint. Per the original note,
  // the ref only becomes available once `onStart` has run, so `self` must be used after that.
  final def self: RpcEndpointRef = {
    require(rpcEnv != null, "rpcEnv has not been initialized")
    rpcEnv.endpointRef(this)
  }

  // Handles messages coming from RpcEndpointRef.send or RpcCallContext.reply.
  // The default implementation rejects every message.
  def receive: PartialFunction[Any, Unit] = {
    case _ => throw new SparkException(self + " does not implement 'receive'")
  }

  // Handles messages sent through RpcEndpointRef.ask. After processing the message the
  // endpoint has to send a reply to the caller of RpcEndpointRef.ask.
  // The default implementation reports a failure for every message.
  def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
    case _ => context.sendFailure(new SparkException(self + " won't reply anything"))
  }

  // .... (the remaining members are omitted in this excerpt)
}

3.2 RpcEndpointRef




def address: RpcAddress


send: 发射后不管,不需要对方回应,RpcEndpoint.receive对应接收

     def send(message: Any): Unit

ask: 发送后需要在限定的timeout内收到reply,只发送一次、不会重试。对端由RpcEndpoint.receiveAndReply接收该消息并进行回复。

def ask[T: ClassTag](message: Any, timeout: RpcTimeout): Future[T]


  /**
   * Blocking counterpart of `ask`: sends `message` with `ask` and waits on the returned
   * future through `timeout.awaitResult`.
   *
   * @param message the message to send
   * @param timeout passed to `ask` and also used to await the result
   * @return the value produced by `timeout.awaitResult` for the reply future
   */
  def askSync[T: ClassTag](message: Any, timeout: RpcTimeout): T = {
    timeout.awaitResult(ask[T](message, timeout))
  }


3.3 RpcEnv


/**
 * Address of an RPC endpoint host, made of a host name and a port.
 *
 * @param host host part of the address
 * @param port port part of the address
 */
private[spark] case class RpcAddress(host: String, port: Int) {

  /** Returns the address in the form "host:port". */
  def hostPort: String = s"$host:$port"

  /** Returns a string in the form of "spark://host:port". */
  def toSparkURL: String = s"spark://$hostPort"

  /** Same text as [[hostPort]]. */
  override def toString: String = hostPort
}



 def setupEndpoint(name: String, endpoint: RpcEndpoint): RpcEndpointRef





  /**
   * Creates an RpcEnv from the individual settings.
   *
   * The arguments are bundled into an `RpcEnvConfig`, which is then handed to a new
   * `NettyRpcEnvFactory`.
   *
   * @param name             name stored in the config
   * @param bindAddress      address stored in the config as the bind address
   * @param advertiseAddress address stored in the config as the advertise address
   * @param port             port stored in the config
   * @param conf             Spark configuration stored in the config
   * @param securityManager  security manager stored in the config
   * @param clientMode       client-mode flag stored in the config
   * @return the RpcEnv produced by `NettyRpcEnvFactory.create`
   */
  def create(
      name: String,
      bindAddress: String,
      advertiseAddress: String,
      port: Int,
      conf: SparkConf,
      securityManager: SecurityManager,
      clientMode: Boolean): RpcEnv = {
    val config = RpcEnvConfig(conf, name, bindAddress, advertiseAddress, port, securityManager,
      clientMode)
    new NettyRpcEnvFactory().create(config)
  }

如果 sparkEnv 是在驱动程序上启动的, 就注册一个 Endpoint, 如果是在executor上启动, 得到的是一个ref, executor通过引用远程调用 驱动中的程序。

sparkEnv 中的registerOrLookupEndpoint方法解释了这个选择

    /**
     * Returns a reference to the endpoint called `name`.
     *
     * When `isDriver` is true the endpoint is created and registered with `rpcEnv`;
     * otherwise a reference to the driver's endpoint is obtained via `RpcUtils.makeDriverRef`.
     *
     * @param name            name of the endpoint
     * @param endpointCreator by-name expression; only passed on in the driver branch
     * @return the endpoint reference
     */
    def registerOrLookupEndpoint(
        name: String, endpointCreator: => RpcEndpoint):
      RpcEndpointRef = {
      if (isDriver) {
        logInfo("Registering " + name)
        // Driver: register the endpoint itself.
        rpcEnv.setupEndpoint(name, endpointCreator)
      } else {
        // Otherwise: only obtain a ref pointing at the driver.
        RpcUtils.makeDriverRef(name, conf, rpcEnv)
      }
    }

4 基于Netty的RpcEnv

4.1 通过工厂产生该环境:

/** Factory that builds a [[NettyRpcEnv]] from an `RpcEnvConfig`. */
private[rpc] class NettyRpcEnvFactory extends RpcEnvFactory with Logging {

  /**
   * Creates a NettyRpcEnv and, unless `config.clientMode` is set, starts its server.
   *
   * @param config settings for the environment
   * @return the created NettyRpcEnv
   */
  def create(config: RpcEnvConfig): RpcEnv = {
    val sparkConf = config.conf
    // Use JavaSerializerInstance in multiple threads is safe. However, if we plan to support
    // KryoSerializer in future, we have to use ThreadLocal to store SerializerInstance
    val javaSerializerInstance =
      new JavaSerializer(sparkConf).newInstance().asInstanceOf[JavaSerializerInstance]
    val nettyEnv =
      new NettyRpcEnv(sparkConf, javaSerializerInstance, config.advertiseAddress,
        config.securityManager)
    // Only a non-client environment opens a server (the article describes this as the driver).
    if (!config.clientMode) {
      // Starts the server on the given port and reports the port it actually uses.
      val startNettyRpcEnv: Int => (NettyRpcEnv, Int) = { actualPort =>
        nettyEnv.startServer(config.bindAddress, actualPort)
        (nettyEnv, nettyEnv.address.port)
      }
      try {
        Utils.startServiceOnPort(config.port, startNettyRpcEnv, sparkConf, config.name)._1
      } catch {
        case NonFatal(e) =>
          // Release the environment before propagating the start-up failure.
          nettyEnv.shutdown()
          throw e
      }
    }
    nettyEnv
  }
}



      val startNettyRpcEnv: Int => (NettyRpcEnv, Int) = { actualPort =>        nettyEnv.startServer(config.bindAddress, actualPort)        (nettyEnv, nettyEnv.address.port)      }


  /**
   * Starts the transport server and registers the endpoint verifier.
   *
   * @param bindAddress address passed to `transportContext.createServer`
   * @param port        port passed to `transportContext.createServer`
   */
  def startServer(bindAddress: String, port: Int): Unit = {
    // An authentication bootstrap is only installed when authentication is enabled.
    val bootstraps: java.util.List[TransportServerBootstrap] =
      if (securityManager.isAuthenticationEnabled()) {
        java.util.Arrays.asList(new AuthServerBootstrap(transportConf, securityManager))
      } else {
        java.util.Collections.emptyList()
      }
    // Create the server from address, port and bootstraps.
    server = transportContext.createServer(bindAddress, port, bootstraps)
    // Register an RpcEndpointVerifier under RpcEndpointVerifier.NAME. Per the original note it
    // is used to ask the dispatcher whether an endpoint already exists.
    dispatcher.registerRpcEndpoint(
      RpcEndpointVerifier.NAME, new RpcEndpointVerifier(this, dispatcher))
  }

4.2 nettyRpcEnv


4.2.1 transportContext负责管理网络传输上下文信息


  private val transportContext = new TransportContext(transportConf,    new NettyRpcHandler(dispatcher, this, streamManager))


The handler keeps track of all client instances that communicate with it, so that the RpcEnv
knows which TransportClient instance to use when sending RPCs to a client endpoint

/**
 * RpcHandler that hands incoming RPC messages over to the dispatcher.
 */
private[netty] class NettyRpcHandler(
    dispatcher: Dispatcher,
    nettyEnv: NettyRpcEnv,
    streamManager: StreamManager) extends RpcHandler with Logging {

  // Hash map holding the RPC addresses of all clients.
  private val remoteAddresses = new ConcurrentHashMap[RpcAddress, RpcAddress]()

  override def receive(
      client: TransportClient,
      message: ByteBuffer,
      callback: RpcResponseCallback): Unit = {
    val messageToDispatch = internalReceive(client, message)
    // The actual routing of the message is always done by the dispatcher.
    dispatcher.postRemoteMessage(messageToDispatch, callback)
  }

  // ... (remaining members such as internalReceive are not shown in this excerpt)
}

4.2.2 dispatcher:注册,并将信息路由到endpoint点


  /** Wrapper bundling an RpcEndpoint with its name, its ref and its inbox (request queue). */
  private class EndpointData(
      val name: String,
      val endpoint: RpcEndpoint,
      val ref: NettyRpcEndpointRef) {
    val inbox = new Inbox(ref, endpoint)
  }

  // Endpoint name -> EndpointData.
  private val endpoints: ConcurrentMap[String, EndpointData] =
    new ConcurrentHashMap[String, EndpointData]

  // RpcEndpoint -> RpcEndpointRef: maps every endpoint to its reference.
  private val endpointRefs: ConcurrentMap[RpcEndpoint, RpcEndpointRef] =
    new ConcurrentHashMap[RpcEndpoint, RpcEndpointRef]

  // Queue of the EndpointData whose inbox has messages, i.e. the request queue.
  private val receivers = new LinkedBlockingQueue[EndpointData]


/**
 * Registers `endpoint` under `name` and returns a reference to it.
 *
 * @param name     name the endpoint is registered under; must not be in use yet
 * @param endpoint the endpoint to register
 * @return the newly created reference for the endpoint
 * @throws IllegalStateException    if the dispatcher has been stopped
 * @throws IllegalArgumentException if an endpoint with the same name already exists
 */
def registerRpcEndpoint(name: String, endpoint: RpcEndpoint): NettyRpcEndpointRef = {
  // Build the endpoint address from this environment's address and the endpoint name.
  val addr = RpcEndpointAddress(nettyEnv.address, name)
  // The ref is created on the node that hosts the endpoint; it wraps the RpcEndpointAddress.
  val endpointRef = new NettyRpcEndpointRef(nettyEnv.conf, addr, nettyEnv)
  synchronized {
    if (stopped) {
      throw new IllegalStateException("RpcEnv has been stopped")
    }
    if (endpoints.putIfAbsent(name, new EndpointData(name, endpoint, endpointRef)) != null) {
      throw new IllegalArgumentException(s"There is already an RpcEndpoint called $name")
    }
    val data = endpoints.get(name)
    endpointRefs.put(data.endpoint, data.ref)
    receivers.offer(data)  // for the OnStart message
  }
  endpointRef
}


如果在当前rpc所在的那个节点中生成了NettyRpcEndpointRef(nettyEnv.conf, addr,




  /**
   * Posts `message` to the endpoint registered under `endpointName`.
   *
   * @param endpointName      name of the target endpoint
   * @param message           the message to deliver
   * @param callbackIfStopped invoked with the error when the dispatcher is stopped or no
   *                          endpoint with that name is registered
   */
  private def postMessage(
      endpointName: String,
      message: InboxMessage,
      callbackIfStopped: (Exception) => Unit): Unit = {
    val error = synchronized {
      // Look up the EndpointData of the target endpoint.
      val data = endpoints.get(endpointName)
      if (stopped) {
        Some(new RpcEnvStoppedException())
      } else if (data == null) {
        Some(new SparkException(s"Could not find $endpointName."))
      } else {
        // Every message is delivered through the endpoint's inbox.
        data.inbox.post(message)
        // Put the EndpointData into the request queue so a message loop picks it up.
        receivers.offer(data)
        None
      }
    }
    // We don't need to call `onStop` in the `synchronized` block
    error.foreach(callbackIfStopped)
  }

ThreadPoolExecutor线程池中的多个线程并发地从 receivers 中取出有待处理消息的 EndpointData,再走后面的流程:

/** Thread pool used for dispatching messages. */
private val threadpool: ThreadPoolExecutor = {
  // Use "spark.rpc.netty.dispatcher.numThreads" from the conf when it is set; otherwise fall
  // back to the number of available processors, but never fewer than 2.
  val numThreads = nettyEnv.conf.getInt("spark.rpc.netty.dispatcher.numThreads",
    math.max(2, Runtime.getRuntime.availableProcessors()))
  // Size the pool with that thread count and start one MessageLoop per thread.
  val pool = ThreadUtils.newDaemonFixedThreadPool(numThreads, "dispatcher-event-loop")
  for (i <- 0 until numThreads) {
    pool.execute(new MessageLoop)
  }
  pool
}

/** Message loop used for dispatching messages. */
private class MessageLoop extends Runnable {
  override def run(): Unit = {
    try {
      while (true) {
        try {
          // Take the next EndpointData from the request queue (blocks while it is empty).
          val data = receivers.take()
          if (data == PoisonPill) {
            // PoisonPill is the sentinel `new EndpointData(null, null, null)`: it means the
            // loop has to exit. It is offered back to the queue before returning, presumably
            // so that the other message loops see it as well.
            receivers.offer(PoisonPill)
            return
          }
          // This is where the messages of the endpoint are actually processed.
          data.inbox.process(Dispatcher.this)
        } catch {
          case NonFatal(e) => logError(e.getMessage, e)
        }
      }
    } catch {
      case ie: InterruptedException => // exit
    }
  }
}


/**
 * Message queue of a single RpcEndpoint together with the logic that feeds the queued
 * messages to that endpoint.
 *
 * @param endpointRef reference of the endpoint; only used in the drop warning here
 * @param endpoint    the endpoint whose callbacks are invoked for the queued messages
 */
private[netty] class Inbox(
    val endpointRef: NettyRpcEndpointRef,
    val endpoint: RpcEndpoint)
  extends Logging {

  inbox =>

  // Pending messages, kept in a linked list.
  @GuardedBy("this")
  protected val messages = new java.util.LinkedList[InboxMessage]()

  @GuardedBy("this")
  private var stopped = false

  /** Whether several threads may process messages at the same time. */
  @GuardedBy("this")
  private var enableConcurrent = false

  /** Number of threads currently processing messages of this inbox. */
  @GuardedBy("this")
  private var numActiveThreads = 0

  // OnStart is queued on construction so that it is processed before any other message.
  inbox.synchronized {
    messages.add(OnStart)
  }

  /**
   * Process stored messages.
   */
  def process(dispatcher: Dispatcher): Unit = {
    var message: InboxMessage = null
    inbox.synchronized {
      // Without concurrency enabled only one thread may work on this inbox.
      if (!enableConcurrent && numActiveThreads != 0) {
        return
      }
      // Take the first message of the list.
      message = messages.poll()
      if (message != null) {
        numActiveThreads += 1
      } else {
        return
      }
    }
    while (true) {
      // Invoke the endpoint callback that matches the message type.
      safelyCall(endpoint) {
        message match {
          // A reply is expected.
          case RpcMessage(_sender, content, context) =>
            try {
              endpoint.receiveAndReply(context).applyOrElse[Any, Unit](content, { msg =>
                throw new SparkException(s"Unsupported message $message from ${_sender}")
              })
            } catch {
              case NonFatal(e) =>
                context.sendFailure(e)
                // Throw the exception -- this exception will be caught by the safelyCall function.
                // The endpoint's onError function will be called.
                throw e
            }

          // No reply is expected.
          case OneWayMessage(_sender, content) =>
            endpoint.receive.applyOrElse[Any, Unit](content, { msg =>
              throw new SparkException(s"Unsupported message $message from ${_sender}")
            })

          case OnStart =>
            endpoint.onStart()
            // Endpoints that are not ThreadSafeRpcEndpoint get concurrent processing enabled,
            // unless the inbox has been stopped in the meantime.
            if (!endpoint.isInstanceOf[ThreadSafeRpcEndpoint]) {
              inbox.synchronized {
                if (!stopped) {
                  enableConcurrent = true
                }
              }
            }

          case OnStop =>
            val activeThreads = inbox.synchronized { inbox.numActiveThreads }
            assert(activeThreads == 1,
              s"There should be only a single active thread but found $activeThreads threads.")
            dispatcher.removeRpcEndpointRef(endpoint)
            endpoint.onStop()
            assert(isEmpty, "OnStop should be the last message")

          case RemoteProcessConnected(remoteAddress) =>
            endpoint.onConnected(remoteAddress)

          case RemoteProcessDisconnected(remoteAddress) =>
            endpoint.onDisconnected(remoteAddress)

          case RemoteProcessConnectionError(cause, remoteAddress) =>
            endpoint.onNetworkError(cause, remoteAddress)
        }
      }

      inbox.synchronized {
        // "enableConcurrent" will be set to false after `onStop` is called, so we should check it
        // every time.
        if (!enableConcurrent && numActiveThreads != 1) {
          // If we are not the only one worker, exit
          numActiveThreads -= 1
          return
        }
        message = messages.poll()
        if (message == null) {
          numActiveThreads -= 1
          return
        }
      }
    }
  }

  /** Queues `message`, or drops it through `onDrop` when the inbox has been stopped. */
  def post(message: InboxMessage): Unit = inbox.synchronized {
    if (stopped) {
      // We already put "OnStop" into "messages", so we should drop further messages
      onDrop(message)
    } else {
      messages.add(message)
    }
  }

  /** Marks the inbox as stopped and queues OnStop; later calls have no effect. */
  def stop(): Unit = inbox.synchronized {
    // The following codes should be in `synchronized` so that we can make sure "OnStop" is the last
    // message
    if (!stopped) {
      // We should disable concurrent here. Then when RpcEndpoint.onStop is called, it's the only
      // thread that is processing messages. So `RpcEndpoint.onStop` can release its resources
      // safely.
      enableConcurrent = false
      stopped = true
      messages.add(OnStop)
      // Note: The concurrent events in messages will be processed one by one.
    }
  }

  /** True when no message is queued. */
  def isEmpty: Boolean = inbox.synchronized { messages.isEmpty }

  /**
   * Called when we are dropping a message. Test cases override this to test message dropping.
   * Exposed for testing.
   */
  protected def onDrop(message: InboxMessage): Unit = {
    logWarning(s"Drop $message because $endpointRef is stopped")
  }

  /**
   * Calls action closure, and calls the endpoint's onError function in the case of exceptions.
   */
  private def safelyCall(endpoint: RpcEndpoint)(action: => Unit): Unit = {
    try action catch {
      case NonFatal(e) =>
        try endpoint.onError(e) catch {
          case NonFatal(ee) => logError(s"Ignoring error", ee)
        }
    }
  }
}

4.2.3 streamManager 提供文件服务

private val streamManager = new NettyStreamManager(this)


  private val files = new ConcurrentHashMap[String, File]()  private val jars = new ConcurrentHashMap[String, File]()  private val dirs = new ConcurrentHashMap[String, File]()
