0003. Understanding SparkContext

1. SparkContext lives in the source tree at \spark-master\core\src\main\scala\org\apache\spark\SparkContext.scala
The source file contains:
(1) The class declaration: class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationClient
(2) The companion object: object SparkContext extends Logging

2. Creating a SparkContext runs the following initialization steps:
(1) Load the configuration (SparkConf)
(2) Create the SparkEnv
(3) Create the TaskScheduler
(4) Create the DAGScheduler
(5) Create the SparkUI
(6) Start the taskScheduler

Open question: there is no explicit DAGScheduler start; step (3) below shows why none is needed.
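
Before diving into the source, a minimal, self-contained usage sketch: simply constructing a SparkContext triggers the whole sequence above (the app name and master URL here are arbitrary examples):

import org.apache.spark.{SparkConf, SparkContext}

object SparkContextDemo extends App {
  // new SparkContext(conf) runs: load SparkConf -> create SparkEnv ->
  // create TaskScheduler -> create DAGScheduler -> create SparkUI -> start taskScheduler
  val conf = new SparkConf().setAppName("sc-init-demo").setMaster("local[2]")
  val sc = new SparkContext(conf)
  println(s"Running Spark ${sc.version}, master = ${sc.master}")
  sc.stop()
}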

The SparkEnv is created at line 450:
    _env = createSparkEnv(_conf, isLocal, listenerBus)
    SparkEnv.set(_env)
// createSparkEnv is defined at line 273:
  private[spark] def createSparkEnv(
      conf: SparkConf,
      isLocal: Boolean,
      listenerBus: LiveListenerBus): SparkEnv = {
    SparkEnv.createDriverEnv(conf, isLocal, listenerBus)
  }
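
SparkEnv.createDriverEnv assembles the driver-side runtime services (block manager, shuffle manager, serializer, and so on), and SparkEnv.set caches the result for the JVM. As a quick illustration, the environment can be inspected from a running application; the two fields below exist on SparkEnv, though the full field set varies across Spark versions:

// Assumes a SparkContext has already been constructed in this JVM.
val env = org.apache.spark.SparkEnv.get
println(env.blockManager)    // storage layer for cached partitions and shuffle blocks
println(env.shuffleManager)  // pluggable shuffle implementation
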
(1) The TaskScheduler is created at line 514:
    // Create and start the scheduler
    val (sched, ts) = SparkContext.createTaskScheduler(this, master)
    _schedulerBackend = sched
    _taskScheduler = ts
(2) The DAGScheduler is created at line 518:
    _dagScheduler = new DAGScheduler(this)
    _heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet)

(3) The TaskScheduler is started at line 523. As the source comment explains, this happens only after the DAGScheduler constructor has set its reference on the taskScheduler, which is also why there is no separate DAGScheduler start step:
    // start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's
    // constructor
    _taskScheduler.start()

(4) Stepping into the createTaskScheduler method (line 2552):
private def createTaskScheduler(
      sc: SparkContext,
      master: String): (SchedulerBackend, TaskScheduler) = {
    // Regular expression used for local[N] and local[*] master formats
    val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r
    // Regular expression for local[N, maxRetries], used in tests with failing tasks
    val LOCAL_N_FAILURES_REGEX = """local\[([0-9]+|\*)\s*,\s*([0-9]+)\]""".r
    // Regular expression for simulating a Spark cluster of [N, cores, memory] locally
    val LOCAL_CLUSTER_REGEX = """local-cluster\[\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*]""".r
    // Regular expression for connecting to Spark deploy clusters
    val SPARK_REGEX = """spark://(.*)""".r
    // Regular expression for connection to Mesos cluster by mesos:// or zk:// url
    val MESOS_REGEX = """(mesos|zk)://.*""".r
    // Regular expression for connection to Simr cluster
    val SIMR_REGEX = """simr://(.*)""".r

    // When running locally, don't try to re-execute tasks on failure.
    val MAX_LOCAL_TASK_FAILURES = 1

(5) Standalone mode (line 2600): the master URL is dispatched with pattern matching; the spark:// case looks like this:
     case SPARK_REGEX(sparkUrl) =>
        val scheduler = new TaskSchedulerImpl(sc) // instantiate TaskSchedulerImpl
        val masterUrls = sparkUrl.split(",").map("spark://" + _) // build the master URL list
        val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls) // create the backend;
// the core job of SparkDeploySchedulerBackend is to launch CoarseGrainedExecutorBackend
// (SparkDeploySchedulerBackend is covered later)
        scheduler.initialize(backend) // initialize the scheduler with the backend
        (backend, scheduler) // return value: (SchedulerBackend, TaskScheduler)
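
To make the pattern match concrete, here is a small self-contained sketch that runs the same regexes against example master URLs (the host names are made up):

object MasterUrlDemo extends App {
  // Same patterns as in createTaskScheduler above.
  val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r
  val SPARK_REGEX = """spark://(.*)""".r

  for (master <- Seq("local[4]", "spark://host1:7077,host2:7077")) {
    master match {
      case LOCAL_N_REGEX(threads) =>
        println(s"$master -> local mode with $threads threads")
      case SPARK_REGEX(sparkUrl) =>
        // Same masterUrls construction as the standalone branch above:
        val masterUrls = sparkUrl.split(",").map("spark://" + _)
        println(s"$master -> standalone, masters = ${masterUrls.mkString(", ")}")
      case _ =>
        println(s"$master -> some other cluster manager")
    }
  }
}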


(6) TaskSchedulerImpl.initialize (TaskSchedulerImpl.scala, line 126):
  def initialize(backend: SchedulerBackend) {
    this.backend = backend
    // temporarily set rootPool name to empty
    rootPool = new Pool("", schedulingMode, 0, 0)
    schedulableBuilder = {
      schedulingMode match {
        case SchedulingMode.FIFO =>
          new FIFOSchedulableBuilder(rootPool)
        case SchedulingMode.FAIR =>
          new FairSchedulableBuilder(rootPool, conf)
      }
    }
    schedulableBuilder.buildPools()
  }
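
Which builder the match selects is controlled by the spark.scheduler.mode setting (FIFO by default). A minimal sketch of opting into fair scheduling; the app name is an arbitrary example:

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("fair-scheduling-demo")
  .setMaster("local[4]")
  .set("spark.scheduler.mode", "FAIR") // selects FairSchedulableBuilder above
val sc = new SparkContext(conf)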


(7) The SparkUI is created at line 464:
    _ui =
      if (conf.getBoolean("spark.ui.enabled", true)) {
        Some(SparkUI.createLiveUI(this, _conf, listenerBus, _jobProgressListener,
          _env.securityManager, appName, startTime = startTime))
      } else {
        // For tests, do not enable the UI
        None
      }
    // Bind the UI before starting the task scheduler to communicate
    // the bound port to the cluster manager properly
    _ui.foreach(_.bind())

Internally, createLiveUI boils down to new SparkUI(...), and bind() starts an embedded Jetty server for the UI, which is how the bound port becomes known before the task scheduler starts.
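
The UI behavior seen here is driven by standard Spark settings; the values below are examples:

val conf = new SparkConf()
  .set("spark.ui.enabled", "true") // "false" takes the None branch above (no UI)
  .set("spark.ui.port", "4040")    // preferred port; bind() retries successive ports if it is taken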

Finally, actions such as count() end up in SparkContext.runJob, which submits the job to the DAGScheduler; a short example follows.
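
A minimal action to see this in motion (assumes a live SparkContext sc, as in the first sketch):

val n = sc.parallelize(1 to 100).count() // count() -> sc.runJob -> DAGScheduler
println(n) // 100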