1 调用流程图

2 启动脚本

2.1 bin/spark-submit

# For client mode, the driver will be launched in the same JVM that launches# SparkSubmit, so we may need to read the properties file for any extra class# paths, library paths, java options and memory early on. Otherwise, it will# be too late by the time the driver JVM has started.if [[ "$SPARK_SUBMIT_DEPLOY_MODE" == "client" && -f "$SPARK_SUBMIT_PROPERTIES_FILE" ]]; then  # Parse the properties file only if the special configs exist  contains_special_configs=$(    grep -e "spark.driver.extra*\|spark.driver.memory" "$SPARK_SUBMIT_PROPERTIES_FILE" | \    grep -v "^[[:space:]]*#"  )  if [ -n "$contains_special_configs" ]; then    export SPARK_SUBMIT_BOOTSTRAP_DRIVER=1  fifiexec "$SPARK_HOME"/bin/spark-class org.apache.spark.deploy.SparkSubmit "${ORIG_ARGS[@]}"
(1)在client模式下,如果Spark属性配置文件存在,且其中包含未被注释掉的spark.driver.extra*或spark.driver.memory配置项,则export SPARK_SUBMIT_BOOTSTRAP_DRIVER=1。

2.2 bin/spark-class

# In Spark submit client mode, the driver is launched in the same JVM as Spark submit itself.# Here we must parse the properties file for relevant "spark.driver.*" configs before launching# the driver JVM itself. Instead of handling this complexity in Bash, we launch a separate JVM# to prepare the launch environment of this driver JVM.if [ -n "$SPARK_SUBMIT_BOOTSTRAP_DRIVER" ]; then  # This is used only if the properties file actually contains these special configs  # Export the environment variables needed by SparkSubmitDriverBootstrapper  export RUNNER  export CLASSPATH  export JAVA_OPTS  export OUR_JAVA_MEM  export SPARK_CLASS=1  shift # Ignore main class (org.apache.spark.deploy.SparkSubmit) and use our own  exec "$RUNNER" org.apache.spark.deploy.SparkSubmitDriverBootstrapper "$@"else  # Note: The format of this command is closely echoed in SparkSubmitDriverBootstrapper.scala  if [ -n "$SPARK_PRINT_LAUNCH_COMMAND" ]; then    echo -n "Spark Command: " 1>&2    echo "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@" 1>&2    echo -e "========================================\n" 1>&2  fi  exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"fi

3 应用程序主类的启动方式


4 SparkSubmit

4.1 main

  /**
   * Entry point of SparkSubmit: parses the command-line arguments and dispatches
   * to the handler that matches the requested action.
   *
   * @param args raw command-line arguments
   */
  def main(args: Array[String]): Unit = {
    val parsedArgs = new SparkSubmitArguments(args)
    // Echo the parsed arguments when the verbose flag is set.
    if (parsedArgs.verbose) printStream.println(parsedArgs)
    val requestedAction = parsedArgs.action
    requestedAction match {
      case SparkSubmitAction.SUBMIT => submit(parsedArgs)
      case SparkSubmitAction.KILL => kill(parsedArgs)
      case SparkSubmitAction.REQUEST_STATUS => requestStatus(parsedArgs)
    }
  }

4.2 SparkSubmitArguments

  // Initialization steps of SparkSubmitArguments, in the order they are executed.

  // Set parameters from command line arguments
  parseOpts(args.toList)
  // Populate `sparkProperties` map from properties file
  mergeDefaultSparkProperties()
  // Use `sparkProperties` map along with env vars to fill in any missing parameters
  loadEnvironmentArguments()
  // Validation comes last, after the three loading steps above have run.
  validateArguments()
    // Action should be SUBMIT unless otherwise specified
    // Option(...) turns a null `action` into None, so getOrElse supplies SUBMIT.
    action = Option(action).getOrElse(SUBMIT)
  /**
   * Checks that the fields required by the current action are present by
   * delegating to the action-specific validator. Call this only once all
   * defaults are loaded.
   */
  private def validateArguments(): Unit = action match {
    case SUBMIT => validateSubmitArguments()
    case KILL => validateKillArguments()
    case REQUEST_STATUS => validateStatusRequestArguments()
  }

4.3 Spark属性参数优先级


4.4 SparkSubmit.submit

    // Build the child arguments, classpath, system properties and main class from the parsed args.
    val (childArgs, childClasspath, sysProps, childMainClass) = prepareSubmitEnvironment(args)

    // Runs the prepared main class; the proxy-user branch is elided in this excerpt.
    def doRunMain(): Unit = {
      if (args.proxyUser != null) {
        ......
      } else {
        runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
      }
    }

    // In standalone cluster mode, there are two submission gateways:
    //   (1) The traditional Akka gateway using o.a.s.deploy.Client as a wrapper
    //   (2) The new REST-based gateway introduced in Spark 1.3
    // The latter is the default behavior as of Spark 1.3, but Spark submit will fail over
    // to use the legacy gateway if the master endpoint turns out to be not a REST server.
    if (args.isStandaloneCluster && args.useRest) {
      try {
        printStream.println("Running Spark using the REST application submission protocol.")
        doRunMain()
      } catch {
        // Fail over to use the legacy submission gateway
        case e: SubmitRestConnectionException =>
          printWarning(s"Master endpoint ${args.master} was not a REST server. " +
            "Falling back to legacy submission gateway instead.")
          // With useRest cleared, the nested submit call takes the else branch below.
          args.useRest = false
          submit(args)
      }
    // In all other modes, just run the main class as prepared
    } else {
      doRunMain()
    }

4.4.1 SparkSubmit.prepareSubmitEnvironment

    // Set the deploy mode; default is client mode
    var deployMode: Int = args.deployMode match {
      case "client" | null => CLIENT
      case "cluster" => CLUSTER
      // The trailing -1 gives this branch an Int value; presumably it is never
      // used if printErrorAndExit terminates the process (not shown here).
      case _ => printErrorAndExit("Deploy mode must be either client or cluster"); -1
    }

    // In client mode, launch the application main class directly
    // In addition, add the main application jar and any added jars (if any) to the classpath
    if (deployMode == CLIENT) {
      childMainClass = args.mainClass
      if (isUserJar(args.primaryResource)) {
        childClasspath += args.primaryResource
      }
      // args.jars is a comma-separated list; each entry becomes a classpath element.
      if (args.jars != null) { childClasspath ++= args.jars.split(",") }
      if (args.childArgs != null) { childArgs ++= args.childArgs }
    }

    // In standalone cluster mode, use the REST client to submit the application (Spark 1.3+).
    // All Spark parameters are expected to be passed to the client through system properties.
    if (args.isStandaloneCluster) {
      if (args.useRest) {
        // NOTE(review): the main class is an empty string in this excerpt; confirm against the Spark source.
        childMainClass = ""
        childArgs += (args.primaryResource, args.mainClass)
      } else {
        // In legacy standalone cluster mode, use Client as a wrapper around the user class
        childMainClass = "org.apache.spark.deploy.Client"
        if (args.supervise) { childArgs += "--supervise" }
        // Driver memory and cores are passed only when they were set (non-null).
        Option(args.driverMemory).foreach { m => childArgs += ("--memory", m) }
        Option(args.driverCores).foreach { c => childArgs += ("--cores", c) }
        childArgs += "launch"
        childArgs += (args.master, args.primaryResource, args.mainClass)
      }
      // The application's own arguments are appended last in both sub-modes.
      if (args.childArgs != null) {
        childArgs ++= args.childArgs
      }
    }

    // In yarn-cluster mode, use yarn.Client as a wrapper around the user class
    if (isYarnCluster) {
      childMainClass = "org.apache.spark.deploy.yarn.Client"
      ......

4.4.2 SparkSubmit.runMain

  // Signature of runMain; the statements below are excerpts from its body.
  private def runMain(
      childArgs: Seq[String],
      childClasspath: Seq[String],
      sysProps: Map[String, String],
      childMainClass: String,
      verbose: Boolean): Unit

    // When verbose is set, print the main class, arguments, system properties and classpath.
    if (verbose) {
      printStream.println(s"Main class:\n$childMainClass")
      printStream.println(s"Arguments:\n${childArgs.mkString("\n")}")
      printStream.println(s"System properties:\n${sysProps.mkString("\n")}")
      printStream.println(s"Classpath elements:\n${childClasspath.mkString("\n")}")
      printStream.println("\n")
    }

    // Register every child classpath entry with the class loader (`loader` is declared elsewhere).
    for (jar <- childClasspath) {
      addJarToClasspath(jar, loader)
    }

    // Publish each prepared property as a JVM system property.
    for ((key, value) <- sysProps) {
      System.setProperty(key, value)
    }

    // Load the child main class through `loader`; the error handling is elided in this excerpt.
    try {
      mainClass = Class.forName(childMainClass, true, loader)
    } catch {
      ...
    }

    // Look up main(Array[String]) by reflection; the second argument is the String-array class.
    val mainMethod = mainClass.getMethod("main", new Array[String](0).getClass)

    // A null receiver means main is invoked as a static method.
    try {
      mainMethod.invoke(null, childArgs.toArray)
    } catch {
      // Rethrow whatever findCause returns for the failure (findCause is not shown here).
      case t: Throwable =>
        throw findCause(t)
    }
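  runMain最终通过反射调用childMainClass的main方法,childMainClass的取值有以下几种:
  • 应用程序主类名(client模式);
  • org.apache.spark.deploy.Client(standalone cluster模式下的传统提交方式);
  • org.apache.spark.deploy.yarn.Client(yarn-cluster模式)。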

