Spark 2.1 YARN Client jar upload process


When submitting an application, the YARN Client packages all of the jars under the "jars" directory of the Spark distribution into a single archive file and uploads that archive to HDFS (unless spark.yarn.jars or spark.yarn.archive is configured, as the code below shows).

First, the jars under $SPARK_HOME/jars are packed into an uncompressed zip file with a name like the following:
file:/home/houzhizhen/usr/local/spark/spark-2.1.0-bin-hadoop2.7/logs/spark-675d0aed-a7f3-4998-9294-f736bfd313ad/__spark_libs__8552288535406100327.zip
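
That zip is produced only when neither spark.yarn.jars nor spark.yarn.archive is set. The step itself is simple, and the standalone sketch below mirrors it: walk $SPARK_HOME/jars and write every readable .jar into one zip with compression level 0 (the jars are already compressed, so there is no point recompressing them). The object name and output location here are illustrative only; the real Client uses Guava's Files.copy and a temp file under the Spark local dir, as the source further down shows.

    import java.io.{File, FileInputStream, FileOutputStream}
    import java.util.zip.{ZipEntry, ZipOutputStream}

    // Standalone sketch of the fallback packaging step; names and paths are illustrative.
    object ZipSparkJars {
      def main(args: Array[String]): Unit = {
        // Assumes SPARK_HOME is set in the environment.
        val jarsDir = new File(sys.env("SPARK_HOME"), "jars")
        val archive = File.createTempFile("__spark_libs__", ".zip")
        val out = new ZipOutputStream(new FileOutputStream(archive))
        try {
          out.setLevel(0) // store entries without recompressing them
          jarsDir.listFiles().foreach { f =>
            if (f.isFile && f.getName.toLowerCase.endsWith(".jar") && f.canRead) {
              out.putNextEntry(new ZipEntry(f.getName))
              val in = new FileInputStream(f)
              try {
                val buf = new Array[Byte](8192)
                Iterator.continually(in.read(buf)).takeWhile(_ != -1).foreach(out.write(buf, 0, _))
              } finally {
                in.close()
              }
              out.closeEntry()
            }
          }
        } finally {
          out.close()
        }
        println(s"Wrote ${archive.getAbsolutePath}")
      }
    }

The actual logic lives in prepareLocalResources of org.apache.spark.deploy.yarn.Client. The relevant portion follows, starting with the distribute helper that copies a single resource to HDFS and registers it in the application's distributed cache.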

    /**
     * Distribute a file to the cluster.
     *
     * If the file's path is a "local:" URI, it's actually not distributed. Other files are copied
     * to HDFS (if not already there) and added to the application's distributed cache.
     *
     * @param path URI of the file to distribute.
     * @param resType Type of resource being distributed.
     * @param destName Name of the file in the distributed cache.
     * @param targetDir Subdirectory where to place the file.
     * @param appMasterOnly Whether to distribute only to the AM.
     * @return A 2-tuple. First item is whether the file is a "local:" URI. Second item is the
     *         localized path for non-local paths, or the input `path` for local paths.
     *         The localized path will be null if the URI has already been added to the cache.
     */
    def distribute(
        path: String,
        resType: LocalResourceType = LocalResourceType.FILE,
        destName: Option[String] = None,
        targetDir: Option[String] = None,
        appMasterOnly: Boolean = false): (Boolean, String) = {
      val trimmedPath = path.trim()
      val localURI = Utils.resolveURI(trimmedPath)
      if (localURI.getScheme != LOCAL_SCHEME) {
        if (addDistributedUri(localURI)) {
          val localPath = getQualifiedLocalPath(localURI, hadoopConf)
          val linkname = targetDir.map(_ + "/").getOrElse("") +
            destName.orElse(Option(localURI.getFragment())).getOrElse(localPath.getName())
          val destPath = copyFileToRemote(destDir, localPath, replication)
          val destFs = FileSystem.get(destPath.toUri(), hadoopConf)
          distCacheMgr.addResource(
            destFs, hadoopConf, destPath, localResources, resType, linkname, statCache,
            appMasterOnly = appMasterOnly)
          (false, linkname)
        } else {
          (false, null)
        }
      } else {
        (true, trimmedPath)
      }
    }

    // If we passed in a keytab, make sure we copy the keytab to the staging directory on
    // HDFS, and setup the relevant environment vars, so the AM can login again.
    if (loginFromKeytab) {
      logInfo("To enable the AM to login from keytab, credentials are being copied over to the AM" +
        " via the YARN Secure Distributed Cache.")
      val (_, localizedPath) = distribute(keytab,
        destName = sparkConf.get(KEYTAB),
        appMasterOnly = true)
      require(localizedPath != null, "Keytab file already distributed.")
    }

    /**
     * Add Spark to the cache. There are two settings that control what files to add to the cache:
     * - if a Spark archive is defined, use the archive. The archive is expected to contain
     *   jar files at its root directory.
     * - if a list of jars is provided, filter the non-local ones, resolve globs, and
     *   add the found files to the cache.
     *
     * Note that the archive cannot be a "local" URI. If none of the above settings are found,
     * then upload all files found in $SPARK_HOME/jars.
     */
    val sparkArchive = sparkConf.get(SPARK_ARCHIVE)
    if (sparkArchive.isDefined) {
      val archive = sparkArchive.get
      require(!isLocalUri(archive), s"${SPARK_ARCHIVE.key} cannot be a local URI.")
      distribute(Utils.resolveURI(archive).toString,
        resType = LocalResourceType.ARCHIVE,
        destName = Some(LOCALIZED_LIB_DIR))
    } else {
      sparkConf.get(SPARK_JARS) match {
        case Some(jars) =>
          // Break the list of jars to upload, and resolve globs.
          val localJars = new ArrayBuffer[String]()
          jars.foreach { jar =>
            if (!isLocalUri(jar)) {
              val path = getQualifiedLocalPath(Utils.resolveURI(jar), hadoopConf)
              val pathFs = FileSystem.get(path.toUri(), hadoopConf)
              pathFs.globStatus(path).filter(_.isFile()).foreach { entry =>
                distribute(entry.getPath().toUri().toString(),
                  targetDir = Some(LOCALIZED_LIB_DIR))
              }
            } else {
              localJars += jar
            }
          }

          // Propagate the local URIs to the containers using the configuration.
          sparkConf.set(SPARK_JARS, localJars)

        case None =>
          // No configuration, so fall back to uploading local jar files.
          logWarning(s"Neither ${SPARK_JARS.key} nor ${SPARK_ARCHIVE.key} is set, falling back " +
            "to uploading libraries under SPARK_HOME.")
          val jarsDir = new File(YarnCommandBuilderUtils.findJarsDir(
            sparkConf.getenv("SPARK_HOME")))
          val jarsArchive = File.createTempFile(LOCALIZED_LIB_DIR, ".zip",
            new File(Utils.getLocalDir(sparkConf)))
          val jarsStream = new ZipOutputStream(new FileOutputStream(jarsArchive))

          try {
            jarsStream.setLevel(0)
            jarsDir.listFiles().foreach { f =>
              if (f.isFile && f.getName.toLowerCase().endsWith(".jar") && f.canRead) {
                jarsStream.putNextEntry(new ZipEntry(f.getName))
                Files.copy(f, jarsStream)
                jarsStream.closeEntry()
              }
            }
          } finally {
            jarsStream.close()
          }

          distribute(jarsArchive.toURI.getPath,
            resType = LocalResourceType.ARCHIVE,
            destName = Some(LOCALIZED_LIB_DIR))
      }
    }

    /**
     * Copy user jar to the distributed cache if their scheme is not "local".
     * Otherwise, set the corresponding key in our SparkConf to handle it downstream.
     */
    Option(args.userJar).filter(_.trim.nonEmpty).foreach { jar =>
      val (isLocal, localizedPath) = distribute(jar, destName = Some(APP_JAR_NAME))
      if (isLocal) {
        require(localizedPath != null, s"Path $jar already distributed")
        // If the resource is intended for local use only, handle this downstream
        // by setting the appropriate property
        sparkConf.set(APP_JAR, localizedPath)
      }
    }

    /**
     * Do the same for any additional resources passed in through ClientArguments.
     * Each resource category is represented by a 3-tuple of:
     *   (1) comma separated list of resources in this category,
     *   (2) resource type, and
     *   (3) whether to add these resources to the classpath
     */
    val cachedSecondaryJarLinks = ListBuffer.empty[String]
    List(
      (sparkConf.get(JARS_TO_DISTRIBUTE), LocalResourceType.FILE, true),
      (sparkConf.get(FILES_TO_DISTRIBUTE), LocalResourceType.FILE, false),
      (sparkConf.get(ARCHIVES_TO_DISTRIBUTE), LocalResourceType.ARCHIVE, false)
    ).foreach { case (flist, resType, addToClasspath) =>
      flist.foreach { file =>
        val (_, localizedPath) = distribute(file, resType = resType)
        // If addToClassPath, we ignore adding jar multiple times to distributed cache.
        if (addToClasspath) {
          if (localizedPath != null) {
            cachedSecondaryJarLinks += localizedPath
          }
        } else {
          if (localizedPath == null) {
            throw new IllegalArgumentException(s"Attempt to add ($file) multiple times" +
              " to the distributed cache.")
          }
        }
      }
    }
    if (cachedSecondaryJarLinks.nonEmpty) {
      sparkConf.set(SECONDARY_JARS, cachedSecondaryJarLinks)
    }

    if (isClusterMode && args.primaryPyFile != null) {
      distribute(args.primaryPyFile, appMasterOnly = true)
    }

    pySparkArchives.foreach { f => distribute(f) }

    // The python files list needs to be treated especially. All files that are not an
    // archive need to be placed in a subdirectory that will be added to PYTHONPATH.
    sparkConf.get(PY_FILES).foreach { f =>
      val targetDir = if (f.endsWith(".py")) Some(LOCALIZED_PYTHON_DIR) else None
      distribute(f, targetDir = targetDir)
    }

    // Update the configuration with all the distributed files, minus the conf archive. The
    // conf archive will be handled by the AM differently so that we avoid having to send
    // this configuration by other means. See SPARK-14602 for one reason of why this is needed.
    distCacheMgr.updateConfiguration(sparkConf)

    // Upload the conf archive to HDFS manually, and record its location in the configuration.
    // This will allow the AM to know where the conf archive is in HDFS, so that it can be
    // distributed to the containers.
    //
    // This code forces the archive to be copied, so that unit tests pass (since in that case both
    // file systems are the same and the archive wouldn't normally be copied). In most (all?)
    // deployments, the archive would be copied anyway, since it's a temp file in the local file
    // system.
    val remoteConfArchivePath = new Path(destDir, LOCALIZED_CONF_ARCHIVE)
    val remoteFs = FileSystem.get(remoteConfArchivePath.toUri(), hadoopConf)
    sparkConf.set(CACHED_CONF_ARCHIVE, remoteConfArchivePath.toString())

    val localConfArchive = new Path(createConfArchive().toURI())
    copyFileToRemote(destDir, localConfArchive, replication, force = true,
      destName = Some(LOCALIZED_CONF_ARCHIVE))

    // Manually add the config archive to the cache manager so that the AM is launched with
    // the proper files set up.
    distCacheMgr.addResource(
      remoteFs, hadoopConf, remoteConfArchivePath, localResources, LocalResourceType.ARCHIVE,
      LOCALIZED_CONF_DIR, statCache, appMasterOnly = false)

    // Clear the cache-related entries from the configuration to avoid them polluting the
    // UI's environment page. This works for client mode; for cluster mode, this is handled
    // by the AM.
    CACHE_CONFIGS.foreach(sparkConf.remove)

    localResources
  }
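
As the SPARK_ARCHIVE branch near the top of the listing shows, the zip-and-upload fallback only runs when neither spark.yarn.archive nor spark.yarn.jars is set. Re-uploading the full set of library jars on every submit can therefore be avoided by uploading the archive to HDFS once and pointing spark.yarn.archive at it (the archive must not be a local: URI, per the require above). A minimal yarn-client-mode sketch, assuming the archive has already been copied to the hypothetical path hdfs:///spark/jars/spark-libs.zip:

    import org.apache.spark.{SparkConf, SparkContext}

    // Sketch only: the HDFS path is an assumption; the archive there should contain the
    // contents of $SPARK_HOME/jars at its root, which is what the Client expects.
    val conf = new SparkConf()
      .setMaster("yarn")
      .setAppName("reuse-spark-libs")
      // With this set, the Client takes the SPARK_ARCHIVE branch and skips the fallback zip.
      .set("spark.yarn.archive", "hdfs:///spark/jars/spark-libs.zip")

    val sc = new SparkContext(conf)

The same key is more commonly set in spark-defaults.conf or passed with --conf on spark-submit. spark.yarn.jars works similarly but lists individual jar URIs (globs are resolved, as the SPARK_JARS branch above shows) instead of a single archive.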