Spark 2.1 YARN Client jar upload process
When neither spark.yarn.jars nor spark.yarn.archive is set, the YARN Client packages every jar under the Spark installation's "jars" directory into a single archive and uploads it to HDFS.
First, the jars in that directory are compressed into a zip file whose name looks like the following:
file:/home/houzhizhen/usr/local/spark/spark-2.1.0-bin-hadoop2.7/logs/spark-675d0aed-a7f3-4998-9294-f736bfd313ad/__spark_libs__8552288535406100327.zip
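The archive is then uploaded to the application's staging directory on HDFS and registered in the YARN distributed cache; the excerpt from the YARN Client below shows the full logic. As a quick orientation first, here is a minimal standalone sketch of just the packaging step. It assumes SPARK_HOME is set in the environment; the object name and the temp-file location are invented for illustration (the real code, shown further down, writes the zip into Spark's local directory and then hands it to distribute()):

    import java.io.{File, FileOutputStream}
    import java.nio.file.Files
    import java.util.zip.{ZipEntry, ZipOutputStream}

    // Hypothetical standalone sketch: zip every jar under $SPARK_HOME/jars, the same way the
    // fallback branch in the Client code below does.
    object PackSparkLibs {
      def main(args: Array[String]): Unit = {
        // SPARK_HOME must be set; the temp-file location is left to the JVM default here.
        val jarsDir = new File(sys.env("SPARK_HOME"), "jars")
        val jars = Option(jarsDir.listFiles()).getOrElse(Array.empty[File])
          .filter(f => f.isFile && f.getName.toLowerCase.endsWith(".jar") && f.canRead)
        val archive = File.createTempFile("__spark_libs__", ".zip")

        val out = new ZipOutputStream(new FileOutputStream(archive))
        try {
          out.setLevel(0)                               // store entries without re-compressing
          jars.foreach { f =>
            out.putNextEntry(new ZipEntry(f.getName))   // one entry per jar, at the zip root
            Files.copy(f.toPath, out)
            out.closeEntry()
          }
        } finally {
          out.close()
        }
        println(s"Packed ${jars.length} jars into $archive")
      }
    }

Compression level 0 is deliberate: the jar entries are already compressed, so the zip is only used as a container.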
    /**
     * Distribute a file to the cluster.
     *
     * If the file's path is a "local:" URI, it's actually not distributed. Other files are copied
     * to HDFS (if not already there) and added to the application's distributed cache.
     *
     * @param path URI of the file to distribute.
     * @param resType Type of resource being distributed.
     * @param destName Name of the file in the distributed cache.
     * @param targetDir Subdirectory where to place the file.
     * @param appMasterOnly Whether to distribute only to the AM.
     * @return A 2-tuple. First item is whether the file is a "local:" URI. Second item is the
     *         localized path for non-local paths, or the input `path` for local paths.
     *         The localized path will be null if the URI has already been added to the cache.
     */
    def distribute(
        path: String,
        resType: LocalResourceType = LocalResourceType.FILE,
        destName: Option[String] = None,
        targetDir: Option[String] = None,
        appMasterOnly: Boolean = false): (Boolean, String) = {
      val trimmedPath = path.trim()
      val localURI = Utils.resolveURI(trimmedPath)
      if (localURI.getScheme != LOCAL_SCHEME) {
        if (addDistributedUri(localURI)) {
          val localPath = getQualifiedLocalPath(localURI, hadoopConf)
          val linkname = targetDir.map(_ + "/").getOrElse("") +
            destName.orElse(Option(localURI.getFragment())).getOrElse(localPath.getName())
          val destPath = copyFileToRemote(destDir, localPath, replication)
          val destFs = FileSystem.get(destPath.toUri(), hadoopConf)
          distCacheMgr.addResource(
            destFs, hadoopConf, destPath, localResources, resType, linkname, statCache,
            appMasterOnly = appMasterOnly)
          (false, linkname)
        } else {
          (false, null)
        }
      } else {
        (true, trimmedPath)
      }
    }

    // If we passed in a keytab, make sure we copy the keytab to the staging directory on
    // HDFS, and setup the relevant environment vars, so the AM can login again.
    if (loginFromKeytab) {
      logInfo("To enable the AM to login from keytab, credentials are being copied over to the AM" +
        " via the YARN Secure Distributed Cache.")
      val (_, localizedPath) = distribute(keytab,
        destName = sparkConf.get(KEYTAB),
        appMasterOnly = true)
      require(localizedPath != null, "Keytab file already distributed.")
    }

    /**
     * Add Spark to the cache. There are two settings that control what files to add to the cache:
     * - if a Spark archive is defined, use the archive. The archive is expected to contain
     *   jar files at its root directory.
     * - if a list of jars is provided, filter the non-local ones, resolve globs, and
     *   add the found files to the cache.
     *
     * Note that the archive cannot be a "local" URI. If none of the above settings are found,
     * then upload all files found in $SPARK_HOME/jars.
     */
    val sparkArchive = sparkConf.get(SPARK_ARCHIVE)
    if (sparkArchive.isDefined) {
      val archive = sparkArchive.get
      require(!isLocalUri(archive), s"${SPARK_ARCHIVE.key} cannot be a local URI.")
      distribute(Utils.resolveURI(archive).toString,
        resType = LocalResourceType.ARCHIVE,
        destName = Some(LOCALIZED_LIB_DIR))
    } else {
      sparkConf.get(SPARK_JARS) match {
        case Some(jars) =>
          // Break the list of jars to upload, and resolve globs.
          val localJars = new ArrayBuffer[String]()
          jars.foreach { jar =>
            if (!isLocalUri(jar)) {
              val path = getQualifiedLocalPath(Utils.resolveURI(jar), hadoopConf)
              val pathFs = FileSystem.get(path.toUri(), hadoopConf)
              pathFs.globStatus(path).filter(_.isFile()).foreach { entry =>
                distribute(entry.getPath().toUri().toString(),
                  targetDir = Some(LOCALIZED_LIB_DIR))
              }
            } else {
              localJars += jar
            }
          }

          // Propagate the local URIs to the containers using the configuration.
          sparkConf.set(SPARK_JARS, localJars)

        case None =>
          // No configuration, so fall back to uploading local jar files.
          logWarning(s"Neither ${SPARK_JARS.key} nor ${SPARK_ARCHIVE.key} is set, falling back " +
            "to uploading libraries under SPARK_HOME.")
          val jarsDir = new File(YarnCommandBuilderUtils.findJarsDir(
            sparkConf.getenv("SPARK_HOME")))
          val jarsArchive = File.createTempFile(LOCALIZED_LIB_DIR, ".zip",
            new File(Utils.getLocalDir(sparkConf)))
          val jarsStream = new ZipOutputStream(new FileOutputStream(jarsArchive))

          try {
            jarsStream.setLevel(0)
            jarsDir.listFiles().foreach { f =>
              if (f.isFile && f.getName.toLowerCase().endsWith(".jar") && f.canRead) {
                jarsStream.putNextEntry(new ZipEntry(f.getName))
                Files.copy(f, jarsStream)
                jarsStream.closeEntry()
              }
            }
          } finally {
            jarsStream.close()
          }

          distribute(jarsArchive.toURI.getPath,
            resType = LocalResourceType.ARCHIVE,
            destName = Some(LOCALIZED_LIB_DIR))
      }
    }

    /**
     * Copy user jar to the distributed cache if their scheme is not "local".
     * Otherwise, set the corresponding key in our SparkConf to handle it downstream.
     */
    Option(args.userJar).filter(_.trim.nonEmpty).foreach { jar =>
      val (isLocal, localizedPath) = distribute(jar, destName = Some(APP_JAR_NAME))
      if (isLocal) {
        require(localizedPath != null, s"Path $jar already distributed")
        // If the resource is intended for local use only, handle this downstream
        // by setting the appropriate property
        sparkConf.set(APP_JAR, localizedPath)
      }
    }

    /**
     * Do the same for any additional resources passed in through ClientArguments.
     * Each resource category is represented by a 3-tuple of:
     * (1) comma separated list of resources in this category,
     * (2) resource type, and
     * (3) whether to add these resources to the classpath
     */
    val cachedSecondaryJarLinks = ListBuffer.empty[String]
    List(
      (sparkConf.get(JARS_TO_DISTRIBUTE), LocalResourceType.FILE, true),
      (sparkConf.get(FILES_TO_DISTRIBUTE), LocalResourceType.FILE, false),
      (sparkConf.get(ARCHIVES_TO_DISTRIBUTE), LocalResourceType.ARCHIVE, false)
    ).foreach { case (flist, resType, addToClasspath) =>
      flist.foreach { file =>
        val (_, localizedPath) = distribute(file, resType = resType)
        // If addToClassPath, we ignore adding jar multiple times to distributed cache.
        if (addToClasspath) {
          if (localizedPath != null) {
            cachedSecondaryJarLinks += localizedPath
          }
        } else {
          if (localizedPath == null) {
            throw new IllegalArgumentException(s"Attempt to add ($file) multiple times" +
              " to the distributed cache.")
          }
        }
      }
    }
    if (cachedSecondaryJarLinks.nonEmpty) {
      sparkConf.set(SECONDARY_JARS, cachedSecondaryJarLinks)
    }

    if (isClusterMode && args.primaryPyFile != null) {
      distribute(args.primaryPyFile, appMasterOnly = true)
    }

    pySparkArchives.foreach { f => distribute(f) }

    // The python files list needs to be treated especially. All files that are not an
    // archive need to be placed in a subdirectory that will be added to PYTHONPATH.
    sparkConf.get(PY_FILES).foreach { f =>
      val targetDir = if (f.endsWith(".py")) Some(LOCALIZED_PYTHON_DIR) else None
      distribute(f, targetDir = targetDir)
    }

    // Update the configuration with all the distributed files, minus the conf archive. The
    // conf archive will be handled by the AM differently so that we avoid having to send
    // this configuration by other means. See SPARK-14602 for one reason of why this is needed.
    distCacheMgr.updateConfiguration(sparkConf)

    // Upload the conf archive to HDFS manually, and record its location in the configuration.
    // This will allow the AM to know where the conf archive is in HDFS, so that it can be
    // distributed to the containers.
    //
    // This code forces the archive to be copied, so that unit tests pass (since in that case both
    // file systems are the same and the archive wouldn't normally be copied). In most (all?)
    // deployments, the archive would be copied anyway, since it's a temp file in the local file
    // system.
    val remoteConfArchivePath = new Path(destDir, LOCALIZED_CONF_ARCHIVE)
    val remoteFs = FileSystem.get(remoteConfArchivePath.toUri(), hadoopConf)
    sparkConf.set(CACHED_CONF_ARCHIVE, remoteConfArchivePath.toString())

    val localConfArchive = new Path(createConfArchive().toURI())
    copyFileToRemote(destDir, localConfArchive, replication, force = true,
      destName = Some(LOCALIZED_CONF_ARCHIVE))

    // Manually add the config archive to the cache manager so that the AM is launched with
    // the proper files set up.
    distCacheMgr.addResource(
      remoteFs, hadoopConf, remoteConfArchivePath, localResources, LocalResourceType.ARCHIVE,
      LOCALIZED_CONF_DIR, statCache, appMasterOnly = false)

    // Clear the cache-related entries from the configuration to avoid them polluting the
    // UI's environment page. This works for client mode; for cluster mode, this is handled
    // by the AM.
    CACHE_CONFIGS.foreach(sparkConf.remove)

    localResources
  }
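As the logWarning in the fallback branch indicates, building and uploading __spark_libs__*.zip on every submission can be avoided by pointing Spark at libraries that have already been uploaded to HDFS. A minimal sketch, assuming the jars (or a zip of them) were uploaded beforehand to a hypothetical location such as hdfs:///apps/spark/:

    import org.apache.spark.SparkConf

    object PreUploadedLibs {
      // The HDFS paths below are placeholders; use wherever the libraries were actually uploaded.
      // spark.yarn.archive (SPARK_ARCHIVE above) takes precedence over spark.yarn.jars (SPARK_JARS).
      def conf: SparkConf = new SparkConf()
        .set("spark.yarn.archive", "hdfs:///apps/spark/spark-libs.zip")
        // or, instead of a single archive, a glob of individual jars:
        // .set("spark.yarn.jars", "hdfs:///apps/spark/jars/*.jar")
    }

The same settings can also be passed on the spark-submit command line with --conf or placed in spark-defaults.conf. With either one present, the client takes the SPARK_ARCHIVE or SPARK_JARS branch above instead of the fallback, so no fresh __spark_libs__ zip has to be built and uploaded for each application.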