Spark storage series------3. How cached data blocks affect the scheduling of subsequent Jobs, and the execution of subsequent Job Tasks


DAGScheduler.submitStage builds the physical execution graph of a Spark application. It calls DAGScheduler.getMissingParentStages to find the ancestor stages of a stage and adds them to the physical execution graph. During this traversal, if all partitions of a dependent RDD are already stored in the BlockManager (that is, the RDD has been successfully cached), then that RDD and its ancestor RDDs are not added to the physical execution plan; in other words, no tasks are assigned to recompute the cached RDD or its ancestors. DAGScheduler.getMissingParentStages is defined as follows:

// Find all parent stages of the current stage; once a parent stage is found,
// do not keep searching for that parent's own ancestor stages.
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missing = new HashSet[Stage]
  val visited = new HashSet[RDD[_]]
  // We are manually maintaining a stack here to prevent StackOverflowError
  // caused by recursively visiting
  val waitingForVisit = new Stack[RDD[_]]
  def visit(rdd: RDD[_]) {
    if (!visited(rdd)) {
      visited += rdd
      /*
       * If all partitions of the RDD have already been cached in the BlockManager, this RDD
       * and its ancestor RDDs are not used when building the physical execution plan; that is,
       * the cached RDD and its ancestors will not actually be recomputed during task execution.
       */
      val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
      if (rddHasUncachedPartitions) {
        // Split stages according to the RDD's dependencies
        for (dep <- rdd.dependencies) {
          dep match {
            case shufDep: ShuffleDependency[_, _, _] =>
              /* A parent stage has been found, so the current rdd is not pushed onto the
               * waitingForVisit stack. This method registers all ancestor stages along the
               * shufDep dependency chain in the DAGScheduler.shuffleToMapStage HashMap.
               */
              val mapStage = getShuffleMapStage(shufDep, stage.firstJobId)
              /*
               * Exactly when this returns true still needs further study. After the tasks of
               * a stage's parent stage have been submitted, this method may return true, so
               * that missing is returned empty and the tasks of this stage can then be
               * submitted in submitStage.
               */
              if (!mapStage.isAvailable) {
                missing += mapStage
              }
            case narrowDep: NarrowDependency[_] =>
              // No parent stage found; push the current rdd onto the waitingForVisit stack
              waitingForVisit.push(narrowDep.rdd)
          }
        }
      }
    }
  }
  waitingForVisit.push(stage.rdd)
  while (waitingForVisit.nonEmpty) {
    /*
     * Pop an RDD off the waitingForVisit stack and continue the dependency analysis,
     * splitting stages as needed.
     */
    visit(waitingForVisit.pop())
  }
  missing.toList
}
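To make the effect concrete, here is a minimal driver-side sketch (not from the original post; the object name, app name and input path are placeholders) that caches the result of a shuffle and then runs a second job over it. In the second job, getCacheLocs reports every partition of counts as cached, so getMissingParentStages adds no parent stage and the cached blocks are read instead of recomputing the lineage; the Spark UI lists the map stage of the second job under "Skipped Stages".

import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical demo driver; "data.txt" is a placeholder input path.
object CacheSkipDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("cache-skip-demo").setMaster("local[2]"))

    // The shuffle map stage is produced by reduceByKey; its result is cached.
    val counts = sc.textFile("data.txt")
      .flatMap(_.split("\\s+"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .cache()

    counts.count()   // Job 1: runs the shuffle map stage and the result stage, and
                     //        stores the reduced partitions in the BlockManager.

    counts.collect() // Job 2: every partition of counts is found in the BlockManager,
                     //        so the lineage above reduceByKey is not recomputed.

    sc.stop()
  }
}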

DAGScheduler.getMissingParentStages calls DAGScheduler.getCacheLocs to look up, and record, the block information of RDD partitions cached in the BlockManager. The partition block information cached in the BlockManager is kept in the DAGScheduler.cacheLocs HashMap. DAGScheduler.getCacheLocs is defined as follows:

def getCacheLocs(rdd: RDD[_]): Seq[Seq[TaskLocation]] = cacheLocs.synchronized {
  // Note: this doesn't use `getOrElse()` because this method is called O(num tasks) times
  if (!cacheLocs.contains(rdd.id)) {
    // Note: if the storage level is NONE, we don't need to get locations from block manager.
    val locs: Seq[Seq[TaskLocation]] = if (rdd.getStorageLevel == StorageLevel.NONE) {
      Seq.fill(rdd.partitions.size)(Nil)
    } else {
      /*
       * The rdd has been cached; build the RDDBlockId for each of its partitions.
       */
      val blockIds =
        rdd.partitions.indices.map(index => RDDBlockId(rdd.id, index)).toArray[BlockId]
      // Ask the driver which node's BlockManager stores each RDD partition; the node hosting
      // that BlockManager is used as the node on which the task will be launched.
      blockManagerMaster.getLocations(blockIds).map { bms =>
        bms.map(bm => TaskLocation(bm.host, bm.executorId))
      }
    }
    cacheLocs(rdd.id) = locs
  }
  cacheLocs(rdd.id)
}

As can be seen, the key of DAGScheduler.cacheLocs is the RDD's id, and the value holds, for each partition of the cached RDD, the corresponding TaskLocations; each TaskLocation is created from the address of the node where the cached block resides.
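As an illustration of that shape, here is a tiny stand-alone sketch (not Spark source; TaskLocation below is a simplified stand-in for org.apache.spark.scheduler.TaskLocation) of how cacheLocs maps an RDD id to per-partition cache locations, and of the contains(Nil) test that getMissingParentStages performs on it.

import scala.collection.mutable

// Simplified stand-in for Spark's TaskLocation.
case class TaskLocation(host: String, executorId: String)

object CacheLocsShape {
  // rdd.id -> one entry per partition -> the locations holding that partition's cached block.
  val cacheLocs = new mutable.HashMap[Int, IndexedSeq[Seq[TaskLocation]]]

  def main(args: Array[String]): Unit = {
    // RDD 3 has two partitions: partition 0 is cached on executor 1 of host-a,
    // partition 1 is not cached anywhere (Nil).
    cacheLocs(3) = IndexedSeq(
      Seq(TaskLocation("host-a", "1")),
      Nil)

    // The same test getMissingParentStages uses: any Nil entry means "has uncached partitions".
    val rddHasUncachedPartitions = cacheLocs(3).contains(Nil)
    println(s"RDD 3 fully cached: ${!rddHasUncachedPartitions}") // false
  }
}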

When DAGScheduler.submitMissingTasks creates ShuffleMapTasks or ResultTasks, the values of this DAGScheduler.cacheLocs HashMap are used in their creation.

DAGScheduler.submitMissingTasks calls DAGScheduler.getPreferredLocs, which in turn calls DAGScheduler.getPreferredLocsInternal to determine the node on which a task should run. Inside DAGScheduler.getPreferredLocsInternal, DAGScheduler.getCacheLocs is called; if the partition has already been cached in a BlockManager, the TaskLocation corresponding to that BlockManager is used as the node on which the task executes. DAGScheduler.getPreferredLocsInternal is defined as follows:

private def getPreferredLocsInternal(
    rdd: RDD[_],
    partition: Int,
    visited: HashSet[(RDD[_], Int)]): Seq[TaskLocation] = {
  // If the partition has already been visited, no need to re-visit.
  // This avoids exponential path exploration.  SPARK-695
  if (!visited.add((rdd, partition))) {
    // Nil has already been returned for previously visited partitions.
    return Nil
  }
  // If the partition is cached, return the cache locations
  val cached = getCacheLocs(rdd)(partition)
  if (cached.nonEmpty) {
    // The cached location becomes the node on which the task runs; return the
    // TaskLocation objects corresponding to that location.
    return cached
  }
  // If the RDD has some placement preferences (as is the case for input RDDs), get those
  /*
   * For a ShuffledRDD, rddPrefs.nonEmpty is false.
   */
  val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
  if (rddPrefs.nonEmpty) {
    return rddPrefs.map(TaskLocation(_))
  }
  rdd.dependencies.foreach {
    case n: NarrowDependency[_] =>
      // If the RDD has narrow dependencies, pick the first partition of the first narrow dep
      // that has any placement preferences. Ideally we would choose based on transfer sizes,
      // but this will do for now.
      for (inPart <- n.getParents(partition)) {
        val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
        if (locs != Nil) {
          return locs
        }
      }
    case s: ShuffleDependency[_, _, _] =>
      // For shuffle dependencies, pick locations which have at least REDUCER_PREF_LOCS_FRACTION
      // of data as preferred locations
      if (shuffleLocalityEnabled &&
          rdd.partitions.size < SHUFFLE_PREF_REDUCE_THRESHOLD &&
          s.rdd.partitions.size < SHUFFLE_PREF_MAP_THRESHOLD) {
        // Get the preferred map output locations for this reducer
        /*
         * Use the output of the stage 1 shuffle map tasks to decide the locality of the
         * stage 2 shuffle reduce tasks: if the shuffle data produced at some TaskLocation
         * in stage 1 accounts for more than REDUCER_PREF_LOCS_FRACTION of all the data of
         * a stage 2 partition, that TaskLocation is used as the TaskLocation of the
         * stage 2 task.
         */
        val topLocsForReducer = mapOutputTracker.getLocationsWithLargestOutputs(s.shuffleId,
          partition, rdd.partitions.size, REDUCER_PREF_LOCS_FRACTION)
        if (topLocsForReducer.nonEmpty) {
          return topLocsForReducer.get.map(loc => TaskLocation(loc.host, loc.executorId))
        }
      }
    case _ =>
  }
  Nil
}
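The following stand-alone sketch (not Spark source; MiniRdd, cachedLocsOf and the sample topology are made up, and the shuffle-dependency branch is omitted) condenses the preference order implemented above: cached block locations win, then the RDD's own preferredLocations, then the first narrow-dependency parent that has a preference.

// Toy model of one partition's location preference.
case class MiniRdd(
    id: Int,
    preferredLocs: Seq[String],        // e.g. HDFS block hosts for an input RDD
    narrowParents: Seq[MiniRdd] = Nil)

object PreferredLocsSketch {
  // Pretend lookup into DAGScheduler.cacheLocs for a single partition:
  // only RDD 2 is cached, on host-b.
  def cachedLocsOf(rdd: MiniRdd): Seq[String] =
    if (rdd.id == 2) Seq("host-b") else Nil

  def preferredLocs(rdd: MiniRdd): Seq[String] = {
    val cached = cachedLocsOf(rdd)
    if (cached.nonEmpty) return cached                        // 1. cached partition wins
    if (rdd.preferredLocs.nonEmpty) return rdd.preferredLocs  // 2. RDD's own preference
    rdd.narrowParents.iterator                                // 3. first narrow parent
      .map(preferredLocs)                                     //    with any preference
      .find(_.nonEmpty)
      .getOrElse(Nil)
  }

  def main(args: Array[String]): Unit = {
    val input  = MiniRdd(1, Seq("host-a"))       // e.g. an input RDD with HDFS locality
    val cached = MiniRdd(2, Nil, Seq(input))     // cached on host-b
    val mapped = MiniRdd(3, Nil, Seq(cached))    // narrow child of the cached RDD

    println(preferredLocs(mapped)) // List(host-b): the cache location propagates to the child
  }
}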


When a subsequent Job runs and its root RDD is an RDD that has already been cached, execution enters CacheManager.getOrCompute, which calls BlockManager.get to fetch the cached blocks and then continues with the downstream processing. The source of CacheManager.getOrCompute is as follows:

def getOrCompute[T](
    rdd: RDD[T],
    partition: Partition,
    context: TaskContext,
    storageLevel: StorageLevel): Iterator[T] = {
  // Build the id of the stored block from the RDD id and the partition index
  val key = RDDBlockId(rdd.id, partition.index)
  logDebug(s"Looking for partition $key")
  blockManager.get(key) match {
    /*
     * If this block's data has already been cached, read it from the BlockManager.
     */
    case Some(blockResult) =>
      // Partition is already materialized, so just return its values
      val existingMetrics = context.taskMetrics
        .getInputMetricsForReadMethod(blockResult.readMethod)
      existingMetrics.incBytesRead(blockResult.bytes)

      val iter = blockResult.data.asInstanceOf[Iterator[T]]
      new InterruptibleIterator[T](context, iter) {
        override def next(): T = {
          existingMetrics.incRecordsRead(1)
          delegate.next()
        }
      }
    case None =>
      // Acquire a lock for loading this partition
      // If another thread already holds the lock, wait for it to finish return its results
      /*
       * If this block's data has not been cached, compute the data of this partition
       * and save it to the BlockManager.
       */
      val storedValues = acquireLockForPartition[T](key)
      if (storedValues.isDefined) {
        return new InterruptibleIterator[T](context, storedValues.get)
      }

      // Otherwise, we have to load the partition ourselves
      try {
        logInfo(s"Partition $key not found, computing it")
        // Compute the data of one partition
        val computedValues = rdd.computeOrReadCheckpoint(partition, context)

        // If the task is running locally, do not persist the result
        if (context.isRunningLocally) {
          return computedValues
        }

        // Otherwise, cache the values and keep track of any updates in block statuses
        val updatedBlocks = new ArrayBuffer[(BlockId, BlockStatus)]
        // Cache the data of one partition
        val cachedValues = putInBlockManager(key, computedValues, storageLevel, updatedBlocks)
        val metrics = context.taskMetrics
        val lastUpdatedBlocks = metrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]())
        metrics.updatedBlocks = Some(lastUpdatedBlocks ++ updatedBlocks.toSeq)
        new InterruptibleIterator(context, cachedValues)
      } finally {
        loading.synchronized {
          loading.remove(key)
          loading.notifyAll()
        }
      }
  }
}
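The control flow above boils down to a "check the cache, otherwise let exactly one thread compute and store the value" pattern. The sketch below (not Spark source; the cache map, key type and compute function are simplified stand-ins for BlockManager, RDDBlockId and rdd.computeOrReadCheckpoint) reproduces that pattern, including the loading set and notifyAll handshake that makes concurrent requests for the same block wait for the first computation.

import scala.collection.mutable

object GetOrComputeSketch {
  private val cache   = new mutable.HashMap[String, Seq[Int]]  // stand-in for BlockManager
  private val loading = new mutable.HashSet[String]            // keys currently being computed

  def getOrCompute(key: String)(compute: => Seq[Int]): Seq[Int] = {
    cache.synchronized {
      val hit = cache.get(key)
      if (hit.isDefined) return hit.get              // already cached: short-circuit
    }
    loading.synchronized {
      while (loading.contains(key)) loading.wait()   // another thread is computing this key
      val hit = cache.synchronized(cache.get(key))   // it may have finished while we waited
      if (hit.isDefined) return hit.get
      loading += key                                 // we won the right to compute this key
    }
    try {
      val values = compute                           // analogue of rdd.computeOrReadCheckpoint
      cache.synchronized { cache(key) = values }     // analogue of putInBlockManager
      values
    } finally {
      loading.synchronized {                         // mirror of the finally block above
        loading -= key
        loading.notifyAll()
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val block = getOrCompute("rdd_0_0") { (1 to 5).toSeq }              // computed and cached
    val again = getOrCompute("rdd_0_0") { sys.error("not recomputed") } // served from the cache
    println(block == again) // true
  }
}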







