Spark 2.1 RDD compute process


iterator

/**
 * Internal method to this RDD; will read from cache if applicable, or otherwise compute it.
 * This should ''not'' be called by users directly, but is available for implementors of custom
 * subclasses of RDD.
 */
final def iterator(split: Partition, context: TaskContext): Iterator[T] = {
  if (storageLevel != StorageLevel.NONE) {
    getOrCompute(split, context)
  } else {
    computeOrReadCheckpoint(split, context)
  }
}
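
storageLevel defaults to StorageLevel.NONE and only changes when persist()/cache() is called, so the getOrCompute branch is taken exclusively for persisted RDDs. A minimal standalone sketch of the two paths (app name, master, and value are placeholders):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object IteratorPathDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("iterator-path-demo").setMaster("local[2]"))
    val rdd = sc.parallelize(1 to 10).map(_ * 2)

    // storageLevel == StorageLevel.NONE: iterator() falls through to
    // computeOrReadCheckpoint(), so each action recomputes the partitions.
    rdd.count()

    // After persist(), storageLevel != NONE: iterator() routes through
    // getOrCompute(), which computes and caches on the first action ...
    rdd.persist(StorageLevel.MEMORY_ONLY)
    rdd.count()
    // ... and serves the partitions from the block manager afterwards.
    rdd.count()

    sc.stop()
  }
}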

getOrCompute

private[spark] def getOrCompute(partition: Partition, context: TaskContext): Iterator[T] = {
  val blockId = RDDBlockId(id, partition.index)
  var readCachedBlock = true
  // This method is called on executors, so we need call SparkEnv.get instead of sc.env.
  SparkEnv.get.blockManager.getOrElseUpdate(blockId, storageLevel, elementClassTag, () => {
    readCachedBlock = false
    computeOrReadCheckpoint(partition, context)
  }) match {
    case Left(blockResult) =>
      if (readCachedBlock) {
        val existingMetrics = context.taskMetrics().inputMetrics
        existingMetrics.incBytesRead(blockResult.bytes)
        new InterruptibleIterator[T](context, blockResult.data.asInstanceOf[Iterator[T]]) {
          override def next(): T = {
            existingMetrics.incRecordsRead(1)
            delegate.next()
          }
        }
      } else {
        new InterruptibleIterator(context, blockResult.data.asInstanceOf[Iterator[T]])
      }
    case Right(iter) =>
      new InterruptibleIterator(context, iter.asInstanceOf[Iterator[T]])
  }
}
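
getOrElseUpdate returns an Either: Left(blockResult) when the block can be served from the block manager (a cache hit, or a freshly computed block that was stored successfully), Right(iter) when the computed values could not be cached and must be consumed directly. A toy model of that contract, with hypothetical names, purely to illustrate the three outcomes (this is not the real BlockManager API):

// Toy model of the getOrElseUpdate contract (hypothetical simplification):
//   Left  -> block is available from the cache (hit, or freshly stored)
//   Right -> values were computed but could not be stored; consume directly
def getOrElseUpdateSketch[T](
    cached: Option[Iterator[T]],          // what the block manager already holds
    storeSucceeds: Boolean,               // whether caching the new block works
    makeIterator: () => Iterator[T]): Either[Iterator[T], Iterator[T]] =
  cached match {
    case Some(it)              => Left(it)              // cache hit
    case None if storeSucceeds => Left(makeIterator())  // computed, then cached
    case None                  => Right(makeIterator()) // computed, cache failed
  }

Back in getOrCompute, readCachedBlock is what distinguishes the first case from the second: the update closure flips it to false, so the input metrics (bytes and records read) are only incremented on a genuine cache hit.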

computeOrReadCheckpoint

/**
 * Compute an RDD partition or read it from a checkpoint if the RDD is checkpointing.
 */
private[spark] def computeOrReadCheckpoint(split: Partition, context: TaskContext): Iterator[T] =
{
  if (isCheckpointedAndMaterialized) {
    firstParent[T].iterator(split, context)
  } else {
    compute(split, context)
  }
}
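
Once the RDD has been reliably checkpointed, its lineage has been rewritten so that firstParent is the CheckpointRDD created from the saved files; reading a partition therefore means reading the checkpoint files. A small sketch, runnable in spark-shell where sc is predefined (the checkpoint path is a placeholder):

// In spark-shell, where sc is predefined.
sc.setCheckpointDir("/tmp/spark-checkpoints")      // placeholder path

val rdd = sc.parallelize(1 to 100).map(_ + 1)
rdd.checkpoint()
rdd.count()          // first action: materializes the checkpoint

// The lineage now points at the checkpoint, so computeOrReadCheckpoint()
// delegates to firstParent[T].iterator(...), which reads the saved files.
rdd.dependencies.foreach(dep => println(dep.rdd))  // a ReliableCheckpointRDD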

isCheckpointedAndMaterialized

/**
 * Return whether this RDD is checkpointed and materialized, either reliably or locally.
 * This is introduced as an alias for `isCheckpointed` to clarify the semantics of the
 * return value. Exposed for testing.
 */
private[spark] def isCheckpointedAndMaterialized: Boolean = isCheckpointed

isCheckpointed

def isCheckpointed: Boolean = checkpointData.exists(_.isCheckpointed)

When an RDD is first defined, checkpointData is None:

private[spark] var checkpointData: Option[RDDCheckpointData[T]] = None
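
checkpoint() later fills this field in, but isCheckpointed only flips to true once a job has actually materialized the checkpoint. The transitions are observable from the driver (spark-shell sketch, where sc is predefined; the path is a placeholder):

// In spark-shell, where sc is predefined.
sc.setCheckpointDir("/tmp/spark-checkpoints")   // placeholder path

val rdd = sc.parallelize(1 to 10)
println(rdd.isCheckpointed)  // false: checkpointData is None

rdd.checkpoint()             // checkpointData = Some(ReliableRDDCheckpointData)
println(rdd.isCheckpointed)  // still false: nothing has been written yet

rdd.count()                  // the job completes and doCheckpoint() runs
println(rdd.isCheckpointed)  // true: the checkpoint is materialized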

doCheckpoint

/**
 * Performs the checkpointing of this RDD by saving this. It is called after a job using this RDD
 * has completed (therefore the RDD has been materialized and potentially stored in memory).
 * doCheckpoint() is called recursively on the parent RDDs.
 */
private[spark] def doCheckpoint(): Unit = {
  RDDOperationScope.withScope(sc, "checkpoint", allowNesting = false, ignoreParent = true) {
    if (!doCheckpointCalled) {
      doCheckpointCalled = true
      if (checkpointData.isDefined) {
        if (checkpointAllMarkedAncestors) {
          // TODO We can collect all the RDDs that needs to be checkpointed, and then checkpoint
          // them in parallel.
          // Checkpoint parents first because our lineage will be truncated after we
          // checkpoint ourselves
          dependencies.foreach(_.rdd.doCheckpoint())
        }
        checkpointData.get.checkpoint()
      } else {
        dependencies.foreach(_.rdd.doCheckpoint())
      }
    }
  }
}
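
Note that doCheckpoint() is never called from user code: in Spark 2.1, SparkContext.runJob invokes rdd.doCheckpoint() on the action's target RDD once the job has finished, which is why the checkpoint files for a marked RDD only appear after its first action. The doCheckpointCalled flag guards against doing the work twice when multiple jobs run over the same RDD.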

markCheckpointed

/**
 * Changes the dependencies of this RDD from its original parents to a new RDD (`newRDD`)
 * created from the checkpoint file, and forget its old dependencies and partitions.
 */
private[spark] def markCheckpointed(): Unit = {
  clearDependencies()
  partitions_ = null
  deps = null    // Forget the constructor argument for dependencies too
}
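
markCheckpointed() is what truncates the lineage: RDDCheckpointData.checkpoint() calls it after the new CheckpointRDD has been written, dropping the old dependencies and the cached partitions array. The truncation is visible via toDebugString (spark-shell sketch; the path is a placeholder):

// In spark-shell, where sc is predefined.
sc.setCheckpointDir("/tmp/spark-checkpoints")   // placeholder path

val rdd = sc.parallelize(1 to 10).map(_ + 1).filter(_ % 2 == 0)
println(rdd.toDebugString)   // full lineage: filter <- map <- parallelize

rdd.checkpoint()
rdd.count()
println(rdd.toDebugString)   // truncated: backed by a ReliableCheckpointRDD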

checkpoint

/**
 * Mark this RDD for checkpointing. It will be saved to a file inside the checkpoint
 * directory set with `SparkContext#setCheckpointDir` and all references to its parent
 * RDDs will be removed. This function must be called before any job has been
 * executed on this RDD. It is strongly recommended that this RDD is persisted in
 * memory, otherwise saving it on a file will require recomputation.
 */
def checkpoint(): Unit = RDDCheckpointData.synchronized {
  // NOTE: we use a global lock here due to complexities downstream with ensuring
  // children RDD partitions point to the correct parent partitions. In the future
  // we should revisit this consideration.
  if (context.checkpointDir.isEmpty) {
    throw new SparkException("Checkpoint directory has not been set in the SparkContext")
  } else if (checkpointData.isEmpty) {
    checkpointData = Some(new ReliableRDDCheckpointData(this))
  }
}
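
Putting it all together, a typical usage pattern as a standalone sketch (app name, path, and the expensive function are placeholders). Persisting before checkpointing follows the recommendation in the doc comment above: without it, the lineage is computed once for the job and once more when the checkpoint file is written.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object CheckpointUsageDemo {
  // Stand-in for genuinely expensive work.
  private def expensive(i: Int): Int = { Thread.sleep(1); i * 2 }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("checkpoint-usage").setMaster("local[2]"))
    sc.setCheckpointDir("/tmp/spark-checkpoints") // must be set, or checkpoint() throws

    val rdd = sc.parallelize(1 to 1000).map(expensive)
    rdd.persist(StorageLevel.MEMORY_ONLY) // so the checkpoint write reads the cache
    rdd.checkpoint()                      // must precede any action on this RDD
    rdd.count()                           // materializes both the cache and the checkpoint

    sc.stop()
  }
}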