Spark 2.1: RDDCheckpointData and ReliableRDDCheckpointData


When a job finishes, SparkContext.runJob calls rdd.doCheckpoint() to checkpoint the RDD:

    dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
    progressBar.foreach(_.finishAll())
    rdd.doCheckpoint()
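
RDD.doCheckpoint() runs at most once per RDD: it recursively checkpoints marked ancestors and then delegates to the RDDCheckpointData.checkpoint() method analyzed below. The following is a condensed paraphrase of the Spark 2.1 method (logging and operation-scope handling omitted; check RDD.scala in your version for the exact code):

  // Condensed paraphrase of Spark 2.1 RDD.scala (logging/scope handling omitted)
  private[spark] def doCheckpoint(): Unit = {
    if (!doCheckpointCalled) {
      doCheckpointCalled = true  // checkpoint at most once per RDD
      if (checkpointData.isDefined) {
        if (checkpointAllMarkedAncestors) {
          // Checkpoint marked parents first so their lineage is also truncated
          dependencies.foreach(_.rdd.doCheckpoint())
        }
        checkpointData.get.checkpoint()  // RDDCheckpointData.checkpoint(), analyzed below
      } else {
        // This RDD was not marked: recurse in case an ancestor was
        dependencies.foreach(_.rdd.doCheckpoint())
      }
    }
  }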

RDD.checkpoint()

  /**
   * Mark this RDD for checkpointing. It will be saved to a file inside the checkpoint
   * directory set with `SparkContext#setCheckpointDir` and all references to its parent
   * RDDs will be removed. This function must be called before any job has been
   * executed on this RDD. It is strongly recommended that this RDD is persisted in
   * memory, otherwise saving it on a file will require recomputation.
   */
  def checkpoint(): Unit = RDDCheckpointData.synchronized {
    // NOTE: we use a global lock here due to complexities downstream with ensuring
    // children RDD partitions point to the correct parent partitions. In the future
    // we should revisit this consideration.
    if (context.checkpointDir.isEmpty) {
      throw new SparkException("Checkpoint directory has not been set in the SparkContext")
    } else if (checkpointData.isEmpty) {
      checkpointData = Some(new ReliableRDDCheckpointData(this))
    }
  }
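
A minimal usage sketch (application name, paths, and variable names are illustrative): checkpoint() only marks the RDD; the data is written when the first action completes, as shown above.

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.storage.StorageLevel

    val sc = new SparkContext(new SparkConf().setAppName("checkpoint-demo").setMaster("local[2]"))
    sc.setCheckpointDir("/tmp/spark-checkpoints") // must be set first, or checkpoint() throws

    val data = sc.parallelize(1 to 100).map(_ * 2)
    data.persist(StorageLevel.MEMORY_ONLY) // recommended: otherwise the checkpoint write recomputes the RDD
    data.checkpoint()                      // only marks the RDD; nothing is written yet

    data.count()                           // first action: the job runs, then rdd.doCheckpoint() writes the data
    assert(data.isCheckpointed)            // lineage is now truncated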

CheckpointState

/**
 * Enumeration to manage state transitions of an RDD through checkpointing
 *
 * [ Initialized --{@literal >} checkpointing in progress --{@literal >} checkpointed ]
 */
private[spark] object CheckpointState extends Enumeration {
  type CheckpointState = Value
  val Initialized, CheckpointingInProgress, Checkpointed = Value
}

RDDCheckpointData Definition

/**
 * This class contains all the information related to RDD checkpointing. Each instance of this
 * class is associated with an RDD. It manages process of checkpointing of the associated RDD,
 * as well as, manages the post-checkpoint state by providing the updated partitions,
 * iterator and preferred locations of the checkpointed RDD.
 */
private[spark] abstract class RDDCheckpointData[T: ClassTag](@transient private val rdd: RDD[T])
  extends Serializable {

  import CheckpointState._

  // The checkpoint state of the associated RDD.
  protected var cpState = Initialized

  // The RDD that contains our checkpointed data
  private var cpRDD: Option[CheckpointRDD[T]] = None

isCheckpointed

  // TODO: are we sure we need to use a global lock in the following methods?

  /**
   * Return whether the checkpoint data for this RDD is already persisted.
   */
  def isCheckpointed: Boolean = RDDCheckpointData.synchronized {
    cpState == Checkpointed
  }
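
For context, the public accessor on the RDD side delegates to this state; paraphrased from Spark 2.1 RDD.scala:

  // Paraphrased from Spark 2.1 RDD.scala
  def isCheckpointed: Boolean = isCheckpointedAndMaterialized

  // Kept as a separate alias so local checkpointing can distinguish the two
  private[spark] def isCheckpointedAndMaterialized: Boolean =
    checkpointData.exists(_.isCheckpointed)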

checkpoint

  /**
   * Materialize this RDD and persist its content.
   * This is called immediately after the first action invoked on this RDD has completed.
   */
  final def checkpoint(): Unit = {
    // Guard against multiple threads checkpointing the same RDD by
    // atomically flipping the state of this RDDCheckpointData
    RDDCheckpointData.synchronized {
      if (cpState == Initialized) {
        cpState = CheckpointingInProgress
      } else {
        return
      }
    }

    val newRDD = doCheckpoint()

    // Update our state and truncate the RDD lineage
    RDDCheckpointData.synchronized {
      cpRDD = Some(newRDD)
      cpState = Checkpointed
      rdd.markCheckpointed()
    }
  }
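
The guard above is a general compare-and-flip idiom: take the lock only to claim the state transition, run the expensive work (here doCheckpoint()) outside the lock, then take the lock again to publish the result. A self-contained sketch of the same pattern, not Spark code:

object WorkState extends Enumeration {
  val Initialized, InProgress, Done = Value
}

class OnceOnly {
  import WorkState._
  private var state = Initialized

  def run(work: () => Unit): Unit = {
    // Claim the transition under the lock; every later caller backs off
    val claimed = synchronized {
      if (state == Initialized) { state = InProgress; true } else false
    }
    if (!claimed) return
    work()                        // expensive step runs outside the lock
    synchronized { state = Done } // publish completion
  }
}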

doCheckpoint() is an abstract method; each concrete subclass defines how the data is actually materialized (a paraphrase of the reliable override appears further below, after cpDir is introduced).

  /**
   * Materialize this RDD and persist its content.
   *
   * Subclasses should override this method to define custom checkpointing behavior.
   * @return the checkpoint RDD created in the process.
   */
  protected def doCheckpoint(): CheckpointRDD[T]

checkpointRDD

  /**
   * Return the RDD that contains our checkpointed data.
   * This is only defined if the checkpoint state is `Checkpointed`.
   */
  def checkpointRDD: Option[CheckpointRDD[T]] = RDDCheckpointData.synchronized { cpRDD }
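
This accessor is what truncates the lineage in practice: RDD.partitions consults the checkpoint RDD first and only computes its own partitions when no checkpoint exists. A simplified paraphrase of the Spark 2.1 code (index validation omitted):

  // Simplified paraphrase of Spark 2.1 RDD.scala
  private[spark] def checkpointRDD: Option[CheckpointRDD[T]] = checkpointData.flatMap(_.checkpointRDD)

  final def partitions: Array[Partition] = {
    checkpointRDD.map(_.partitions).getOrElse {
      if (partitions_ == null) {
        partitions_ = getPartitions
      }
      partitions_
    }
  }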

getPartitions

  /**
   * Return the partitions of the resulting checkpoint RDD.
   * For tests only.
   */
  def getPartitions: Array[Partition] = RDDCheckpointData.synchronized {
    cpRDD.map(_.partitions).getOrElse { Array.empty }
  }

ReliableRDDCheckpointData

/**
 * An implementation of checkpointing that writes the RDD data to reliable storage.
 * This allows drivers to be restarted on failure with previously computed state.
 */
private[spark] class ReliableRDDCheckpointData[T: ClassTag](@transient private val rdd: RDD[T])
  extends RDDCheckpointData[T](rdd) with Logging {

checkpoint dir

  // The directory to which the associated RDD has been checkpointed to
  // This is assumed to be a non-local path that points to some reliable storage
  private val cpDir: String =
    ReliableRDDCheckpointData.checkpointPath(rdd.context, rdd.id)
      .map(_.toString)
      .getOrElse { throw new SparkException("Checkpoint dir must be specified.") }
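
The excerpt above omits the concrete doCheckpoint() override. Paraphrased from the Spark 2.1 source (verify against your version), it writes the RDD's partitions into cpDir and optionally registers the files for later cleanup:

  // Paraphrased from Spark 2.1 ReliableRDDCheckpointData.scala
  protected override def doCheckpoint(): CheckpointRDD[T] = {
    // Write every partition of the RDD to files under cpDir
    val newRDD = ReliableCheckpointRDD.writeRDDToCheckpointDirectory(rdd, cpDir)

    // Optionally clean our checkpoint files if the reference is out of scope
    if (rdd.conf.getBoolean("spark.cleaner.referenceTracking.cleanCheckpoints", false)) {
      rdd.context.cleaner.foreach { cleaner =>
        cleaner.registerRDDCheckpointDataForCleanup(newRDD, rdd.id)
      }
    }

    logInfo(s"Done checkpointing RDD ${rdd.id} to $cpDir, new parent is RDD ${newRDD.id}")
    newRDD
  }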

getCheckpointDir

  /**
   * Return the directory to which this RDD was checkpointed.
   * If the RDD is not checkpointed yet, return None.
   */
  def getCheckpointDir: Option[String] = RDDCheckpointData.synchronized {
    if (isCheckpointed) {
      Some(cpDir.toString)
    } else {
      None
    }
  }
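
The user-facing counterpart is RDD.getCheckpointFile, which reaches this method only for reliable checkpoints; paraphrased from Spark 2.1 RDD.scala:

  // Paraphrased from Spark 2.1 RDD.scala
  def getCheckpointFile: Option[String] = checkpointData match {
    case Some(reliable: ReliableRDDCheckpointData[T]) => reliable.getCheckpointDir
    case _ => None  // local checkpoints have no reliable directory
  }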

object ReliableRDDCheckpointData

private[spark] object ReliableRDDCheckpointData extends Logging {

  /** Return the path of the directory to which this RDD's checkpoint data is written. */
  def checkpointPath(sc: SparkContext, rddId: Int): Option[Path] = {
    sc.checkpointDir.map { dir => new Path(dir, s"rdd-$rddId") }
  }

  /** Clean up the files associated with the checkpoint data for this RDD. */
  def cleanCheckpoint(sc: SparkContext, rddId: Int): Unit = {
    checkpointPath(sc, rddId).foreach { path =>
      path.getFileSystem(sc.hadoopConfiguration).delete(path, true)
    }
  }
}
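
cleanCheckpoint is private[spark] and is invoked by the ContextCleaner rather than by user code: when spark.cleaner.referenceTracking.cleanCheckpoints is set to true (it defaults to false), checkpoint files are deleted once the RDD is garbage collected. A sketch of opting in (application name and paths are illustrative):

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("checkpoint-cleanup-demo")
  .setMaster("local[2]")
  .set("spark.cleaner.referenceTracking.cleanCheckpoints", "true") // let the cleaner delete rdd-<id> dirs

val sc = new SparkContext(conf)
sc.setCheckpointDir("hdfs:///tmp/checkpoints") // reliable storage; files land in <dir>/rdd-<id>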