Spark RDD 缓存

来源:互联网 发布:吐槽王pi知乎 编辑:程序博客网 时间:2024/05/04 08:33


/**
 * Various [[StorageLevel]] instances defined and utility functions for creating
 * new storage levels.
 *
 * Each constant below is built by passing four boolean flags, optionally followed
 * by an integer, to the `StorageLevel` constructor.
 *
 * NOTE(review): judging only from the constant names, the positional arguments
 * appear to be (useDisk, useMemory, useOffHeap, deserialized[, replication]) --
 * confirm against the `StorageLevel` class constructor, which is not part of
 * this excerpt.
 */
object StorageLevel {
  val NONE = new StorageLevel(false, false, false, false)
  val DISK_ONLY = new StorageLevel(true, false, false, false)
  val DISK_ONLY_2 = new StorageLevel(true, false, false, false, 2)
  val MEMORY_ONLY = new StorageLevel(false, true, false, true)
  val MEMORY_ONLY_2 = new StorageLevel(false, true, false, true, 2)
  val MEMORY_ONLY_SER = new StorageLevel(false, true, false, false)
  val MEMORY_ONLY_SER_2 = new StorageLevel(false, true, false, false, 2)
  val MEMORY_AND_DISK = new StorageLevel(true, true, false, true)
  val MEMORY_AND_DISK_2 = new StorageLevel(true, true, false, true, 2)
  val MEMORY_AND_DISK_SER = new StorageLevel(true, true, false, false)
  val MEMORY_AND_DISK_SER_2 = new StorageLevel(true, true, false, false, 2)
  // The article truncates the object at this point; the remainder is not shown.
  val OFF_HEAP = new StorageLevel(true, true, true, false, 1)...


  /**
   * Persists this RDD using the default storage level, `MEMORY_ONLY`.
   *
   * Delegates to the `persist` overload that takes an explicit storage level.
   *
   * @return this RDD itself (`this.type`), so calls can be chained
   */
  def persist(): this.type = {
    persist(StorageLevel.MEMORY_ONLY)
  }

  /**
   * Shorthand for the no-argument `persist()`: caches this RDD with the default
   * storage level, `MEMORY_ONLY`.
   *
   * @return this RDD itself (`this.type`), so calls can be chained
   */
  def cache(): this.type = {
    persist()
  }

可以看出,cache() 其实就是以默认的存储级别(`MEMORY_ONLY`)调用 persist() 进行缓存,正如源码注释所说:`/** Persist this RDD with the default storage level (MEMORY_ONLY). */`。也就是说,cache() 只是一个快捷方法,真正起作用的还是 persist();而 persist() 还可以根据需要传入指定的 StorageLevel 来进行缓存。

/**
 * Set this RDD's storage level to persist its values across operations after the first time
 * it is computed. This can only be used to assign a new storage level if the RDD does not
 * have a storage level set yet. Local checkpointing is an exception.
 *
 * @param newLevel the storage level requested by the caller
 * @return this RDD itself (`this.type`), so calls can be chained
 */
def persist(newLevel: StorageLevel): this.type = {
  if (isLocallyCheckpointed) {
    // This means the user previously called localCheckpoint(), which should have already
    // marked this RDD for persisting. Here we should override the old storage level with
    // one that is explicitly requested by the user (after adapting it to use disk).
    persist(LocalRDDCheckpointData.transformStorageLevel(newLevel), allowOverride = true)
  } else {
    // Normal path: overriding an already-assigned storage level is not allowed.
    persist(newLevel, allowOverride = false)
  }
}


scala> val rd1=sc.makeRDD((1 to 20),4)
rd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[10] at makeRDD at <console>:24

scala> val rd2=rd1.map(f=>(f,f*f))
rd2: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[12] at map at <console>:26

scala> rd2.cache
res13: rd2.type = MapPartitionsRDD[12] at map at <console>:26

scala> rd2.collect
res10: Array[(Int, Int)] = Array((1,1), (2,4), (3,9), (4,16), (5,25), (6,36), (7,49), (8,64), (9,81), (10,100), (11,121), (12,144), (13,169), (14,196), (15,225), (16,256), (17,289), (18,324), (19,361), (20,400))

scala> val rd3=rd2.map(f=>(f._1+f._2))
rd3: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[14] at map at <console>:28

scala> rd3.collect
res12: Array[Int] = Array(2, 6, 12, 20, 30, 42, 56, 72, 90, 110, 132, 156, 182, 210, 240, 272, 306, 342, 380, 420)


  /**
   * Internal method to this RDD; will read from cache if applicable, or otherwise compute it.
   * This should ''not'' be called by users directly, but is available for implementors of custom
   * subclasses of RDD.
   *
   * When a storage level other than `StorageLevel.NONE` is set, the partition is obtained
   * through `getOrCompute`; otherwise it goes straight to `computeOrReadCheckpoint`.
   *
   * @param split   the partition whose elements are requested
   * @param context the context of the task asking for the partition
   * @return an iterator over the elements of the partition
   */
  final def iterator(split: Partition, context: TaskContext): Iterator[T] = {
    if (storageLevel != StorageLevel.NONE) {
      getOrCompute(split, context)
    } else {
      computeOrReadCheckpoint(split, context)
    }
  }

  /**
   * Gets the given partition from the block manager, or computes it (via
   * `computeOrReadCheckpoint`) when the block manager invokes the supplied thunk.
   * Called by `iterator` when this RDD has a storage level other than `NONE`.
   *
   * @param partition the partition to fetch or compute
   * @param context   the context of the running task; its input metrics are updated
   *                  when the thunk was never invoked (i.e. the block was already there)
   * @return an interruptible iterator over the elements of the partition
   */
  private[spark] def getOrCompute(partition: Partition, context: TaskContext): Iterator[T] = {
    val blockId = RDDBlockId(id, partition.index)
    // Flipped to false by the thunk below, so it stays true only when nothing was computed.
    var readCachedBlock = true
    // This method is called on executors, so we need call SparkEnv.get instead of sc.env.
    SparkEnv.get.blockManager.getOrElseUpdate(blockId, storageLevel, elementClassTag, () => {
      readCachedBlock = false
      computeOrReadCheckpoint(partition, context)
    }) match {
      // NOTE(review): Left presumably carries a block served by the block manager and Right
      // the raw computed iterator when the block could not be stored -- confirm against
      // BlockManager.getOrElseUpdate.
      case Left(blockResult) =>
        if (readCachedBlock) {
          val existingMetrics = context.taskMetrics().inputMetrics
          existingMetrics.incBytesRead(blockResult.bytes)
          new InterruptibleIterator[T](context, blockResult.data.asInstanceOf[Iterator[T]]) {
            override def next(): T = {
              // Count every record handed out, then defer to the wrapped iterator.
              existingMetrics.incRecordsRead(1)
              delegate.next()
            }
          }
        } else {
          new InterruptibleIterator(context, blockResult.data.asInstanceOf[Iterator[T]])
        }
      case Right(iter) =>
        new InterruptibleIterator(context, iter.asInstanceOf[Iterator[T]])
    }
  }