Spark's StorageLevel, annotated

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.storage

import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput}
import java.util.concurrent.ConcurrentHashMap

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.memory.MemoryMode
import org.apache.spark.util.Utils

/**
 * :: DeveloperApi ::
 * Flags for controlling the storage of an RDD. Each StorageLevel records whether to use memory,
 * or ExternalBlockStore, whether to drop the RDD to disk if it falls out of memory or
 * ExternalBlockStore, whether to keep the data in memory in a serialized format, and whether
 * to replicate the RDD partitions on multiple nodes.
 *
 * The [[org.apache.spark.storage.StorageLevel$]] singleton object contains some static constants
 * for commonly useful storage levels. To create your own storage level object, use the
 * factory method of the singleton object (`StorageLevel(...)`).
 */
// The primary constructor is private, so instances must be created through the
// auxiliary constructors or the companion object's factory methods.
@DeveloperApi
class StorageLevel private(
    private var _useDisk: Boolean,
    private var _useMemory: Boolean,
    private var _useOffHeap: Boolean,
    private var _deserialized: Boolean,
    private var _replication: Int = 1)
  extends Externalizable {

  // TODO: Also add fields for caching priority, dataset ID, and flushing.
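  // The five constructor parameters, in order: spill to disk, cache in memory, use
  // off-heap memory, keep the in-memory copy deserialized, and the replica count
  // (default 1). The getters below expose them read-only.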
  // Reconstruct the four flags from their integer encoding (see toInt below).
  private def this(flags: Int, replication: Int) {
    this((flags & 8) != 0, (flags & 4) != 0, (flags & 2) != 0, (flags & 1) != 0, replication)
  }

  // Auxiliary constructor.
  def this() = this(false, true, false, false)  // For deserialization

  def useDisk: Boolean = _useDisk
  def useMemory: Boolean = _useMemory
  def useOffHeap: Boolean = _useOffHeap
  def deserialized: Boolean = _deserialized
  def replication: Int = _replication

  // The replication count must be less than 40 so that hash codes stay unambiguous.
  assert(replication < 40, "Replication restricted to be less than 40 for calculating hash codes")

  // Off-heap storage does not support keeping data deserialized.
  if (useOffHeap) {
    require(!deserialized, "Off-heap storage level does not support deserialized storage")
  }

  // Getter for the MemoryMode, accessible within the spark package; indicates
  // whether the data lives off-heap or on-heap.
  private[spark] def memoryMode: MemoryMode = {
    if (useOffHeap) MemoryMode.OFF_HEAP
    else MemoryMode.ON_HEAP
  }

  override def clone(): StorageLevel = {
    new StorageLevel(useDisk, useMemory, useOffHeap, deserialized, replication)
  }

  // Two StorageLevel instances are equal when all four flags and the replication count match.
  override def equals(other: Any): Boolean = other match {
    case s: StorageLevel =>
      s.useDisk == useDisk &&
      s.useMemory == useMemory &&
      s.useOffHeap == useOffHeap &&
      s.deserialized == deserialized &&
      s.replication == replication
    case _ =>
      false
  }

  // A StorageLevel is valid only if it caches to memory or disk and has at least one replica.
  def isValid: Boolean = (useMemory || useDisk) && (replication > 0)

  // Convert this storage level to its integer representation: each flag maps to one
  // binary digit, so a level with only _useDisk set yields binary 1000 (decimal 8).
  def toInt: Int = {
    var ret = 0
    if (_useDisk) {
      ret |= 8
    }
    if (_useMemory) {
      ret |= 4
    }
    if (_useOffHeap) {
      ret |= 2
    }
    if (_deserialized) {
      ret |= 1
    }
    ret
  }

  // Invoked when the object is serialized.
  override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
    out.writeByte(toInt)
    out.writeByte(_replication)
  }

  // Invoked when the object is deserialized.
  override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException {
    val flags = in.readByte()
    _useDisk = (flags & 8) != 0
    _useMemory = (flags & 4) != 0
    _useOffHeap = (flags & 2) != 0
    _deserialized = (flags & 1) != 0
    _replication = in.readByte()
  }

  @throws(classOf[IOException])
  private def readResolve(): Object = StorageLevel.getCachedStorageLevel(this)

  override def toString: String = {
    val disk = if (useDisk) "disk" else ""
    val memory = if (useMemory) "memory" else ""
    val heap = if (useOffHeap) "offheap" else ""
    val deserialize = if (deserialized) "deserialized" else ""
    val output =
      Seq(disk, memory, heap, deserialize, s"$replication replicas").filter(_.nonEmpty)
    s"StorageLevel(${output.mkString(", ")})"
  }

  // The hash code is the flag bits (toInt) multiplied by 41, plus the replication count.
  // If replication were allowed to reach 41 or beyond, the flag bits could no longer be
  // recovered from the hash code, and storage levels with different flags could share
  // the same hash code.
  override def hashCode(): Int = toInt * 41 + replication

  def description: String = {
    var result = ""
    result += (if (useDisk) "Disk " else "")
    if (useMemory) {
      result += (if (useOffHeap) "Memory (off heap) " else "Memory ")
    }
    result += (if (deserialized) "Deserialized " else "Serialized ")
    result += s"${replication}x Replicated"
    result
  }
}
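To make the bit layout concrete: toInt packs the four flags into bits 3..0 (disk = 8, memory = 4, off-heap = 2, deserialized = 1), and hashCode multiplies that integer by 41 before adding the replication count, so both pieces stay recoverable while replication is below 41. A small sketch of what follows from the code above:

  val level = StorageLevel.MEMORY_AND_DISK_SER_2  // disk + memory, serialized, 2 replicas
  assert(level.toInt == (8 | 4))                  // 12, binary 1100
  assert(level.hashCode == 12 * 41 + 2)           // flags and replication cannot collide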
/**
 * Various [[org.apache.spark.storage.StorageLevel]] constants defined, plus utility functions
 * for creating new storage levels.
 */
// Companion object for StorageLevel; it plays the role of Java's static members and methods.
object StorageLevel {
  val NONE = new StorageLevel(false, false, false, false)
  val DISK_ONLY = new StorageLevel(true, false, false, false)
  val DISK_ONLY_2 = new StorageLevel(true, false, false, false, 2)
  val MEMORY_ONLY = new StorageLevel(false, true, false, true)
  val MEMORY_ONLY_2 = new StorageLevel(false, true, false, true, 2)
  val MEMORY_ONLY_SER = new StorageLevel(false, true, false, false)
  val MEMORY_ONLY_SER_2 = new StorageLevel(false, true, false, false, 2)
  val MEMORY_AND_DISK = new StorageLevel(true, true, false, true)
  val MEMORY_AND_DISK_2 = new StorageLevel(true, true, false, true, 2)
  val MEMORY_AND_DISK_SER = new StorageLevel(true, true, false, false)
  val MEMORY_AND_DISK_SER_2 = new StorageLevel(true, true, false, false, 2)
  val OFF_HEAP = new StorageLevel(true, true, true, false, 1)

  /**
   * :: DeveloperApi ::
   * Return the StorageLevel object with the specified name.
   */
  @DeveloperApi
  def fromString(s: String): StorageLevel = s match {
    case "NONE" => NONE
    case "DISK_ONLY" => DISK_ONLY
    case "DISK_ONLY_2" => DISK_ONLY_2
    case "MEMORY_ONLY" => MEMORY_ONLY
    case "MEMORY_ONLY_2" => MEMORY_ONLY_2
    case "MEMORY_ONLY_SER" => MEMORY_ONLY_SER
    case "MEMORY_ONLY_SER_2" => MEMORY_ONLY_SER_2
    case "MEMORY_AND_DISK" => MEMORY_AND_DISK
    case "MEMORY_AND_DISK_2" => MEMORY_AND_DISK_2
    case "MEMORY_AND_DISK_SER" => MEMORY_AND_DISK_SER
    case "MEMORY_AND_DISK_SER_2" => MEMORY_AND_DISK_SER_2
    case "OFF_HEAP" => OFF_HEAP
    case _ => throw new IllegalArgumentException(s"Invalid StorageLevel: $s")
  }

  /**
   * :: DeveloperApi ::
   * Create a new StorageLevel object.
   */
  @DeveloperApi
  def apply(
      useDisk: Boolean,
      useMemory: Boolean,
      useOffHeap: Boolean,
      deserialized: Boolean,
      replication: Int): StorageLevel = {
    getCachedStorageLevel(
      new StorageLevel(useDisk, useMemory, useOffHeap, deserialized, replication))
  }

  /**
   * :: DeveloperApi ::
   * Create a new StorageLevel object without setting useOffHeap.
   */
  @DeveloperApi
  def apply(
      useDisk: Boolean,
      useMemory: Boolean,
      deserialized: Boolean,
      replication: Int = 1): StorageLevel = {
    getCachedStorageLevel(new StorageLevel(useDisk, useMemory, false, deserialized, replication))
  }

  /**
   * :: DeveloperApi ::
   * Create a new StorageLevel object from its integer representation.
   */
  @DeveloperApi
  def apply(flags: Int, replication: Int): StorageLevel = {
    getCachedStorageLevel(new StorageLevel(flags, replication))
  }

  /**
   * :: DeveloperApi ::
   * Read StorageLevel object from ObjectInput stream.
   */
  @DeveloperApi
  def apply(in: ObjectInput): StorageLevel = {
    val obj = new StorageLevel()
    obj.readExternal(in)
    getCachedStorageLevel(obj)
  }

  // A ConcurrentHashMap, storageLevelCache, that interns storage levels; both its keys
  // and its values are of type StorageLevel.
  private[spark] val storageLevelCache = new ConcurrentHashMap[StorageLevel, StorageLevel]()

  // Cache the storage level if it is not already present, then return the cached instance.
  private[spark] def getCachedStorageLevel(level: StorageLevel): StorageLevel = {
    storageLevelCache.putIfAbsent(level, level)
    storageLevelCache.get(level)
  }
}
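Putting it together: user code normally picks one of the predefined constants and passes it to RDD.persist, or builds an equivalent level through the apply factory, which interns it in storageLevelCache. A minimal usage sketch, assuming an existing SparkContext named sc (the value names are illustrative):

  import org.apache.spark.storage.StorageLevel

  val rdd = sc.parallelize(1 to 1000)
  // Cache serialized in memory and spill to disk when memory runs out.
  rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

  // The same flags with 2 replicas, built through the factory; interned levels
  // compare equal to the predefined constant.
  val custom = StorageLevel(useDisk = true, useMemory = true, useOffHeap = false,
    deserialized = false, replication = 2)
  assert(custom == StorageLevel.MEMORY_AND_DISK_SER_2)

  // Parse a level name coming from configuration.
  assert(StorageLevel.fromString("MEMORY_ONLY_2") == StorageLevel.MEMORY_ONLY_2)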
