Spark 2.1 Aggregator


Aggregator

/**
 * :: DeveloperApi ::
 * A set of functions used to aggregate data.
 *
 * @param createCombiner function to create the initial value of the aggregation.
 * @param mergeValue function to merge a new value into the aggregation result.
 * @param mergeCombiners function to merge outputs from multiple mergeValue function.
 */
@DeveloperApi
case class Aggregator[K, V, C] (
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C) {
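
As a minimal usage sketch (not taken from the Spark source; the name countAgg and the choice of types are illustrative), an Aggregator that counts occurrences per key can be built with V = Int and C = Int:

import org.apache.spark.Aggregator

// Minimal sketch: count occurrences per key. This only illustrates the three
// functions of the case class above; it is not code from Spark itself.
val countAgg = new Aggregator[String, Int, Int](
  createCombiner = (v: Int) => v,                  // first value seeds the combiner
  mergeValue = (c: Int, v: Int) => c + v,          // fold a new value into the running count
  mergeCombiners = (c1: Int, c2: Int) => c1 + c2)  // merge partial counts from different tasks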

combineValuesByKey

  def combineValuesByKey(
      iter: Iterator[_ <: Product2[K, V]],
      context: TaskContext): Iterator[(K, C)] = {
    val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners)
    combiners.insertAll(iter)
    updateMetrics(context, combiners)
    combiners.iterator
  }
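
A hedged sketch of how this is driven on the map side (the helper below, mapSideCombine, is hypothetical and not Spark's own code): map-side combining feeds a partition's raw (K, V) records to combineValuesByKey. It has to run inside a task, because the ExternalAppendOnlyMap created above refuses to be instantiated when TaskContext.get() returns null:

import org.apache.spark.{Aggregator, TaskContext}

// Hypothetical helper, for illustration only: run map-side combining over one partition.
// Must be called from within a running task, where TaskContext.get() is non-null.
def mapSideCombine[K, V, C](
    agg: Aggregator[K, V, C],
    records: Iterator[(K, V)]): Iterator[(K, C)] = {
  val context = TaskContext.get()
  agg.combineValuesByKey(records, context)
}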

combineCombinersByKey

  def combineCombinersByKey(
      iter: Iterator[_ <: Product2[K, C]],
      context: TaskContext): Iterator[(K, C)] = {
    val combiners = new ExternalAppendOnlyMap[K, C, C](identity, mergeCombiners, mergeCombiners)
    combiners.insertAll(iter)
    updateMetrics(context, combiners)
    combiners.iterator
  }
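
On the reduce side the inputs are already map-side combiners, which is why identity is passed as createCombiner and mergeCombiners does all of the merging. A hypothetical sketch of the call shape, mirroring the map-side helper above:

import org.apache.spark.{Aggregator, TaskContext}

// Hypothetical helper, for illustration only: merge already-combined (K, C) records
// that arrive from the shuffle, again from within a running task.
def reduceSideCombine[K, V, C](
    agg: Aggregator[K, V, C],
    partials: Iterator[(K, C)]): Iterator[(K, C)] = {
  agg.combineCombinersByKey(partials, TaskContext.get())
}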

updateMetrics

  /** Update task metrics after populating the external map. */
  private def updateMetrics(context: TaskContext, map: ExternalAppendOnlyMap[_, _, _]): Unit = {
    Option(context).foreach { c =>
      c.taskMetrics().incMemoryBytesSpilled(map.memoryBytesSpilled)
      c.taskMetrics().incDiskBytesSpilled(map.diskBytesSpilled)
      c.taskMetrics().incPeakExecutionMemory(map.peakMemoryUsedBytes)
    }
  }

ExternalAppendOnlyMap

/**
 * :: DeveloperApi ::
 * An append-only map that spills sorted content to disk when there is insufficient space for it
 * to grow.
 *
 * This map takes two passes over the data:
 *
 *   (1) Values are merged into combiners, which are sorted and spilled to disk as necessary
 *   (2) Combiners are read from disk and merged together
 *
 * The setting of the spill threshold faces the following trade-off: If the spill threshold is
 * too high, the in-memory map may occupy more memory than is available, resulting in OOM.
 * However, if the spill threshold is too low, we spill frequently and incur unnecessary disk
 * writes. This may lead to a performance regression compared to the normal case of using the
 * non-spilling AppendOnlyMap.
 */
@DeveloperApi
class ExternalAppendOnlyMap[K, V, C](
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    serializer: Serializer = SparkEnv.get.serializer,
    blockManager: BlockManager = SparkEnv.get.blockManager,
    context: TaskContext = TaskContext.get(),
    serializerManager: SerializerManager = SparkEnv.get.serializerManager)
  extends Spillable[SizeTracker](context.taskMemoryManager())
  with Serializable
  with Logging
  with Iterable[(K, C)] {

fields and constructors

  if (context == null) {
    throw new IllegalStateException(
      "Spillable collections should not be instantiated outside of tasks")
  }

  // Backwards-compatibility constructor for binary compatibility
  def this(
      createCombiner: V => C,
      mergeValue: (C, V) => C,
      mergeCombiners: (C, C) => C,
      serializer: Serializer,
      blockManager: BlockManager) {
    this(createCombiner, mergeValue, mergeCombiners, serializer, blockManager, TaskContext.get())
  }

  @volatile private var currentMap = new SizeTrackingAppendOnlyMap[K, C]
  private val spilledMaps = new ArrayBuffer[DiskMapIterator]
  private val sparkConf = SparkEnv.get.conf
  private val diskBlockManager = blockManager.diskBlockManager

  /**
   * Size of object batches when reading/writing from serializers.
   *
   * Objects are written in batches, with each batch using its own serialization stream. This
   * cuts down on the size of reference-tracking maps constructed when deserializing a stream.
   *
   * NOTE: Setting this too low can cause excessive copying when serializing, since some serializers
   * grow internal data structures by growing + copying every time the number of objects doubles.
   */
  private val serializerBatchSize = sparkConf.getLong("spark.shuffle.spill.batchSize", 10000)

  // Number of bytes spilled in total
  private var _diskBytesSpilled = 0L
  def diskBytesSpilled: Long = _diskBytesSpilled

  // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided
  private val fileBufferSize =
    sparkConf.getSizeAsKb("spark.shuffle.file.buffer", "32k").toInt * 1024

  // Write metrics
  private val writeMetrics: ShuffleWriteMetrics = new ShuffleWriteMetrics()

  // Peak size of the in-memory map observed so far, in bytes
  private var _peakMemoryUsedBytes: Long = 0L
  def peakMemoryUsedBytes: Long = _peakMemoryUsedBytes

  private val keyComparator = new HashComparator[K]
  private val ser = serializer.newInstance()

  @volatile private var readingIterator: SpillableIterator = null

  /**
   * Number of files this map has spilled so far.
   * Exposed for testing.
   */
  private[collection] def numSpills: Int = spilledMaps.size
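
The two configuration keys read in this block can be set on the SparkConf when the application is built; a small sketch with purely illustrative values (not tuning recommendations):

import org.apache.spark.SparkConf

// Illustrative values only: the serialization batch size used when spilling,
// and the spill-file writer buffer size read via getSizeAsKb above.
val conf = new SparkConf()
  .set("spark.shuffle.spill.batchSize", "10000")
  .set("spark.shuffle.file.buffer", "64k")

insertAll
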
  /**
   * Insert the given key and value into the map.
   */
  def insert(key: K, value: V): Unit = {
    insertAll(Iterator((key, value)))
  }

  /**
   * Insert the given iterator of keys and values into the map.
   *
   * When the underlying map needs to grow, check if the global pool of shuffle memory has
   * enough room for this to happen. If so, allocate the memory required to grow the map;
   * otherwise, spill the in-memory map to disk.
   *
   * The shuffle memory usage of the first trackMemoryThreshold entries is not tracked.
   */
  def insertAll(entries: Iterator[Product2[K, V]]): Unit = {
    if (currentMap == null) {
      throw new IllegalStateException(
        "Cannot insert new elements into a map after calling iterator")
    }
    // An update function for the map that we reuse across entries to avoid allocating
    // a new closure each time
    var curEntry: Product2[K, V] = null
    val update: (Boolean, C) => C = (hadVal, oldVal) => {
      if (hadVal) mergeValue(oldVal, curEntry._2) else createCombiner(curEntry._2)
    }
    while (entries.hasNext) {
      curEntry = entries.next()
      val estimatedSize = currentMap.estimateSize()
      if (estimatedSize > _peakMemoryUsedBytes) {
        _peakMemoryUsedBytes = estimatedSize
      }
      if (maybeSpill(currentMap, estimatedSize)) {
        currentMap = new SizeTrackingAppendOnlyMap[K, C]
      }
      currentMap.changeValue(curEntry._1, update)
      addElementsRead()
    }
  }

  /**
   * Insert the given iterable of keys and values into the map.
   *
   * When the underlying map needs to grow, check if the global pool of shuffle memory has
   * enough room for this to happen. If so, allocate the memory required to grow the map;
   * otherwise, spill the in-memory map to disk.
   *
   * The shuffle memory usage of the first trackMemoryThreshold entries is not tracked.
   */
  def insertAll(entries: Iterable[Product2[K, V]]): Unit = {
    insertAll(entries.iterator)
  }
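
The reusable update closure is the core of insertAll: when a key is seen for the first time, createCombiner builds its combiner from the value; on later occurrences, mergeValue folds the value into the existing combiner. A standalone simulation of that behaviour using an ordinary mutable map instead of SizeTrackingAppendOnlyMap (purely illustrative; simulateInsertAll is not Spark code):

import scala.collection.mutable

// Simulate insertAll's update closure with a plain HashMap:
// first occurrence of a key -> createCombiner, later occurrences -> mergeValue.
def simulateInsertAll[K, V, C](
    entries: Iterator[(K, V)],
    createCombiner: V => C,
    mergeValue: (C, V) => C): mutable.Map[K, C] = {
  val m = mutable.Map.empty[K, C]
  entries.foreach { case (k, v) =>
    m(k) = m.get(k) match {
      case Some(c) => mergeValue(c, v)   // hadVal == true
      case None    => createCombiner(v)  // hadVal == false
    }
  }
  m
}

// e.g. simulateInsertAll(Iterator("a" -> 1, "a" -> 2, "b" -> 3), (v: Int) => v, (c: Int, v: Int) => c + v)
// yields Map("a" -> 3, "b" -> 3)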

maybeSpill

  /**
   * Spills the current in-memory collection to disk if needed. Attempts to acquire more
   * memory before spilling.
   *
   * @param collection collection to spill to disk
   * @param currentMemory estimated size of the collection in bytes
   * @return true if `collection` was spilled to disk; false otherwise
   */
  protected def maybeSpill(collection: C, currentMemory: Long): Boolean = {
    var shouldSpill = false
    if (elementsRead % 32 == 0 && currentMemory >= myMemoryThreshold) {
      // Claim up to double our current memory from the shuffle memory pool
      val amountToRequest = 2 * currentMemory - myMemoryThreshold
      val granted = acquireMemory(amountToRequest)
      myMemoryThreshold += granted
      // If we were granted too little memory to grow further (either tryToAcquire returned 0,
      // or we already had more memory than myMemoryThreshold), spill the current collection
      shouldSpill = currentMemory >= myMemoryThreshold
    }
    shouldSpill = shouldSpill || _elementsRead > numElementsForceSpillThreshold
    // Actually spill
    if (shouldSpill) {
      _spillCount += 1
      logSpillage(currentMemory)
      spill(collection)
      _elementsRead = 0
      _memoryBytesSpilled += currentMemory
      releaseMemory()
    }
    shouldSpill
  }
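
maybeSpill is inherited from the Spillable base class that ExternalAppendOnlyMap extends, so C here is Spillable's collection type parameter rather than the combiner type. A worked example of the doubling arithmetic, with illustrative numbers and assuming the default 5 MB initial threshold (spark.shuffle.spill.initialMemoryThreshold) is unchanged:

// Illustrative numbers only.
val currentMemory     = 12L * 1024 * 1024  // estimated size of the in-memory map: 12 MB
val myMemoryThreshold =  5L * 1024 * 1024  // memory claimed so far: the 5 MB initial threshold
val amountToRequest   = 2 * currentMemory - myMemoryThreshold  // request 2 * 12 - 5 = 19 MB more

// If the full 19 MB is granted, the threshold grows to 24 MB and nothing spills,
// since currentMemory (12 MB) < 24 MB. If the memory manager grants only 2 MB,
// the threshold grows to 7 MB and the collection spills, because 12 MB >= 7 MB.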