Spark 2.1 Aggregator
Aggregator

Aggregator (org.apache.spark.Aggregator) bundles the three functions a combineByKey-style shuffle needs in order to merge values into per-key combiners:
/**
 * :: DeveloperApi ::
 * A set of functions used to aggregate data.
 *
 * @param createCombiner function to create the initial value of the aggregation.
 * @param mergeValue function to merge a new value into the aggregation result.
 * @param mergeCombiners function to merge outputs from multiple mergeValue function.
 */
@DeveloperApi
case class Aggregator[K, V, C] (
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C) {
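Before reading the methods, it helps to see what the three constructor functions do. Below is a minimal, self-contained sketch (not from the Spark source; all names and types are illustrative) that uses a (sum, count) pair as the combiner type C to compute per-key averages:

// Illustrative sketch: K = String, V = Int, C = (Int, Int), i.e. (sum, count)
val createCombiner: Int => (Int, Int) = v => (v, 1)
val mergeValue: ((Int, Int), Int) => (Int, Int) =
  (c, v) => (c._1 + v, c._2 + 1)
val mergeCombiners: ((Int, Int), (Int, Int)) => (Int, Int) =
  (c1, c2) => (c1._1 + c2._1, c1._2 + c2._2)

// Folding ("a", 1) and ("a", 3) with mergeValue yields ("a", (4, 2));
// partial results from different partitions are merged with mergeCombiners.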
combineValuesByKey

combineValuesByKey merges raw values of type V into combiners of type C. It runs on the map side when map-side combine is enabled, and on the reduce side otherwise:
def combineValuesByKey(
    iter: Iterator[_ <: Product2[K, V]],
    context: TaskContext): Iterator[(K, C)] = {
  val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners)
  combiners.insertAll(iter)
  updateMetrics(context, combiners)
  combiners.iterator
}
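Ignoring spilling and metrics, the semantics of this method reduce to a hash-map fold. A self-contained sketch (not the Spark implementation, which uses ExternalAppendOnlyMap so it can spill sorted runs to disk):

import scala.collection.mutable

// What combineValuesByKey computes, minus spilling: fold each (K, V)
// record into a per-key combiner using createCombiner / mergeValue.
def combineValuesSketch[K, V, C](
    iter: Iterator[(K, V)],
    createCombiner: V => C,
    mergeValue: (C, V) => C): Iterator[(K, C)] = {
  val combiners = mutable.Map.empty[K, C]
  iter.foreach { case (k, v) =>
    combiners(k) = combiners.get(k) match {
      case Some(c) => mergeValue(c, v)
      case None    => createCombiner(v)
    }
  }
  combiners.iterator
}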
combineCombinersByKey

combineCombinersByKey runs on the reduce side when map-side combine has already produced partial combiners. Since the inputs are already of type C, createCombiner degenerates to identity and mergeCombiners is used for both merge positions:
def combineCombinersByKey(
    iter: Iterator[_ <: Product2[K, C]],
    context: TaskContext): Iterator[(K, C)] = {
  val combiners = new ExternalAppendOnlyMap[K, C, C](identity, mergeCombiners, mergeCombiners)
  combiners.insertAll(iter)
  updateMetrics(context, combiners)
  combiners.iterator
}
updateMetrics

Both methods finish by folding the external map's spill and peak-memory statistics into the task metrics (context may be null in tests, hence the Option wrapper):
/** Update task metrics after populating the external map. */
private def updateMetrics(context: TaskContext, map: ExternalAppendOnlyMap[_, _, _]): Unit = {
  Option(context).foreach { c =>
    c.taskMetrics().incMemoryBytesSpilled(map.memoryBytesSpilled)
    c.taskMetrics().incDiskBytesSpilled(map.diskBytesSpilled)
    c.taskMetrics().incPeakExecutionMemory(map.peakMemoryUsedBytes)
  }
}
ExternalAppendOnlyMap

Both code paths delegate the heavy lifting to ExternalAppendOnlyMap in org.apache.spark.util.collection:
/**
 * :: DeveloperApi ::
 * An append-only map that spills sorted content to disk when there is insufficient space for it
 * to grow.
 *
 * This map takes two passes over the data:
 *
 *   (1) Values are merged into combiners, which are sorted and spilled to disk as necessary
 *   (2) Combiners are read from disk and merged together
 *
 * The setting of the spill threshold faces the following trade-off: If the spill threshold is
 * too high, the in-memory map may occupy more memory than is available, resulting in OOM.
 * However, if the spill threshold is too low, we spill frequently and incur unnecessary disk
 * writes. This may lead to a performance regression compared to the normal case of using the
 * non-spilling AppendOnlyMap.
 */
@DeveloperApi
class ExternalAppendOnlyMap[K, V, C](
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    serializer: Serializer = SparkEnv.get.serializer,
    blockManager: BlockManager = SparkEnv.get.blockManager,
    context: TaskContext = TaskContext.get(),
    serializerManager: SerializerManager = SparkEnv.get.serializerManager)
  extends Spillable[SizeTracker](context.taskMemoryManager())
  with Serializable
  with Logging
  with Iterable[(K, C)] {
fields and constructors

The class first guards against instantiation outside of a task, keeps an extra constructor for binary compatibility, and then sets up the in-memory map, spill bookkeeping, and configuration-driven buffer sizes:
if (context == null) {
  throw new IllegalStateException(
    "Spillable collections should not be instantiated outside of tasks")
}

// Backwards-compatibility constructor for binary compatibility
def this(
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    serializer: Serializer,
    blockManager: BlockManager) {
  this(createCombiner, mergeValue, mergeCombiners, serializer, blockManager, TaskContext.get())
}

@volatile private var currentMap = new SizeTrackingAppendOnlyMap[K, C]
private val spilledMaps = new ArrayBuffer[DiskMapIterator]
private val sparkConf = SparkEnv.get.conf
private val diskBlockManager = blockManager.diskBlockManager

/**
 * Size of object batches when reading/writing from serializers.
 *
 * Objects are written in batches, with each batch using its own serialization stream. This
 * cuts down on the size of reference-tracking maps constructed when deserializing a stream.
 *
 * NOTE: Setting this too low can cause excessive copying when serializing, since some serializers
 * grow internal data structures by growing + copying every time the number of objects doubles.
 */
private val serializerBatchSize = sparkConf.getLong("spark.shuffle.spill.batchSize", 10000)

// Number of bytes spilled in total
private var _diskBytesSpilled = 0L
def diskBytesSpilled: Long = _diskBytesSpilled

// Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided
private val fileBufferSize =
  sparkConf.getSizeAsKb("spark.shuffle.file.buffer", "32k").toInt * 1024

// Write metrics
private val writeMetrics: ShuffleWriteMetrics = new ShuffleWriteMetrics()

// Peak size of the in-memory map observed so far, in bytes
private var _peakMemoryUsedBytes: Long = 0L
def peakMemoryUsedBytes: Long = _peakMemoryUsedBytes

private val keyComparator = new HashComparator[K]
private val ser = serializer.newInstance()

@volatile private var readingIterator: SpillableIterator = null

/**
 * Number of files this map has spilled so far.
 * Exposed for testing.
 */
private[collection] def numSpills: Int = spilledMaps.size
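Both configuration keys above are ordinary SparkConf entries, so they can be tuned at submission time. A minimal sketch (the values are illustrative, not recommendations):

import org.apache.spark.SparkConf

// Illustrative tuning sketch; both keys appear in the fields above.
val conf = new SparkConf()
  .set("spark.shuffle.spill.batchSize", "10000") // objects per serialization batch
  .set("spark.shuffle.file.buffer", "64k")       // per-file write buffer (size string)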
/**
 * Insert the given key and value into the map.
 */
def insert(key: K, value: V): Unit = {
  insertAll(Iterator((key, value)))
}

/**
 * Insert the given iterator of keys and values into the map.
 *
 * When the underlying map needs to grow, check if the global pool of shuffle memory has
 * enough room for this to happen. If so, allocate the memory required to grow the map;
 * otherwise, spill the in-memory map to disk.
 *
 * The shuffle memory usage of the first trackMemoryThreshold entries is not tracked.
 */
def insertAll(entries: Iterator[Product2[K, V]]): Unit = {
  if (currentMap == null) {
    throw new IllegalStateException(
      "Cannot insert new elements into a map after calling iterator")
  }
  // An update function for the map that we reuse across entries to avoid allocating
  // a new closure each time
  var curEntry: Product2[K, V] = null
  val update: (Boolean, C) => C = (hadVal, oldVal) => {
    if (hadVal) mergeValue(oldVal, curEntry._2) else createCombiner(curEntry._2)
  }

  while (entries.hasNext) {
    curEntry = entries.next()
    val estimatedSize = currentMap.estimateSize()
    if (estimatedSize > _peakMemoryUsedBytes) {
      _peakMemoryUsedBytes = estimatedSize
    }
    if (maybeSpill(currentMap, estimatedSize)) {
      currentMap = new SizeTrackingAppendOnlyMap[K, C]
    }
    currentMap.changeValue(curEntry._1, update)
    addElementsRead()
  }
}

/**
 * Insert the given iterable of keys and values into the map.
 *
 * When the underlying map needs to grow, check if the global pool of shuffle memory has
 * enough room for this to happen. If so, allocate the memory required to grow the map;
 * otherwise, spill the in-memory map to disk.
 *
 * The shuffle memory usage of the first trackMemoryThreshold entries is not tracked.
 */
def insertAll(entries: Iterable[Product2[K, V]]): Unit = {
  insertAll(entries.iterator)
}
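The (hadVal, oldVal) => C closure is the contract of changeValue on the underlying append-only map: it is told whether the key already had a value and, if so, what that value was. A sketch of the same pattern over a plain mutable map (a hypothetical stand-in, not the Spark implementation):

import scala.collection.mutable

// Hypothetical stand-in for AppendOnlyMap.changeValue: the update closure
// receives (hadVal, oldVal) and returns the new value to store for the key.
def changeValue[K, C](m: mutable.Map[K, C], key: K, update: (Boolean, C) => C): C = {
  val newVal = m.get(key) match {
    case Some(old) => update(true, old)
    case None      => update(false, null.asInstanceOf[C])
  }
  m(key) = newVal
  newVal
}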
maybeSpill

maybeSpill is inherited from Spillable. Every 32 elements it compares the estimated collection size against the memory claimed so far, tries to double its claim from the memory pool, and spills when the pool cannot keep up (or when the hard element-count threshold is exceeded):
/**
 * Spills the current in-memory collection to disk if needed. Attempts to acquire more
 * memory before spilling.
 *
 * @param collection collection to spill to disk
 * @param currentMemory estimated size of the collection in bytes
 * @return true if `collection` was spilled to disk; false otherwise
 */
protected def maybeSpill(collection: C, currentMemory: Long): Boolean = {
  var shouldSpill = false
  if (elementsRead % 32 == 0 && currentMemory >= myMemoryThreshold) {
    // Claim up to double our current memory from the shuffle memory pool
    val amountToRequest = 2 * currentMemory - myMemoryThreshold
    val granted = acquireMemory(amountToRequest)
    myMemoryThreshold += granted
    // If we were granted too little memory to grow further (either tryToAcquire returned 0,
    // or we already had more memory than myMemoryThreshold), spill the current collection
    shouldSpill = currentMemory >= myMemoryThreshold
  }
  shouldSpill = shouldSpill || _elementsRead > numElementsForceSpillThreshold
  // Actually spill
  if (shouldSpill) {
    _spillCount += 1
    logSpillage(currentMemory)
    spill(collection)
    _elementsRead = 0
    _memoryBytesSpilled += currentMemory
    releaseMemory()
  }
  shouldSpill
}
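The request-doubling arithmetic is easiest to see with concrete numbers; a worked sketch (values are illustrative):

// Worked example of the doubling policy in maybeSpill:
val currentMemory: Long     = 40L * 1024 * 1024 // estimated map size: 40 MB
var myMemoryThreshold: Long = 32L * 1024 * 1024 // memory claimed so far: 32 MB

// Ask for enough to hold twice the current size:
val amountToRequest = 2 * currentMemory - myMemoryThreshold // 48 MB

// If the pool grants, say, only 4 MB, the new threshold is 36 MB, which is
// still <= currentMemory (40 MB), so shouldSpill becomes true and the
// in-memory map is written to disk.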