之前一直疑惑Shuffle过程中的读和写究竟是在哪里实现的,一直误以为读和写都是在RDD的转换过程中实现的。但是从reduceByKey一路追踪到combineByKey,却只找到了生成ShuffledRDD的过程,而ShuffledRDD的compute函数中只有读取过程,那么写入过程究竟在哪里呢?
- PairRDDFunctions
-
/**
 * Combines the values of each key into a combiner of type `C`.
 *
 * An `Aggregator` is built from the three user functions, then one of three paths is taken:
 *  - if this RDD's partitioner already equals `partitioner`, values are combined inside each
 *    partition and no `ShuffledRDD` is created;
 *  - else if `mapSideCombine` is true, values are first combined per partition, the resulting
 *    combiners go through a `ShuffledRDD`, and the combiners are then merged per key;
 *  - otherwise the raw `(K, V)` records go through a `ShuffledRDD` and are combined afterwards.
 *
 * @param createCombiner function of type `V => C`, passed to the `Aggregator`
 * @param mergeValue     function of type `(C, V) => C`, passed to the `Aggregator`
 * @param mergeCombiners function of type `(C, C) => C`, passed to the `Aggregator`
 * @param partitioner    partitioner compared with this RDD's own and handed to `ShuffledRDD`
 * @param mapSideCombine whether to combine values per partition before the `ShuffledRDD`
 * @param serializer     forwarded to `ShuffledRDD.setSerializer`; defaults to `null`
 * @return an RDD of `(K, C)` pairs
 */
def combineByKey[C](createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null): RDD[(K, C)] = {

  val aggregator = new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners)
  if (self.partitioner == Some(partitioner)) {
    // Partitioner already matches: combine the values in place, partition by partition.
    self.mapPartitionsWithContext((context, iter) => {
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  } else if (mapSideCombine) {
    // Combine values within each partition first, so the ShuffledRDD carries (K, C) records.
    val combined = self.mapPartitionsWithContext((context, iter) => {
      aggregator.combineValuesByKey(iter, context)
    }, preservesPartitioning = true)
    // ShuffledRDD[K, V, C] is an RDD[(K, C)]: its 2nd type argument is the parent's value type
    // and its 3rd is the output value type. The parent here already holds combiners, so both
    // are C (a pair type in the 3rd slot would yield RDD[(K, (K, C))]).
    val partitioned = new ShuffledRDD[K, C, C](combined, partitioner)
      .setSerializer(serializer)
    partitioned.mapPartitionsWithContext((context, iter) => {
      new InterruptibleIterator(context, aggregator.combineCombinersByKey(iter, context))
    }, preservesPartitioning = true)
  } else {
    // No per-partition pre-combine: raw values pass through unchanged, so both value types are V.
    val values = new ShuffledRDD[K, V, V](self, partitioner).setSerializer(serializer)
    values.mapPartitionsWithContext((context, iter) => {
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  }
}
观察ShuffledRDD的compute方法,可以看到它是如何读取上一个stage生成的数据的。
-
-
- package org.apache.spark.rdd
-
- import org.apache.spark._
- import org.apache.spark.annotation.DeveloperApi
- import org.apache.spark.serializer.Serializer
-
/**
 * Partition of a shuffled RDD. Both its `index` and its hash code are the `idx` it was
 * constructed with.
 *
 * @param idx position of this partition
 */
private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  // Hash on the position alone.
  override def hashCode(): Int = idx

  override val index: Int = idx
}
-
-
-
-
-
-
-
-
-
-
-
/**
 * An RDD of `(K, C)` pairs whose single dependency is a `ShuffleDependency` on `prev`.
 *
 * The serializer, key ordering, aggregator and map-side-combine flag are optional settings.
 * Each has a chainable setter and all of them are forwarded to the `ShuffleDependency`.
 *
 * @param prev the parent RDD; set to `null` by `clearDependencies`
 * @param part the partitioner; it determines the number of partitions of this RDD
 * @tparam K key type
 * @tparam V value type of the parent RDD's records
 * @tparam C value type of this RDD's records
 */
@DeveloperApi
class ShuffledRDD[K, V, C](
    @transient var prev: RDD[_ <: Product2[K, V]],
    part: Partitioner)
  extends RDD[(K, C)](prev.context, Nil) {

  // Optional shuffle settings; each stays empty/false until its setter is called.
  private var serializer: Option[Serializer] = None
  private var keyOrdering: Option[Ordering[K]] = None
  private var aggregator: Option[Aggregator[K, V, C]] = None
  private var mapSideCombine: Boolean = false

  /** Sets the serializer (a `null` argument is stored as `None`) and returns this RDD. */
  def setSerializer(serializer: Serializer): ShuffledRDD[K, V, C] = {
    this.serializer = Option(serializer)
    this
  }

  /** Sets the key ordering (a `null` argument is stored as `None`) and returns this RDD. */
  def setKeyOrdering(keyOrdering: Ordering[K]): ShuffledRDD[K, V, C] = {
    this.keyOrdering = Option(keyOrdering)
    this
  }

  /** Sets the aggregator (a `null` argument is stored as `None`) and returns this RDD. */
  def setAggregator(aggregator: Aggregator[K, V, C]): ShuffledRDD[K, V, C] = {
    this.aggregator = Option(aggregator)
    this
  }

  /** Sets the map-side-combine flag and returns this RDD. */
  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  /** A single `ShuffleDependency` built from `prev`, `part` and the current settings. */
  override def getDependencies: Seq[Dependency[_]] = {
    val shuffleDep =
      new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine)
    List(shuffleDep)
  }

  override val partitioner = Some(part)

  /** One `ShuffledRDDPartition` per partition of `part`, indexed from 0. */
  override def getPartitions: Array[Partition] = {
    val count = part.numPartitions
    Array.tabulate[Partition](count)(new ShuffledRDDPartition(_))
  }

  /**
   * Asks the shuffle manager for a reader over the partition range
   * `[split.index, split.index + 1)` of this RDD's shuffle dependency and returns what it reads.
   */
  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val shuffleDep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    val reader = SparkEnv.get.shuffleManager
      .getReader(shuffleDep.shuffleHandle, split.index, split.index + 1, context)
    reader.read().asInstanceOf[Iterator[(K, C)]]
  }

  /** Clears the dependencies via the superclass, then drops the reference to `prev`. */
  override def clearDependencies(): Unit = {
    super.clearDependencies()
    prev = null
  }
}
后来想到ShuffleMapTask,这个名字就很可疑,打开代码看看。发现代码很简单,直接粗暴地通过ShuffleManager获取ShuffleWriter,把结果写入到磁盘。
-
-
- package org.apache.spark.scheduler
-
- import java.nio.ByteBuffer
-
- import scala.language.existentials
-
- import org.apache.spark._
- import org.apache.spark.broadcast.Broadcast
- import org.apache.spark.rdd.RDD
- import org.apache.spark.shuffle.ShuffleWriter
-
-
-
-
-
-
-
-
-
-
-
-
-
/**
 * A task that computes one partition of an RDD and passes its records to a shuffle writer.
 *
 * @param stageId    id of the stage, forwarded to `Task`
 * @param taskBinary broadcast bytes from which `runTask` deserializes an
 *                   `(RDD, ShuffleDependency)` pair
 * @param partition  the partition to compute; its `index` is forwarded to `Task`
 * @param locs       candidate locations for this task; may be `null`
 */
private[spark] class ShuffleMapTask(
    stageId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient private var locs: Seq[TaskLocation])
  extends Task[MapStatus](stageId, partition.index) with Logging {

  /**
   * Builds a task with stage id 0, no task binary, no locations, and a partition whose index
   * is 0.
   * NOTE(review): `partitionId` is not used; the partition index is always 0. It looks like a
   * stub constructor (presumably for tests); confirm before relying on the argument.
   */
  def this(partitionId: Int) =
    this(0, null, new Partition { override def index: Int = 0 }, null)

  // De-duplicated view of `locs` (via a Set), or Nil when `locs` is null.
  @transient private val preferredLocs: Seq[TaskLocation] = locs match {
    case null => Nil
    case known => known.toSet.toSeq
  }

  /**
   * Deserializes the `(RDD, ShuffleDependency)` pair from `taskBinary`, obtains a writer from
   * the shuffle manager, writes the partition's records, and returns the value inside the
   * result of `writer.stop(success = true)`.
   *
   * If an `Exception` is thrown, the writer (if one was obtained) is stopped with
   * `success = false` and the original exception is rethrown. A failure of that stop is only
   * logged at debug level.
   */
  override def runTask(context: TaskContext): MapStatus = {
    // Time the deserialization of the task binary, in milliseconds.
    val startedAt = System.currentTimeMillis()
    val closureSer = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, dep) = closureSer.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - startedAt

    metrics = Some(context.taskMetrics)
    // Stays null until getWriter succeeds, so the failure path knows whether to stop it.
    var writer: ShuffleWriter[Any, Any] = null
    try {
      writer = SparkEnv.get.shuffleManager
        .getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
      val records =
        rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]]
      writer.write(records)
      writer.stop(success = true).get
    } catch {
      case failure: Exception =>
        try {
          if (writer != null) {
            writer.stop(success = false)
          }
        } catch {
          case stopFailure: Exception =>
            log.debug("Could not stop writer", stopFailure)
        }
        throw failure
    }
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId)
}
根据Stage的划分机制,只要出现ShuffleDependency,那么前面的任务就会被包装成为ShuffleMapTask,然后在ShuffleMapTask中把前面的Stage的output进行分区然后输出到硬盘,这样就可以不用考虑这次stage的final RDD的类型了,做到了shuffle write和RDD逻辑的解耦。
ShuffleManager后续发生的事情,参考https://github.com/JerryLead/SparkInternals/blob/master/markdown/4-shuffleDetails.md