
来源:互联网 发布:淘宝拍图技巧 编辑:程序博客网 时间:2024/05/29 03:22

最近从源码角度温习之前学的Spark基础,在RDD的Dependency这一节中,想弄清楚一些Transformation操作究竟是Narrow Dependency还是Shuffle Dependency。下面以groupByKey为例来看源码。



/**
 * Zero-argument overload of `groupByKey`.
 *
 * Obtains a partitioner from `defaultPartitioner(self)` and forwards to the
 * overload that takes an explicit `Partitioner`.
 *
 * @return the RDD produced by `groupByKey(partitioner)` for that partitioner
 */
def groupByKey(): RDD[(K, Iterable[V])] = self.withScope {
  val chosenPartitioner = defaultPartitioner(self)
  groupByKey(chosenPartitioner)
}


/**
 * Groups the values of each key by delegating to `combineByKeyWithClassTag`
 * with a `CompactBuffer[V]` as the per-key combiner and map-side combine
 * switched off.
 *
 * @param partitioner partitioner handed through to `combineByKeyWithClassTag`
 * @return the combined RDD, with its value type exposed as `Iterable[V]`
 */
def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])] = self.withScope {
  // groupByKey shouldn't use map side combine because map side combine does not
  // reduce the amount of data shuffled and requires all map side data be inserted
  // into a hash table, leading to more objects in the old gen.
  val createCombiner = (v: V) => CompactBuffer(v)
  val mergeValue = (buf: CompactBuffer[V], v: V) => buf += v
  val mergeCombiners = (c1: CompactBuffer[V], c2: CompactBuffer[V]) => c1 ++= c2
  val bufs = combineByKeyWithClassTag[CompactBuffer[V]](
    createCombiner, mergeValue, mergeCombiners, partitioner, mapSideCombine = false)
  // Only the static value type is re-labelled here; no data is copied.
  // NOTE(review): presumably CompactBuffer[V] is an Iterable[V] -- confirm.
  bufs.asInstanceOf[RDD[(K, Iterable[V])]]
}


/**
 * Combines the values of each key into a combined value of type `C` using the
 * three supplied functions.
 *
 * If this RDD's partitioner already equals `partitioner`, the combination is
 * done in place with `mapPartitions` (partitioning preserved) and no
 * `ShuffledRDD` is created. Otherwise a `ShuffledRDD` is built and configured
 * with the serializer, the aggregator and the `mapSideCombine` flag.
 *
 * @param createCombiner  turns a `V` into an initial `C`
 * @param mergeValue      merges a `V` into an existing `C`
 * @param mergeCombiners  merges two `C` values; must not be null
 * @param partitioner     partitioner to compare against `self.partitioner` and,
 *                        when they differ, to give to the `ShuffledRDD`
 * @param mapSideCombine  forwarded to `ShuffledRDD.setMapSideCombine`; must be
 *                        false when the key class is an array
 * @param serializer      forwarded to `ShuffledRDD.setSerializer`; defaults to null
 * @throws IllegalArgumentException if `mergeCombiners` is null
 * @throws SparkException if the key class is an array and either
 *                        `mapSideCombine` is true or `partitioner` is a
 *                        `HashPartitioner`
 */
def combineByKeyWithClassTag[C](
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
  require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
  if (keyClass.isArray) {
    if (mapSideCombine) {
      throw new SparkException("Cannot use map-side combining with array keys.")
    }
    if (partitioner.isInstanceOf[HashPartitioner]) {
      throw new SparkException("HashPartitioner cannot partition array keys.")
    }
  }
  val aggregator = new Aggregator[K, V, C](
    self.context.clean(createCombiner),
    self.context.clean(mergeValue),
    self.context.clean(mergeCombiners))
  if (self.partitioner == Some(partitioner)) {
    // Already partitioned by the requested partitioner: combine the values
    // inside each existing partition instead of creating a ShuffledRDD.
    self.mapPartitions(iter => {
      val context = TaskContext.get()
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  } else {
    // Partitioners differ (or none is set): build a ShuffledRDD.
    new ShuffledRDD[K, V, C](self, partitioner)
      .setSerializer(serializer)
      .setAggregator(aggregator)
      .setMapSideCombine(mapSideCombine)
  }
}


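除了无参的groupByKey()之外,groupByKey还有另外两个重载:groupByKey(numPartitions: Int)和groupByKey(partitioner: Partitioner),这两个函数可以指定新的分区方式。从上面combineByKeyWithClassTag的源码可以看出:如果传入的partitioner与当前RDD的partitioner相同,就直接调用mapPartitions(preservesPartitioning = true),不会产生Shuffle,属于Narrow Dependency;否则会创建ShuffledRDD,属于Shuffle Dependency。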


0 0