spark 2.1 reduceByKey

  /**
   * Merge the values for each key using an associative and commutative reduce function. This will
   * also perform the merging locally on each mapper before sending results to a reducer, similarly
   * to a "combiner" in MapReduce.
   */
  def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = self.withScope {
    combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner)
  }
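A minimal usage sketch of this overload (the SparkContext setup, the sample data, and the partition count of 2 are made up for illustration):

    import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

    val conf = new SparkConf().setAppName("reduceByKeyExample").setMaster("local[*]")
    val sc = new SparkContext(conf)

    val pairs = sc.parallelize(Seq(("a", 1), ("b", 1), ("a", 1)))

    // The reduce function must be associative and commutative, because it is
    // also applied map-side before the shuffle, like a MapReduce combiner.
    val counts = pairs.reduceByKey(new HashPartitioner(2), _ + _)
    counts.collect()  // Array(("a", 2), ("b", 1)) -- ordering may vary

As the body above shows, reduceByKey is a thin wrapper: it passes the identity function as createCombiner and the user's reduce function as both mergeValue and mergeCombiners to combineByKeyWithClassTag, whose Spark 2.1 source follows.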
  /**
   * :: Experimental ::
   * Generic function to combine the elements for each key using a custom set of aggregation
   * functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined type" C
   *
   * Users provide three functions:
   *
   *  - `createCombiner`, which turns a V into a C (e.g., creates a one-element list)
   *  - `mergeValue`, to merge a V into a C (e.g., adds it to the end of a list)
   *  - `mergeCombiners`, to combine two C's into a single one.
   *
   * In addition, users can control the partitioning of the output RDD, and whether to perform
   * map-side aggregation (if a mapper can produce multiple items with the same key).
   *
   * @note V and C can be different -- for example, one might group an RDD of type
   * (Int, Int) into an RDD of type (Int, Seq[Int]).
   */
  @Experimental
  def combineByKeyWithClassTag[C](
      createCombiner: V => C,
      mergeValue: (C, V) => C,
      mergeCombiners: (C, C) => C,
      partitioner: Partitioner,
      mapSideCombine: Boolean = true,
      serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
    require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
    if (keyClass.isArray) {
      if (mapSideCombine) {
        throw new SparkException("Cannot use map-side combining with array keys.")
      }
      if (partitioner.isInstanceOf[HashPartitioner]) {
        throw new SparkException("HashPartitioner cannot partition array keys.")
      }
    }
    val aggregator = new Aggregator[K, V, C](
      self.context.clean(createCombiner),
      self.context.clean(mergeValue),
      self.context.clean(mergeCombiners))
    if (self.partitioner == Some(partitioner)) {
      self.mapPartitions(iter => {
        val context = TaskContext.get()
        new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
      }, preservesPartitioning = true)
    } else {
      new ShuffledRDD[K, V, C](self, partitioner)
        .setSerializer(serializer)
        .setAggregator(aggregator)
        .setMapSideCombine(mapSideCombine)
    }
  }
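Note the fast path: if the RDD is already partitioned by the requested partitioner, no shuffle is needed and the aggregation runs within each existing partition; otherwise a ShuffledRDD is built with the aggregator attached.

combineByKeyWithClassTag is marked @Experimental, so application code normally goes through the public combineByKey overloads, which delegate to it. A sketch computing a per-key average, where the combined type C is a (sum, count) pair (again, sc and the sample data are invented for the example):

    import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

    val conf = new SparkConf().setAppName("combineByKeyExample").setMaster("local[*]")
    val sc = new SparkContext(conf)

    val scores = sc.parallelize(Seq(("a", 10.0), ("a", 20.0), ("b", 5.0)))

    val sumCounts = scores.combineByKey(
      (v: Double) => (v, 1L),                                  // createCombiner: V => C
      (c: (Double, Long), v: Double) => (c._1 + v, c._2 + 1L), // mergeValue: (C, V) => C
      (c1: (Double, Long), c2: (Double, Long)) =>              // mergeCombiners: (C, C) => C
        (c1._1 + c2._1, c1._2 + c2._2),
      new HashPartitioner(2))

    val averages = sumCounts.mapValues { case (sum, count) => sum / count }
    averages.collect()  // Array(("a", 15.0), ("b", 5.0)) -- ordering may vary

This is exactly the V-versus-C distinction called out in the doc comment: the input values are Double, but the combined type is (Double, Long), something a plain reduceByKey (where V and C must coincide) cannot express.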