MapPartitionsWithIndexOperator


Output:

17/08/17 14:56:36 INFO TaskSchedulerImpl: Adding task set 0.0 with 3 tasks
17/08/17 14:56:36 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, executor driver, partition 0, PROCESS_LOCAL, 4825 bytes)
17/08/17 14:56:36 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
partitionId:0value:1
partitionId:0value:2
partitionId:0value:3
17/08/17 14:56:37 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 703 bytes result sent to driver
17/08/17 14:56:37 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, localhost, executor driver, partition 1, PROCESS_LOCAL, 4825 bytes)
17/08/17 14:56:37 INFO Executor: Running task 1.0 in stage 0.0 (TID 1)
partitionId:1value:4
partitionId:1value:5
partitionId:1value:6
17/08/17 14:56:37 INFO Executor: Finished task 1.0 in stage 0.0 (TID 1). 660 bytes result sent to driver
17/08/17 14:56:37 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 183 ms on localhost (executor driver) (1/3)
17/08/17 14:56:37 INFO TaskSetManager: Starting task 2.0 in stage 0.0 (TID 2, localhost, executor driver, partition 2, PROCESS_LOCAL, 4882 bytes)
17/08/17 14:56:37 INFO Executor: Running task 2.0 in stage 0.0 (TID 2)
partitionId:2value:7
partitionId:2value:8
partitionId:2value:9
partitionId:2value:10
17/08/17 14:56:37 INFO Executor: Finished task 2.0 in stage 0.0 (TID 2). 621 bytes result sent to driver
17/08/17 14:56:37 INFO TaskSetManager: Finished task 1.0 in stage 0.0 (TID 1) in 59 ms on localhost (executor driver) (2/3)
17/08/17 14:56:37 INFO TaskSetManager: Finished task 2.0 in stage 0.0 (TID 2) in 39 ms on localhost (executor driver) (3/3)
17/08/17 14:56:37 INFO DAGScheduler: ResultStage 0 (collect at MapPartitionsWithIndexOperator.scala:44) finished in 0.256 s


The complete program that produced the output above:

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import scala.collection.mutable.ListBuffer

object MapPartitionsWithIndexOperator {
  def main(args: Array[String]): Unit = {
    /**
     * Create a SparkConf object that configures the run.
     * SparkConf sets the run mode (master), the application name,
     * and the resources the application needs.
     */
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("Map_Operator")

    /**
     * Create the SparkContext.
     * SparkContext is the sole gateway to the cluster: it dispatches
     * tasks and retries them when they fail.
     */
    val sc = new SparkContext(conf)

    /**
     * makeRDD's first argument is the collection of elements for the RDD;
     * its second argument is the number of partitions.
     * Result: an RDD[Int] holding 1 to 10 across 3 partitions.
     */
    val rdd = sc.makeRDD(1 to 10, 3)

    /**
     * mapPartitionsWithIndex iterates one partition at a time and also
     * hands the function the partition's index. This implementation
     * buffers a whole partition's elements in a ListBuffer before
     * returning them, so an entire partition is loaded into memory at once.
     * The second argument (false) is preservesPartitioning.
     */
    val result = rdd.mapPartitionsWithIndex((index, iterator) => {
      val list = new ListBuffer[Int]()
      while (iterator.hasNext) {
        val num = iterator.next()
        println("partitionId:" + index + "value:" + num)
        list += num
      }
      list.iterator
    }, false).collect()

    // Release resources.
    sc.stop()
  }
}
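
Buffering each partition into a ListBuffer works, but it pulls an entire partition into executor memory at once. A minimal alternative sketch (not from the original post; it reuses the rdd defined above, and lazyResult is a hypothetical name) keeps the traversal lazy by mapping over the iterator directly, so each element is printed and forwarded as collect() pulls it:

// Hypothetical lazy variant: transforms the iterator without building
// a partition-sized buffer. preservesPartitioning defaults to false,
// so the second argument can be omitted.
val lazyResult = rdd.mapPartitionsWithIndex((index, iterator) =>
  iterator.map { num =>
    println("partitionId:" + index + "value:" + num)
    num
  }
).collect()

To confirm how makeRDD split 1 to 10 across the 3 partitions (the 3/3/4 split visible in the output above), rdd.glom().collect() gathers each partition into one Array: Array(1, 2, 3), Array(4, 5, 6), Array(7, 8, 9, 10).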