How the partition count is determined in parallelize


When an RDD is created with val rdd = sc.parallelize(List(1,2,3,4,5,6,7), 3), how is the number of partitions in the RDD decided? The signature of parallelize is:

def parallelize[T: ClassTag](
    seq: Seq[T],
    numSlices: Int = defaultParallelism): RDD[T]

If numSlices is specified, exactly that many partitions are created; if it is omitted, the default value defaultParallelism is used.
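As a quick sanity check, here is a minimal sketch of both cases (the local[4] master and the names ParallelizeDemo, explicit, byDefault are illustrative assumptions, not from the original post):

import org.apache.spark.{SparkConf, SparkContext}

object ParallelizeDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ParallelizeDemo").setMaster("local[4]")
    val sc = new SparkContext(conf)

    // numSlices given: the RDD gets exactly 3 partitions.
    val explicit = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7), 3)
    println(explicit.getNumPartitions)   // 3

    // numSlices omitted: falls back to sc.defaultParallelism.
    val byDefault = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7))
    println(byDefault.getNumPartitions)  // 4 here, implied by local[4]

    sc.stop()
  }
}

So where does defaultParallelism come from? On SparkContext: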

def defaultParallelism: Int = {
  assertNotStopped()
  taskScheduler.defaultParallelism
}

Which concrete TaskScheduler (and SchedulerBackend) you get depends on the cluster run mode matched inside the createTaskScheduler method:

private def createTaskScheduler(
    sc: SparkContext,
    master: String,
    deployMode: String): (SchedulerBackend, TaskScheduler) = {
  import SparkMasterRegex._

  // When running locally, don't try to re-execute tasks on failure.
  val MAX_LOCAL_TASK_FAILURES = 1

  master match {
    case "local" =>
      val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
      val backend = new LocalSchedulerBackend(sc.getConf, scheduler, 1)
      scheduler.initialize(backend)
      (backend, scheduler)

    case LOCAL_N_REGEX(threads) =>
      def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
      // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads.
      val threadCount = if (threads == "*") localCpuCount else threads.toInt
      if (threadCount <= 0) {
        throw new SparkException(s"Asked to run locally with $threadCount threads")
      }
      val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
      val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount)
      scheduler.initialize(backend)
      (backend, scheduler)

    case LOCAL_N_FAILURES_REGEX(threads, maxFailures) =>
      def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
      // local[*, M] means the number of cores on the computer with M failures
      // local[N, M] means exactly N threads with M failures
      val threadCount = if (threads == "*") localCpuCount else threads.toInt
      val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true)
      val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount)
      scheduler.initialize(backend)
      (backend, scheduler)

    case SPARK_REGEX(sparkUrl) =>
      val scheduler = new TaskSchedulerImpl(sc)
      val masterUrls = sparkUrl.split(",").map("spark://" + _)
      val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls)
      scheduler.initialize(backend)
      (backend, scheduler)

    case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
      // Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang.
      val memoryPerSlaveInt = memoryPerSlave.toInt
      if (sc.executorMemory > memoryPerSlaveInt) {
        throw new SparkException(
          "Asked to launch cluster with %d MB RAM / worker but requested %d MB/worker".format(
            memoryPerSlaveInt, sc.executorMemory))
      }

      val scheduler = new TaskSchedulerImpl(sc)
      val localCluster = new LocalSparkCluster(
        numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt, sc.conf)
      val masterUrls = localCluster.start()
      val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls)
      scheduler.initialize(backend)
      backend.shutdownCallback = (backend: StandaloneSchedulerBackend) => {
        localCluster.stop()
      }
      (backend, scheduler)

    case MESOS_REGEX(mesosUrl) =>
      MesosNativeLibrary.load()
      val scheduler = new TaskSchedulerImpl(sc)
      val coarseGrained = sc.conf.getBoolean("spark.mesos.coarse", defaultValue = true)
      val backend = if (coarseGrained) {
        new MesosCoarseGrainedSchedulerBackend(scheduler, sc, mesosUrl, sc.env.securityManager)
      } else {
        new MesosFineGrainedSchedulerBackend(scheduler, sc, mesosUrl)
      }
      scheduler.initialize(backend)
      (backend, scheduler)

    case masterUrl =>
      val cm = getClusterManager(masterUrl) match {
        case Some(clusterMgr) => clusterMgr
        case None => throw new SparkException("Could not parse Master URL: '" + master + "'")
      }
      try {
        val scheduler = cm.createTaskScheduler(sc, masterUrl)
        val backend = cm.createSchedulerBackend(sc, masterUrl, scheduler)
        cm.initialize(scheduler, backend)
        (backend, scheduler)
      } catch {
        case se: SparkException => throw se
        case NonFatal(e) =>
          throw new SparkException("External scheduler cannot be instantiated", e)
      }
  }
}
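The master-string patterns in the match come from SparkMasterRegex. Below is a small standalone sketch of that dispatch, with the regexes re-declared locally (approximations of the ones in Spark's source, so the snippet runs on its own) and only a few of the cases covered:

object MasterDispatchSketch {
  // Approximations of the patterns in org.apache.spark.SparkMasterRegex.
  val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r
  val LOCAL_N_FAILURES_REGEX = """local\[([0-9]+|\*)\s*,\s*([0-9]+)\]""".r
  val SPARK_REGEX = """spark://(.*)""".r

  def describe(master: String): String = master match {
    case "local"                              => "LocalSchedulerBackend with 1 thread"
    case LOCAL_N_REGEX(threads)               => s"LocalSchedulerBackend with $threads thread(s)"
    case LOCAL_N_FAILURES_REGEX(threads, max) => s"$threads thread(s), up to $max task failures"
    case SPARK_REGEX(url)                     => s"StandaloneSchedulerBackend against spark://$url"
    case other                                => s"external cluster manager for '$other'"
  }

  def main(args: Array[String]): Unit = {
    Seq("local", "local[4]", "local[*]", "local[4, 2]", "spark://host:7077")
      .foreach(m => println(s"$m -> ${describe(m)}"))
  }
}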

Taking TaskSchedulerImpl as an example, defaultParallelism simply delegates to the backend:

override def defaultParallelism(): Int = backend.defaultParallelism()

And taking LocalSchedulerBackend as the backend:

override def defaultParallelism(): Int =
  scheduler.conf.getInt("spark.default.parallelism", totalCores)

As the source shows, it first looks up the spark.default.parallelism setting in the conf; if that setting is absent, it falls back to totalCores, which is passed in when the LocalSchedulerBackend is constructed:

private[spark] class LocalSchedulerBackend(
    conf: SparkConf,
    scheduler: TaskSchedulerImpl,
    val totalCores: Int)
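The fallback behavior here is just that of SparkConf.getInt: the configured value if the key is set, otherwise the supplied default. A minimal sketch (the value 8 stands in for totalCores and is an arbitrary choice):

import org.apache.spark.SparkConf

object ConfFallbackSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    // Key absent: getInt returns the supplied default (totalCores in the backend).
    println(conf.getInt("spark.default.parallelism", 8)) // 8

    // Key set: the configured value wins over the default.
    conf.set("spark.default.parallelism", "2")
    println(conf.getInt("spark.default.parallelism", 8)) // 2
  }
}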

Going back to createTaskScheduler, totalCores is 1 in plain local mode and threadCount for local[N] / local[*]:

val backend = new LocalSchedulerBackend(sc.getConf, scheduler, 1)           // master = "local"
val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount) // master = "local[N]" / "local[*]"

So when parallelize is called without an explicit partition count in local mode, the default number of partitions is determined by the master string: plain local gives 1, local[N] gives N, and local[*] resolves to the number of available cores. For example:

conf.setMaster("local[2]")

Alternatively, the default parallelism can be set directly in the conf:

conf.set("spark.default.parallelism","2")

Default partition count: the priority inside parallelize is

sc.parallelize(List(1,2,3,4,5,6,7), 3) > conf.set("spark.default.parallelism", "2") > conf.setMaster("local")

With all three present, the final partition count is 3: the explicit numSlices argument beats spark.default.parallelism, which in turn beats the core count implied by the master string.
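A minimal end-to-end sketch verifying this precedence, with all three settings applied at once (the class and app names are illustrative):

import org.apache.spark.{SparkConf, SparkContext}

object PartitionPriorityDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("PartitionPriorityDemo")
      .setMaster("local")                    // implies totalCores = 1
      .set("spark.default.parallelism", "2") // overrides totalCores
    val sc = new SparkContext(conf)

    // Explicit numSlices wins over everything else.
    println(sc.parallelize(List(1, 2, 3, 4, 5, 6, 7), 3).getNumPartitions) // 3

    // Without numSlices, spark.default.parallelism wins over the master string.
    println(sc.parallelize(List(1, 2, 3, 4, 5, 6, 7)).getNumPartitions)    // 2

    sc.stop()
  }
}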


