parallelize中指定partition个数的详解
来源:互联网 发布:网络直播什么时候火的 编辑:程序博客网 时间:2024/06/05 00:58
val rdd = sc.parallelize(List(1,2,3,4,5,6,7),3) 生成 RDD 的时候,RDD 中的 partition 个数是如何决定的?
// Signature of SparkContext.parallelize (quoted from the Spark source).
// Distributes a local collection `seq` into an RDD. `numSlices` is the
// number of partitions; when the caller omits it, the default value
// `defaultParallelism` is used (analyzed below).
def parallelize[T: ClassTag](
seq: Seq[T],
numSlices: Int = defaultParallelism): RDD[T]
如果显式传入 numSlices,则按照指定值作为 partition 数量;如果不传,则使用默认值 defaultParallelism:
// SparkContext.defaultParallelism (quoted from the Spark source):
// first guards against a stopped SparkContext, then delegates the
// decision to the active TaskScheduler.
def defaultParallelism: Int = {
// Throws if the SparkContext has already been stopped.
assertNotStopped()
// The actual value depends on which TaskScheduler implementation
// was chosen by createTaskScheduler (see below).
taskScheduler.defaultParallelism
}
要根据createTaskScheduler方法中指定的集群运行模式来确定taskScheduler类型
// SparkContext.createTaskScheduler (quoted from the Spark source):
// pattern-matches the master URL to construct the matching
// (SchedulerBackend, TaskScheduler) pair for the deploy mode.
// The backend chosen here is what ultimately answers defaultParallelism.
private def createTaskScheduler(
sc: SparkContext,
master: String,
deployMode: String): (SchedulerBackend, TaskScheduler) = {
import SparkMasterRegex._
// When running locally, don't try to re-execute tasks on failure.
val MAX_LOCAL_TASK_FAILURES = 1
master match {
// "local": single-threaded local mode — note totalCores is hard-coded to 1.
case "local" =>
val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
val backend = new LocalSchedulerBackend(sc.getConf, scheduler, 1)
scheduler.initialize(backend)
(backend, scheduler)
// "local[N]" / "local[*]": N worker threads (or one per CPU core for "*").
case LOCAL_N_REGEX(threads) =>
def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
// local[*] estimates the number of cores on the machine; local[N] uses exactly N threads.
val threadCount = if (threads == "*") localCpuCount else threads.toInt
if (threadCount <= 0) {
throw new SparkException(s"Asked to run locally with $threadCount threads")
}
val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
// threadCount becomes the backend's totalCores — the fallback for defaultParallelism.
val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount)
scheduler.initialize(backend)
(backend, scheduler)
// "local[N, M]" / "local[*, M]": like above, but with M allowed task failures.
case LOCAL_N_FAILURES_REGEX(threads, maxFailures) =>
def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
// local[*, M] means the number of cores on the computer with M failures
// local[N, M] means exactly N threads with M failures
val threadCount = if (threads == "*") localCpuCount else threads.toInt
val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true)
val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount)
scheduler.initialize(backend)
(backend, scheduler)
// "spark://host:port[,host:port...]": standalone cluster mode.
case SPARK_REGEX(sparkUrl) =>
val scheduler = new TaskSchedulerImpl(sc)
val masterUrls = sparkUrl.split(",").map("spark://" + _)
val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls)
scheduler.initialize(backend)
(backend, scheduler)
// "local-cluster[numSlaves, coresPerSlave, memoryPerSlave]":
// an in-process standalone cluster, used mostly for testing.
case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
// Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang.
val memoryPerSlaveInt = memoryPerSlave.toInt
if (sc.executorMemory > memoryPerSlaveInt) {
throw new SparkException(
"Asked to launch cluster with %d MB RAM / worker but requested %d MB/worker".format(
memoryPerSlaveInt, sc.executorMemory))
}
val scheduler = new TaskSchedulerImpl(sc)
val localCluster = new LocalSparkCluster(
numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt, sc.conf)
val masterUrls = localCluster.start()
val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls)
scheduler.initialize(backend)
// Tear down the embedded cluster when the backend shuts down.
backend.shutdownCallback = (backend: StandaloneSchedulerBackend) => {
localCluster.stop()
}
(backend, scheduler)
// "mesos://...": Mesos cluster; coarse- or fine-grained backend
// selected by the spark.mesos.coarse flag (defaults to coarse).
case MESOS_REGEX(mesosUrl) =>
MesosNativeLibrary.load()
val scheduler = new TaskSchedulerImpl(sc)
val coarseGrained = sc.conf.getBoolean("spark.mesos.coarse", defaultValue = true)
val backend = if (coarseGrained) {
new MesosCoarseGrainedSchedulerBackend(scheduler, sc, mesosUrl, sc.env.securityManager)
} else {
new MesosFineGrainedSchedulerBackend(scheduler, sc, mesosUrl)
}
scheduler.initialize(backend)
(backend, scheduler)
// Anything else (e.g. "yarn"): look up a pluggable ExternalClusterManager.
case masterUrl =>
val cm = getClusterManager(masterUrl) match {
case Some(clusterMgr) => clusterMgr
case None => throw new SparkException("Could not parse Master URL: '" + master + "'")
}
try {
val scheduler = cm.createTaskScheduler(sc, masterUrl)
val backend = cm.createSchedulerBackend(sc, masterUrl, scheduler)
cm.initialize(scheduler, backend)
(backend, scheduler)
} catch {
// Re-throw SparkExceptions unchanged; wrap only non-fatal errors.
case se: SparkException => throw se
case NonFatal(e) =>
throw new SparkException("External scheduler cannot be instantiated", e)
}
}
}
以TaskSchedulerImpl为例
// TaskSchedulerImpl.defaultParallelism (quoted): simply delegates to the backend.
override def defaultParallelism(): Int = backend.defaultParallelism()
以LocalSchedulerBackend为例
// LocalSchedulerBackend.defaultParallelism (quoted): reads the
// spark.default.parallelism config entry, falling back to the
// totalCores value passed to this backend's constructor.
override def defaultParallelism(): Int =
scheduler.conf.getInt("spark.default.parallelism", totalCores)
从源码可以看到,首先获取 conf 中的 spark.default.parallelism 配置参数;如果没有配置,则默认使用 totalCores。而 totalCores 正是创建 LocalSchedulerBackend 时传入的构造参数:
// LocalSchedulerBackend constructor parameters (quoted; class body elided).
// totalCores is supplied by createTaskScheduler: 1 for "local",
// the thread count for "local[N]" / "local[*]".
private[spark] class LocalSchedulerBackend(
conf: SparkConf,
scheduler: TaskSchedulerImpl,
val totalCores: Int)
那么回到createTaskScheduler方法
// The two local-mode call sites inside createTaskScheduler that fix totalCores:
// "local" passes a hard-coded 1:
val backend = new LocalSchedulerBackend(sc.getConf, scheduler, 1)
// "local[N]" / "local[*]" passes the resolved thread count:
val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount)
自此可知:parallelize 方法如果没有指定 partition 数量,默认的 partition 个数取决于 master 设置——如果是 local 则默认为 1;如果 local[N] 指定了线程数 N,则 partition 数量为 N(local[*] 为 CPU 核数)。例如:
conf.setMaster("local[2]")
也可以在代码中设置
conf.set("spark.default.parallelism","2")
partition默认数量
parallelize 中 partition 数量的优先级为:sc.parallelize(List(1,2,3,4,5,6,7),3) > conf.set("spark.default.parallelism","2") > conf.setMaster("local")
最后的partition数量为3
- parallelize中指定partition个数的详解
- 查找字符串中指定字符的个数
- 导入本地2--插入参数中指定个数的值
- Oracle实现获得字符串中指定字符个数的方法
- 统计正整数中指定数字的个数(4分)
- 微软推出的codehunt编码游戏很有意思 返回字符串中指定字符的个数
- C#,.net获取字符串中指定字符串的个数、所在位置与替换字符串
- 使用JavaScript获得tr有多少个(html中指定元素的个数)
- Excel中如何统计一行中指定类型元素的个数
- 设计一个算法,求非空二叉树中指定的第k层(k>1)的叶子节点的个数
- 删除文件中指定的部分内容
- VC中指定include文件的目录
- DropDownList中指定任意的选中项
- DataGridView中指定的单元格不能编辑
- Ext TreePanel中选中指定的节点
- 如何删除spfile中指定的参数
- 获取xml中指定节点的值
- 样式中指定调用的效果
- 《第一行代码Android》---读书笔记
- 亿级Web系统搭建——单机到分布式集群
- Unity3D中UGUI事件系统简述及使用方法总结
- 在Word2013中插入自选图形时自动创建绘图画布
- 机器蛇运动算法(二)--蠕动运动
- parallelize中指定partition个数的详解
- 使用 Go-Ethereum 1.7.2搭建以太坊私有链
- 电流型和电压型PHY
- sql server中如何循环添加字段
- VS2015发布和部署Webservice
- Android RecyclerView批量更新notifyItemRangeChanged
- javascript 动态画心
- Maven的环境搭建
- shiro框架的认证功能