Lesson 7: Spark Streaming Source Code Walkthrough: JobScheduler Internals and Deeper Reflections
Topics in this session:
1. JobScheduler internals
2. Deeper reflections on the JobScheduler
The JobScheduler is the linchpin of everything that follows.
Spark Streaming requires at least two threads: one to receive data and one to run the computation, so that scheduling and execution stay decoupled, as the sketch below illustrates.
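A minimal driver sketch (the socket source, host, and port are assumptions for illustration): with master "local[2]", one thread is free for the receiver and one for batch processing; with "local[1]" the receiver would monopolize the only core and no batch could ever run.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf().setMaster("local[2]").setAppName("TwoThreadsDemo")
val ssc = new StreamingContext(conf, Seconds(1))
val lines = ssc.socketTextStream("localhost", 9999) // assumed input source
lines.print() // an output operation, so the graph generates at least one job per batch
ssc.start()
ssc.awaitTermination()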
Source code
StreamingContext.scala
start()
/**
 * Start the execution of the streams.
 *
 * @throws IllegalStateException if the StreamingContext is already stopped.
 */
def start(): Unit = synchronized {
  state match {
    case INITIALIZED =>
      startSite.set(DStream.getCreationSite())
      StreamingContext.ACTIVATION_LOCK.synchronized {
        StreamingContext.assertNoOtherContextIsActive()
        try {
          validate()
          // Start the streaming scheduler in a new thread, so that thread local properties
          // like call sites and job groups can be reset without affecting those of the
          // current thread.
          // A new thread is started at the scheduling layer.
          ThreadUtils.runInNewThread("streaming-start") {
            sparkContext.setCallSite(startSite.get)
            sparkContext.clearJobGroup()
            sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
            scheduler.start()
          }
          state = StreamingContextState.ACTIVE
        } catch {
          case NonFatal(e) =>
            logError("Error starting the context, marking it as stopped", e)
            scheduler.stop(false)
            state = StreamingContextState.STOPPED
            throw e
        }
        StreamingContext.setActiveContext(this)
      }
      shutdownHookRef = ShutdownHookManager.addShutdownHook(
        StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
      // Registering Streaming Metrics at the start of the StreamingContext
      assert(env.metricsSystem != null)
      env.metricsSystem.registerSource(streamingSource)
      uiTab.foreach(_.attach())
      logInfo("StreamingContext started")
    case ACTIVE =>
      logWarning("StreamingContext has already been started")
    case STOPPED =>
      throw new IllegalStateException("StreamingContext has already been stopped")
  }
}
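Read against the state match above, the lifecycle that start() enforces can be summarized with a hypothetical driver snippet (conf is an assumed SparkConf, not from the original article):

val ssc = new StreamingContext(conf, Seconds(1)) // state: INITIALIZED
// ... define DStream operations ...
ssc.start()   // INITIALIZED -> ACTIVE; scheduler.start() runs on the "streaming-start" thread
ssc.start()   // already ACTIVE: merely logs "StreamingContext has already been started"
ssc.stop()    // ACTIVE -> STOPPED
// ssc.start() // now STOPPED: would throw IllegalStateException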
runInNewThread
def runInNewThread[T](
    threadName: String,
    isDaemon: Boolean = true)(body: => T): T = {
  @volatile var exception: Option[Throwable] = None
  @volatile var result: T = null.asInstanceOf[T]
  val thread = new Thread(threadName) {
    override def run(): Unit = {
      try {
        result = body
      } catch {
        case NonFatal(e) =>
          exception = Some(e)
      }
    }
  }
  thread.setDaemon(isDaemon)
  thread.start()
  thread.join()
  exception match {
    case Some(realException) =>
      // Remove the part of the stack that shows method calls into this helper method
      // This means drop everything from the top until the stack element
      // ThreadUtils.runInNewThread(), and then drop that as well (hence the `drop(1)`).
      val baseStackTrace = Thread.currentThread().getStackTrace().dropWhile(
        ! _.getClassName.contains(this.getClass.getSimpleName)).drop(1)
      // Remove the part of the new thread stack that shows methods call from this helper method
      val extraStackTrace = realException.getStackTrace.takeWhile(
        ! _.getClassName.contains(this.getClass.getSimpleName))
      // Combine the two stack traces, with a place holder just specifying that there
      // was a helper method used, without any further details of the helper
      val placeHolderStackElem = new StackTraceElement(
        s"... run in separate thread using ${ThreadUtils.getClass.getName.stripSuffix("$")} ..",
        " ", "", -1)
      val finalStackTrace = extraStackTrace ++ Seq(placeHolderStackElem) ++ baseStackTrace
      // Update the stack trace and rethrow the exception in the caller thread
      realException.setStackTrace(finalStackTrace)
      throw realException
    case None =>
      result
  }
}
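A usage sketch of this helper (ThreadUtils is Spark-internal, so this is purely illustrative): the body executes on a freshly named thread, the caller blocks on join(), and a non-fatal exception would be rethrown in the caller with the stitched stack trace built above.

val answer: Int = ThreadUtils.runInNewThread("demo") {
  21 * 2 // runs on the thread named "demo"; the value is handed back to the caller
}
assert(answer == 42)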
private[streaming] val scheduler = new JobScheduler(this)
In JobScheduler.scala:
private val jobExecutor =
  ThreadUtils.newDaemonFixedThreadPool(numConcurrentJobs, "streaming-job-executor")
newDaemonFixedThreadPool
/**
 * Wrapper over newFixedThreadPool. Thread names are formatted as prefix-ID, where ID is a
 * unique, sequentially assigned integer.
 */
def newDaemonFixedThreadPool(nThreads: Int, prefix: String): ThreadPoolExecutor = {
  val threadFactory = namedThreadFactory(prefix)
  Executors.newFixedThreadPool(nThreads, threadFactory).asInstanceOf[ThreadPoolExecutor]
}
scheduler.start()
def start(): Unit = synchronized {
  if (eventLoop != null) return // scheduler has already been started
  logDebug("Starting JobScheduler")
  eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
    override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)
    override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
  }
  eventLoop.start()
  // attach rate controllers of input streams to receive batch completion updates
  for {
    inputDStream <- ssc.graph.getInputStreams
    rateController <- inputDStream.rateController
  } ssc.addStreamingListener(rateController)
  listenerBus.start(ssc.sparkContext)
  receiverTracker = new ReceiverTracker(ssc)
  inputInfoTracker = new InputInfoTracker(ssc)
  receiverTracker.start()
  jobGenerator.start()
  logInfo("Started JobScheduler")
}
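The listenerBus started here carries the same StreamingListener events that the rate controllers subscribe to; a user listener can be registered the same way. A sketch (the handler body is hypothetical):

import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerBatchCompleted}

ssc.addStreamingListener(new StreamingListener {
  override def onBatchCompleted(batch: StreamingListenerBatchCompleted): Unit = {
    // processingDelay is an Option[Long] in milliseconds
    println(s"batch ${batch.batchInfo.batchTime} took ${batch.batchInfo.processingDelay.getOrElse(-1L)} ms")
  }
})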
JobGenerator.start
/** Start generation of jobs */
def start(): Unit = synchronized {
  if (eventLoop != null) return // generator has already been started
  // Call checkpointWriter here to initialize it before eventLoop uses it to avoid a deadlock.
  // See SPARK-10125
  checkpointWriter
  eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") {
    override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event)
    override protected def onError(e: Throwable): Unit = {
      jobScheduler.reportError("Error in job generator", e)
    }
  }
  eventLoop.start()
  if (ssc.isCheckpointPresent) {
    restart()
  } else {
    startFirstTime()
  }
}
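Which branch runs depends on whether a checkpoint is present. A driver-side sketch that exercises both paths (conf is an assumed SparkConf and the checkpoint directory is illustrative):

import org.apache.spark.streaming.{Seconds, StreamingContext}

def createContext(): StreamingContext = {
  val newSsc = new StreamingContext(conf, Seconds(1))
  newSsc.checkpoint("/tmp/streaming-checkpoint") // illustrative path
  // ... define DStream operations ...
  newSsc
}

// First launch: no checkpoint exists yet, so createContext() is invoked and
// JobGenerator takes the startFirstTime() path. After a driver restart the
// context is rebuilt from the checkpoint, ssc.isCheckpointPresent is true,
// and restart() runs instead.
val ssc = StreamingContext.getOrCreate("/tmp/streaming-checkpoint", createContext _)
ssc.start()
ssc.awaitTermination()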
ForEachDStream
private[streaming]
class ForEachDStream[T: ClassTag] (
    parent: DStream[T],
    foreachFunc: (RDD[T], Time) => Unit,
    displayInnerRDDOps: Boolean
  ) extends DStream[Unit](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[Unit]] = None

  override def generateJob(time: Time): Option[Job] = {
    parent.getOrCompute(time) match {
      case Some(rdd) =>
        val jobFunc = () => createRDDWithLocalProperties(time, displayInnerRDDOps) {
          foreachFunc(rdd, time)
        }
        Some(new Job(time, jobFunc))
      case None => None
    }
  }
}
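This is the class behind foreachRDD: the user closure becomes foreachFunc, and generateJob wraps it into a Job for each batch. A sketch (lines is an assumed input DStream):

lines.foreachRDD { (rdd, time) =>
  // this closure is the foreachFunc above; it executes inside job.run() for each batch
  println(s"batch $time contains ${rdd.count()} records")
}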
DStreamGraph.scala
def generateJobs(time: Time): Seq[Job] = {
  logDebug("Generating jobs for time " + time)
  val jobs = this.synchronized {
    outputStreams.flatMap { outputStream =>
      val jobOption = outputStream.generateJob(time)
      jobOption.foreach(_.setCallSite(outputStream.creationSite))
      jobOption
    }
  }
  logDebug("Generated " + jobs.length + " jobs for time " + time)
  jobs
}

private val outputStreams = new ArrayBuffer[DStream[_]]()
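The outputStreams buffer is filled when an output operation registers its DStream with the graph; with no output operation, generateJobs returns an empty Seq and the batch has nothing to do. A one-line sketch (words is an assumed DStream):

words.print() // implemented via a ForEachDStream that registers itself with the DStreamGraph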
JobGenerator.generateJobs
A JobSet groups the jobs generated for one batch time; streamIdToInputInfos records the input data those jobs are to process.
/** Generate jobs and perform checkpoint for the given `time`. */
private def generateJobs(time: Time) {
  // Set the SparkEnv in this thread, so that job generation code can access the environment
  // Example: BlockRDDs are created in this thread, and it needs to access BlockManager
  // Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
  SparkEnv.set(ssc.env)
  Try {
    jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
    graph.generateJobs(time) // generate jobs using allocated block
  } match {
    case Success(jobs) =>
      val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
      jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
    case Failure(e) =>
      jobScheduler.reportError("Error generating jobs for time " + time, e)
  }
  eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}
def submitJobSet(jobSet: JobSet) {
  if (jobSet.jobs.isEmpty) {
    logInfo("No jobs added for time " + jobSet.time)
  } else {
    listenerBus.post(StreamingListenerBatchSubmitted(jobSet.toBatchInfo))
    jobSets.put(jobSet.time, jobSet)
    jobSet.jobs.foreach(job => jobExecutor.execute(new JobHandler(job)))
    logInfo("Added jobs for time " + jobSet.time)
  }
}
JobScheduler
private class JobHandler(job: Job) extends Runnable with Logging {
  import JobScheduler._

  def run() {
    try {
      val formattedTime = UIUtils.formatBatchTime(
        job.time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false)
      val batchUrl = s"/streaming/batch/?id=${job.time.milliseconds}"
      val batchLinkText = s"[output operation ${job.outputOpId}, batch time ${formattedTime}]"

      ssc.sc.setJobDescription(
        s"""Streaming job from <a href="$batchUrl">$batchLinkText</a>""")
      ssc.sc.setLocalProperty(BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString)
      ssc.sc.setLocalProperty(OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString)

      // We need to assign `eventLoop` to a temp variable. Otherwise, because
      // `JobScheduler.stop(false)` may set `eventLoop` to null when this method is running,
      // it's possible that when `post` is called, `eventLoop` happens to be null.
      var _eventLoop = eventLoop
      if (_eventLoop != null) {
        _eventLoop.post(JobStarted(job, clock.getTimeMillis()))
        // Disable checks for existing output directories in jobs launched by the streaming
        // scheduler, since we may need to write output to an existing directory during checkpoint
        // recovery; see SPARK-4835 for more details.
        PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
          job.run()
        }
        _eventLoop = eventLoop
        if (_eventLoop != null) {
          _eventLoop.post(JobCompleted(job, clock.getTimeMillis()))
        }
      } else {
        // JobScheduler has been stopped.
      }
    } finally {
      ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, null)
      ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, null)
    }
  }
}
private var eventLoop: EventLoop[JobSchedulerEvent] = null

def start(): Unit = synchronized {
  if (eventLoop != null) return // scheduler has already been started
  logDebug("Starting JobScheduler")
  eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
    override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)
    override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
  }
  eventLoop.start()
  // ... (rest of start() as shown above)
}
private def processEvent(event: JobSchedulerEvent) {
  try {
    event match {
      case JobStarted(job, startTime) => handleJobStart(job, startTime)
      case JobCompleted(job, completedTime) => handleJobCompletion(job, completedTime)
      case ErrorReported(m, e) => handleError(m, e)
    }
  } catch {
    case e: Throwable =>
      reportError("Error in job scheduler", e)
  }
}
private def handleJobStart(job: Job, startTime: Long) {
  val jobSet = jobSets.get(job.time)
  val isFirstJobOfJobSet = !jobSet.hasStarted
  jobSet.handleJobStart(job)
  if (isFirstJobOfJobSet) {
    // "StreamingListenerBatchStarted" should be posted after calling "handleJobStart" to get the
    // correct "jobSet.processingStartTime".
    listenerBus.post(StreamingListenerBatchStarted(jobSet.toBatchInfo))
  }
  job.setStartTime(startTime)
  listenerBus.post(StreamingListenerOutputOperationStarted(job.toOutputOperationInfo))
  logInfo("Starting job " + job.id + " from job set of time " + jobSet.time)
}
In JobSet.scala:
def handleJobStart(job: Job) {
  if (processingStartTime < 0) processingStartTime = System.currentTimeMillis()
}
Back in the JobScheduler.JobHandler inner class, the run() method calls job.run().
The business logic is encapsulated in the Job's func function.
class Job(val time: Time, func: () => _) {
  private var _id: String = _
  private var _outputOpId: Int = _
  private var isSet = false
  private var _result: Try[_] = null
  private var _callSite: CallSite = null
  private var _startTime: Option[Long] = None
  private var _endTime: Option[Long] = None

  def run() {
    _result = Try(func())
  }

  def result: Try[_] = {
    if (_result == null) {
      throw new IllegalStateException("Cannot access result before job finishes")
    }
    _result
  }
}
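To make the encapsulation concrete, here is a minimal standalone imitation of this pattern (a sketch, not Spark's Job class): the logic is captured as a zero-argument function, deferred until run(), and its outcome recorded as a Try.

import scala.util.{Failure, Success, Try}

class MiniJob(func: () => Unit) {
  private var _result: Try[Unit] = null
  def run(): Unit = { _result = Try(func()) } // execute the deferred logic, capture the outcome
  def result: Try[Unit] = _result
}

val job = new MiniJob(() => println("processing batch"))
job.run()
job.result match {
  case Success(_) => println("job succeeded")
  case Failure(e) => println(s"job failed: $e")
}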
ForEachDStream.scala
override def generateJob(time: Time): Option[Job] = {
  parent.getOrCompute(time) match {
    case Some(rdd) =>
      val jobFunc = () => createRDDWithLocalProperties(time, displayInnerRDDOps) {
        foreachFunc(rdd, time)
      }
      Some(new Job(time, jobFunc))
    case None => None
  }
}
To make jobs execute concurrently, raise the scheduler's concurrency (read in JobScheduler); a configuration sketch follows the snippet.
private val numConcurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1)
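A configuration sketch (the app name and core count are assumptions; note this knob is undocumented, and with more than one concurrent job batches may complete out of order):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf()
  .setMaster("local[4]")
  .setAppName("ConcurrentJobsDemo")
  .set("spark.streaming.concurrentJobs", "2") // default is 1, read by JobScheduler above
val ssc = new StreamingContext(conf, Seconds(1))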
Blogger: 罗白莲
Source material: 王家林 (Spark version customization class course)
Sina Weibo: http://www.weibo.com/ilovepains