Lesson 6: Spark Streaming Source Code Analysis of Dynamic Job Generation, and Some Deeper Thinking


Topics of this session:

1. Deeper thinking about Spark Streaming job generation

2. Source code analysis of Spark Streaming job generation

 

Jobs are generated dynamically based on the DStreamGraph, and JobScheduler is the core component through which Spark Streaming generates jobs and submits them to the cluster.

In Spark Streaming, besides the jobs generated on the batch timer, jobs can also be generated in other ways (various aggregations and stateful operations, for example).

1. DStreams roughly fall into three types:

1) Input DStreams.

2) Output DStreams. An output operation is a logical-level Action introduced by the Spark Streaming framework; underneath, it is still translated into a physical-level Action, that is, an RDD action.

3) Transformations that implement the intermediate business logic.

2. A DStream can be produced in two ways:

1) Directly from a data source.

2) From another DStream, producing a new DStream.

In fact, all of this processing takes the form of stream processing; only the outward appearance differs, and eventually all batch processing will be replaced by stream processing. A minimal program that exercises all three DStream types and both ways of producing a DStream is sketched below.
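The following is an illustrative sketch, not code from the original article; the host, port, and 5-second batch interval are arbitrary choices.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object ThreeKindsOfDStreams {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ThreeKindsOfDStreams").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))        // batchDuration = 5 seconds

    // 1) Input DStream: produced directly from a data source.
    val lines = ssc.socketTextStream("localhost", 9999)

    // 3) Transformations: intermediate business logic; each one produces a new DStream
    //    from an existing DStream (the second way of producing a DStream).
    val counts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)

    // 2) Output DStream: a logical-level action; each output operation becomes one job
    //    per batch and is only translated into a physical RDD action when the job runs.
    counts.print()

    ssc.start()            // this call eventually starts JobScheduler and JobGenerator
    ssc.awaitTermination()
  }
}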

3. At each batchDuration, the JobGenerator produces jobs.

4. The dependencies among DStreams are translated into dependencies among RDDs.

The logical-level jobs triggered at each batch interval are executed by the underlying, physical-level RDDs; a DStream's action operations are likewise only logical-level.

The operation on the last RDD is an action, and that action is what triggers the job, but the implementation wraps it inside a method. Inside that method, the RDD dependencies are generated from the DStream dependencies, and the final operation in that chain is the action; because it sits inside a method that has not been invoked yet, nothing executes. In other words, translation turns the DStream dependency graph into an RDD dependency graph and puts the resulting job into a queue to be managed. The action on the last DStream is translated into the action on the last RDD, and the translated content is captured as one block inside a function body; since that function body has not run, the RDDs inside it have not been computed either. When the JobScheduler decides to schedule this job, it takes a thread from its thread pool and invokes that method, and only then does the action actually run.
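To make the "wrapped in a function" point concrete, here is a tiny self-contained sketch (SimpleJob and DeferredActionSketch are made-up names, not Spark classes): the wrapped work is only described at translation time and executes solely when the scheduler finally calls run().

// A made-up stand-in for Spark Streaming's Job: it only holds a batch time and a function.
class SimpleJob(val time: Long, func: () => Unit) {
  def run(): Unit = func()       // whatever was wrapped inside executes only here
}

object DeferredActionSketch {
  def main(args: Array[String]): Unit = {
    // "Translation" step: wrap the would-be RDD action in a closure. Nothing runs yet.
    val job = new SimpleJob(System.currentTimeMillis(),
      () => println("the physical RDD action would run here"))

    // "Scheduling" step: a thread from the scheduler's pool invokes run(), and only
    // then does the wrapped action actually execute.
    job.run()
  }
}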

 

5. The three core components behind dynamic job generation from DStreams:

JobScheduler, JobGenerator, ReceiverTracker

Source code walkthrough

JobGenerator.scala

start

def start(): Unit = synchronized {
  if (eventLoop != null) return // scheduler has already been started

  logDebug("Starting JobScheduler")
  eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
    override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)

    override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
  }
  eventLoop.start()

  if (ssc.isCheckpointPresent) {
    restart()
  } else {
    startFirstTime()
  }
}

 

2. EventLoop's run method

 

private val eventThread = new Thread(name) {
  setDaemon(true)

  override def run(): Unit = {
    try {
      while (!stopped.get) {
        val event = eventQueue.take()
        try {
          onReceive(event)
        } catch {
          case NonFatal(e) => {
            try {
              onError(e)
            } catch {
              case NonFatal(e) => logError("Unexpected error in " + name, e)
            }
          }
        }
      }
    } catch {
      case ie: InterruptedException => // exit even if eventQueue is not empty
      case NonFatal(e) => logError("Unexpected error in " + name, e)
    }
  }
}
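The same pattern can be reproduced with a minimal, self-contained sketch (SimpleEventLoop is a made-up name, not Spark's EventLoop): a daemon thread blocks on a queue and dispatches every posted event to a handler, which is how GenerateJobs and the other events below get processed on a single thread.

import java.util.concurrent.LinkedBlockingDeque

class SimpleEventLoop[E](name: String)(handler: E => Unit) {
  private val queue = new LinkedBlockingDeque[E]()
  @volatile private var stopped = false

  private val thread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit =
      try {
        while (!stopped) handler(queue.take())   // block until an event is posted, then dispatch it
      } catch {
        case _: InterruptedException =>          // stop() interrupts a blocked take(); just exit
      }
  }

  def start(): Unit = thread.start()
  def post(event: E): Unit = queue.put(event)    // producers (e.g. the timer thread) enqueue events
  def stop(): Unit = { stopped = true; thread.interrupt() }
}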

 

/** Processes all events */
private def processEvent(event: JobGeneratorEvent) {
  logDebug("Got event " + event)
  event match {
    case GenerateJobs(time) => generateJobs(time)
    case ClearMetadata(time) => clearMetadata(time)
    case DoCheckpoint(time, clearCheckpointDataLater) =>
      doCheckpoint(time, clearCheckpointDataLater)
    case ClearCheckpointData(time) => clearCheckpointData(time)
  }
}

generateJobs

The process of generating jobs and submitting them:

/** Generate jobs and perform checkpoint for the given `time` */
private def generateJobs(time: Time) {
  // Set the SparkEnv in this thread, so that job generation code can access the environment
  // Example: BlockRDDs are created in this thread, and it needs to access BlockManager
  // Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
  SparkEnv.set(ssc.env)
  Try {
    // Allocate the received data to this batch based on the batch time
    jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
    graph.generateJobs(time) // generate jobs using allocated block
  } match {
    case Success(jobs) =>
      val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
      jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
    case Failure(e) =>
      jobScheduler.reportError("Error generating jobs for time " + time, e)
  }
  eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}
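If the Try { ... } match { Success/Failure } idiom used above is unfamiliar, this tiny standalone snippet (with made-up strings) shows the same control flow: the result or exception of the block decides which branch handles it.

import scala.util.{Failure, Success, Try}

val outcome = Try { "42".toInt } match {
  case Success(n) => s"submit jobs built from value $n"      // analogous to submitJobSet(...)
  case Failure(e) => s"report error: ${e.getMessage}"        // analogous to reportError(...)
}
println(outcome)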

def generateJobs(time: Time): Seq[Job] = {
  logDebug("Generating jobs for time " + time)
  val jobs = this.synchronized {
    outputStreams.flatMap { outputStream =>
      val jobOption = outputStream.generateJob(time)
      jobOption.foreach(_.setCallSite(outputStream.creationSite))
      jobOption
    }
  }
  logDebug("Generated " + jobs.length + " jobs for time " + time)
  jobs
}

 

DStream.generateJob

The generated job materializes the RDD that this DStream produces for the batch.

/**
 * Generate a SparkStreaming job for the given time. This is an internal method that
 * should not be called directly. This default implementation creates a job
 * that materializes the corresponding RDD. Subclasses of DStream may override this
 * to generate their own jobs.
 */
private[streaming] def generateJob(time: Time): Option[Job] = {
  getOrCompute(time) match {
    case Some(rdd) => {
      val jobFunc = () => {
        val emptyFunc = { (iterator: Iterator[T]) => {} }
        context.sparkContext.runJob(rdd, emptyFunc)
      }
      Some(new Job(time, jobFunc))
    }
    case None => None
  }
}
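The effect of that jobFunc can be tried on an ordinary RDD: running a job whose per-partition function discards its iterator still forces the RDD to be computed. This is a hedged sketch (the local-mode setup is illustrative), not code from the original article.

import org.apache.spark.{SparkConf, SparkContext}

object EmptyFuncJob {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("EmptyFuncJob").setMaster("local[2]"))
    val rdd = sc.parallelize(1 to 100, 4)

    val emptyFunc = (iterator: Iterator[Int]) => {}   // do nothing with the data
    sc.runJob(rdd, emptyFunc)                         // still triggers a physical Spark job

    sc.stop()
  }
}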

Job.scala

/**
 * @return the output op id of this Job. Each Job has a unique output op id in the same JobSet.
 */
def outputOpId: Int = {
  if (!isSet) {
    throw new IllegalStateException("Cannot access number before calling setId")
  }
  _outputOpId
}

 

DStream.getOrCompute

/**
 * Get the RDD corresponding to the given time; either retrieve it from cache
 * or compute-and-cache it.
 */
private[streaming] final def getOrCompute(time: Time): Option[RDD[T]] = {
  // If RDD was already generated, then retrieve it from HashMap,
  // or else compute the RDD
  generatedRDDs.get(time).orElse {
    // Compute the RDD if time is valid (e.g. correct time in a sliding window)
    // of RDD generation, else generate nothing.
    if (isTimeValid(time)) {

      val rddOption = createRDDWithLocalProperties(time, displayInnerRDDOps = false) {
        // Disable checks for existing output directories in jobs launched by the streaming
        // scheduler, since we may need to write output to an existing directory during checkpoint
        // recovery; see SPARK-4835 for more details. We need to have this call here because
        // compute() might cause Spark jobs to be launched.
        PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
          compute(time)  // generate the RDD for this batch time
        }
      }

      rddOption.foreach { case newRDD =>
        // Register the generated RDD for caching and checkpointing
        if (storageLevel != StorageLevel.NONE) {
          newRDD.persist(storageLevel)
          logDebug(s"Persisting RDD ${newRDD.id} for time $time to $storageLevel")
        }
        if (checkpointDuration != null && (time - zeroTime).isMultipleOf(checkpointDuration)) {
          newRDD.checkpoint()
          logInfo(s"Marking RDD ${newRDD.id} for time $time for checkpointing")
        }
        generatedRDDs.put(time, newRDD)
      }
      rddOption
    } else {
      None
    }
  }
}
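Stripped of the Spark-specific details, getOrCompute is a memoization pattern: look the batch time up in a map of already generated values, and only compute and register a new one on a miss. A minimal sketch of that pattern (MemoizedGenerator is a made-up name):

import scala.collection.mutable

class MemoizedGenerator[T](compute: Long => Option[T]) {
  private val generated = new mutable.HashMap[Long, T]()

  def getOrCompute(time: Long): Option[T] =
    generated.get(time).orElse {
      val value = compute(time)                    // e.g. build the RDD for this batch time
      value.foreach(v => generated.put(time, v))   // register it so later lookups reuse it
      value
    }
}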

JobGenerator.scala

startFirstTime

/** Starts the generator for the first time */
private def startFirstTime() {
  val startTime = new Time(timer.getStartTime())
  graph.start(startTime - graph.batchDuration)
  timer.start(startTime.milliseconds)
  logInfo("Started JobGenerator at " + startTime)
}

timer

private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
  longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")

RecurringTimer.start

/**
 * Start at the given start time.
 */
def start(startTime: Long): Long = synchronized {
  nextTime = startTime
  thread.start()
  logInfo("Started timer for " + name + " at time " + nextTime)
  nextTime
}

// The background thread that drives the timer
private val thread = new Thread("RecurringTimer - " + name) {
  setDaemon(true)
  override def run() { loop }
}


/**
 * Repeatedly call the callback every interval.
 */
private def loop() {
  try {
    while (!stopped) {
      triggerActionForNextInterval()
    }
    triggerActionForNextInterval()
  } catch {
    case e: InterruptedException =>
  }
}

 

private def triggerActionForNextInterval(): Unit = {
  clock.waitTillTime(nextTime)
  callback(nextTime)
  prevTime = nextTime
  nextTime += period
  logDebug("Callback for " + name + " called at time " + prevTime)
}

To see where callback comes from, we need to look at where RecurringTimer is instantiated.

class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String)
  extends Logging {

RecurringTimer is instantiated in JobGenerator.scala:

private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
  longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")
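The timer's behavior can be reproduced in a few lines (RecurringTimerSketch is a made-up name; the 1-second period and three iterations are arbitrary): wait until nextTime, fire the callback, then advance nextTime by one period, which is how one GenerateJobs event gets posted per batch interval.

object RecurringTimerSketch {
  def main(args: Array[String]): Unit = {
    val period = 1000L                             // a 1-second "batch interval"
    var nextTime = System.currentTimeMillis()
    val callback: Long => Unit =
      t => println(s"GenerateJobs would be posted for time $t")

    for (_ <- 1 to 3) {                            // three "batches", then exit
      val sleepMs = nextTime - System.currentTimeMillis()
      if (sleepMs > 0) Thread.sleep(sleepMs)       // clock.waitTillTime(nextTime)
      callback(nextTime)                           // eventLoop.post(GenerateJobs(new Time(nextTime)))
      nextTime += period                           // nextTime += period
    }
  }
}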

jobScheduler.submitJobSet

def submitJobSet(jobSet: JobSet) {
  if (jobSet.jobs.isEmpty) {
    logInfo("No jobs added for time " + jobSet.time)
  } else {
    listenerBus.post(StreamingListenerBatchSubmitted(jobSet.toBatchInfo))
    jobSets.put(jobSet.time, jobSet)
    jobSet.jobs.foreach(job => jobExecutor.execute(new JobHandler(job)))
    logInfo("Added jobs for time " + jobSet.time)
  }
}

private val jobSets: java.util.Map[Time, JobSet] = new ConcurrentHashMap[Time, JobSet]
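A side note based on the Spark source for this era of Spark Streaming (worth re-checking against your version): jobExecutor is a fixed-size daemon thread pool whose size comes from the spark.streaming.concurrentJobs setting, which defaults to 1, so the jobs of a JobSet normally execute one at a time. Raising it is an ordinary driver-side configuration entry, as in this hedged snippet:

import org.apache.spark.SparkConf

// Allow two streaming jobs (e.g. from two output operations) to run concurrently.
val conf = new SparkConf()
  .setAppName("ConcurrentJobsDemo")
  .set("spark.streaming.concurrentJobs", "2")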

 

JobHandler is simply a Runnable.

private class JobHandler(job: Job) extends Runnable with Logging {
  import JobScheduler._

  def run() {
    try {
      val formattedTime = UIUtils.formatBatchTime(
        job.time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false)
      val batchUrl = s"/streaming/batch/?id=${job.time.milliseconds}"
      val batchLinkText = s"[output operation ${job.outputOpId}, batch time ${formattedTime}]"

      ssc.sc.setJobDescription(
        s"""Streaming job from <a href="$batchUrl">$batchLinkText</a>""")
      ssc.sc.setLocalProperty(BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString)
      ssc.sc.setLocalProperty(OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString)

      // We need to assign `eventLoop` to a temp variable. Otherwise, because
      // `JobScheduler.stop(false)` may set `eventLoop` to null when this method is running, then
      // it's possible that when `post` is called, `eventLoop` happens to null.
      var _eventLoop = eventLoop
      if (_eventLoop != null) {
        _eventLoop.post(JobStarted(job, clock.getTimeMillis()))
        // Disable checks for existing output directories in jobs launched by the streaming
        // scheduler, since we may need to write output to an existing directory during checkpoint
        // recovery; see SPARK-4835 for more details.
        PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
          job.run()
        }
        _eventLoop = eventLoop
        if (_eventLoop != null) {
          _eventLoop.post(JobCompleted(job, clock.getTimeMillis()))
        }
      } else {
        // JobScheduler has been stopped.
      }
    } finally {
      ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, null)
      ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, null)
    }
  }
}

The Job invokes its wrapped function:

class Job(val time: Time, func: () => _) {
  private var _id: String = _
  private var _outputOpId: Int = _
  private var isSet = false
  private var _result: Try[_] = null
  private var _callSite: CallSite = null
  private var _startTime: Option[Long] = None
  private var _endTime: Option[Long] = None

  def run() {
    _result = Try(func())
  }
}

 

 

Blogger: 罗白莲
Source material: 王家林 (Spark customization course)
Sina Weibo: http://www.weibo.com/ilovepains

 
