Lesson 6: Spark Streaming Source Code Analysis of Dynamic Job Generation, and Some Deeper Thinking


Topics of this session:

1. Deeper thinking about Spark Streaming job generation

2. Source code analysis of Spark Streaming job generation

 

Jobs are generated dynamically based on the DStreamGraph, and JobScheduler is the core component through which Spark Streaming generates jobs and submits them to the cluster.

In Spark Streaming, besides the jobs generated on the batch timer, jobs can also be generated in other ways (various aggregations and stateful operations, for example).

1. DStreams roughly fall into three types:

1) Input DStreams.

2) Output DStreams. An output operation is a logical-level Action introduced by the Spark Streaming framework; underneath, it is still translated into a physical-level Action, that is, an RDD action.

3) Transformations that implement the intermediate business logic.

2. A DStream can be produced in two ways:

1) Directly from a data source.

2) From another DStream, producing a new DStream.

In fact, all of this processing takes the form of stream processing; only the outward appearance differs, and eventually all batch processing will be replaced by stream processing. A minimal program that exercises all three DStream types and both ways of producing a DStream is sketched below.
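The following is an illustrative sketch, not code from the original article; the host, port, and 5-second batch interval are arbitrary choices.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object ThreeKindsOfDStreams {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ThreeKindsOfDStreams").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))        // batchDuration = 5 seconds

    // 1) Input DStream: produced directly from a data source.
    val lines = ssc.socketTextStream("localhost", 9999)

    // 3) Transformations: intermediate business logic; each one produces a new DStream
    //    from an existing DStream (the second way of producing a DStream).
    val counts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)

    // 2) Output DStream: a logical-level action; each output operation becomes one job
    //    per batch and is only translated into a physical RDD action when the job runs.
    counts.print()

    ssc.start()            // this call eventually starts JobScheduler and JobGenerator
    ssc.awaitTermination()
  }
}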

3. At each batchDuration, the JobGenerator produces jobs.

4. The dependencies among DStreams are translated into dependencies among RDDs.

The logical-level jobs triggered at each batch interval are executed by the underlying, physical-level RDDs; a DStream's action operations are likewise only logical-level.

The operation on the last RDD is an action, and that action is what triggers the job, but the implementation wraps it inside a method. Inside that method, the RDD dependencies are generated from the DStream dependencies, and the final operation in that chain is the action; because it sits inside a method that has not been invoked yet, nothing executes. In other words, translation turns the DStream dependency graph into an RDD dependency graph and puts the resulting job into a queue to be managed. The action on the last DStream is translated into the action on the last RDD, and the translated content is captured as one block inside a function body; since that function body has not run, the RDDs inside it have not been computed either. When the JobScheduler decides to schedule this job, it takes a thread from its thread pool and invokes that method, and only then does the action actually run.
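To make the "wrapped in a function" point concrete, here is a tiny self-contained sketch (SimpleJob and DeferredActionSketch are made-up names, not Spark classes): the wrapped work is only described at translation time and executes solely when the scheduler finally calls run().

// A made-up stand-in for Spark Streaming's Job: it only holds a batch time and a function.
class SimpleJob(val time: Long, func: () => Unit) {
  def run(): Unit = func()       // whatever was wrapped inside executes only here
}

object DeferredActionSketch {
  def main(args: Array[String]): Unit = {
    // "Translation" step: wrap the would-be RDD action in a closure. Nothing runs yet.
    val job = new SimpleJob(System.currentTimeMillis(),
      () => println("the physical RDD action would run here"))

    // "Scheduling" step: a thread from the scheduler's pool invokes run(), and only
    // then does the wrapped action actually execute.
    job.run()
  }
}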

 

5. The three core components behind dynamic job generation from DStreams:

JobScheduler, JobGenerator, ReceiverTracker

Source code walkthrough

JobGenerator.scala

start

def start(): Unit = synchronized {
  if (eventLoop != null) return // scheduler has already been started

  logDebug("Starting JobScheduler")
  eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
    override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)

    override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
  }
  eventLoop.start()

  if (ssc.isCheckpointPresent) {
    restart()
  } else {
    startFirstTime()
  }
}

 

2. EventLoop's run method

 

private val eventThread = new Thread(name) {
  setDaemon(true)

  override def run(): Unit = {
    try {
      while (!stopped.get) {
        val event = eventQueue.take()
        try {
          onReceive(event)
        } catch {
          case NonFatal(e) => {
            try {
              onError(e)
            } catch {
              case NonFatal(e) => logError("Unexpected error in " + name, e)
            }
          }
        }
      }
    } catch {
      case ie: InterruptedException => // exit even if eventQueue is not empty
      case NonFatal(e) => logError("Unexpected error in " + name, e)
    }
  }
}
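The same pattern can be reproduced with a minimal, self-contained sketch (SimpleEventLoop is a made-up name, not Spark's EventLoop): a daemon thread blocks on a queue and dispatches every posted event to a handler, which is how GenerateJobs and the other events below get processed on a single thread.

import java.util.concurrent.LinkedBlockingDeque

class SimpleEventLoop[E](name: String)(handler: E => Unit) {
  private val queue = new LinkedBlockingDeque[E]()
  @volatile private var stopped = false

  private val thread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit =
      try {
        while (!stopped) handler(queue.take())   // block until an event is posted, then dispatch it
      } catch {
        case _: InterruptedException =>          // stop() interrupts a blocked take(); just exit
      }
  }

  def start(): Unit = thread.start()
  def post(event: E): Unit = queue.put(event)    // producers (e.g. the timer thread) enqueue events
  def stop(): Unit = { stopped = true; thread.interrupt() }
}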

 

/** Processes all events */
private def processEvent(event: JobGeneratorEvent) {
  logDebug("Got event " + event)
  event match {
    case GenerateJobs(time) => generateJobs(time)
    case ClearMetadata(time) => clearMetadata(time)
    case DoCheckpoint(time, clearCheckpointDataLater) =>
      doCheckpoint(time, clearCheckpointDataLater)
    case ClearCheckpointData(time) => clearCheckpointData(time)
  }
}

generateJobs

The process of generating jobs and submitting them:

/** Generate jobs and perform checkpoint for the given `time` */
private def generateJobs(time: Time) {
  // Set the SparkEnv in this thread, so that job generation code can access the environment
  // Example: BlockRDDs are created in this thread, and it needs to access BlockManager
  // Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
  SparkEnv.set(ssc.env)
  Try {
    // Allocate the received data to this batch based on the batch time
    jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
    graph.generateJobs(time) // generate jobs using allocated block
  } match {
    case Success(jobs) =>
      val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
      jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
    case Failure(e) =>
      jobScheduler.reportError("Error generating jobs for time " + time, e)
  }
  eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}
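If the Try { ... } match { Success/Failure } idiom used above is unfamiliar, this tiny standalone snippet (with made-up strings) shows the same control flow: the result or exception of the block decides which branch handles it.

import scala.util.{Failure, Success, Try}

val outcome = Try { "42".toInt } match {
  case Success(n) => s"submit jobs built from value $n"      // analogous to submitJobSet(...)
  case Failure(e) => s"report error: ${e.getMessage}"        // analogous to reportError(...)
}
println(outcome)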

def generateJobs(time: Time): Seq[Job] = {
  logDebug("Generating jobs for time " + time)
  val jobs = this.synchronized {
    outputStreams.flatMap { outputStream =>
      val jobOption = outputStream.generateJob(time)
      jobOption.foreach(_.setCallSite(outputStream.creationSite))
      jobOption
    }
  }
  logDebug("Generated " + jobs.length + " jobs for time " + time)
  jobs
}

 

DStream.generateJob

The generated job materializes the RDD that this DStream produces for the batch.

/**
 * Generate a SparkStreaming job for the given time. This is an internal method that
 * should not be called directly. This default implementation creates a job
 * that materializes the corresponding RDD. Subclasses of DStream may override this
 * to generate their own jobs.
 */
private[streaming] def generateJob(time: Time): Option[Job] = {
  getOrCompute(time) match {
    case Some(rdd) => {
      val jobFunc = () => {
        val emptyFunc = { (iterator: Iterator[T]) => {} }
        context.sparkContext.runJob(rdd, emptyFunc)
      }
      Some(new Job(time, jobFunc))
    }
    case None => None
  }
}
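The effect of that jobFunc can be tried on an ordinary RDD: running a job whose per-partition function discards its iterator still forces the RDD to be computed. This is a hedged sketch (the local-mode setup is illustrative), not code from the original article.

import org.apache.spark.{SparkConf, SparkContext}

object EmptyFuncJob {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("EmptyFuncJob").setMaster("local[2]"))
    val rdd = sc.parallelize(1 to 100, 4)

    val emptyFunc = (iterator: Iterator[Int]) => {}   // do nothing with the data
    sc.runJob(rdd, emptyFunc)                         // still triggers a physical Spark job

    sc.stop()
  }
}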

Job.scala

/**
 * @return the output op id of this Job. Each Job has a unique output op id in the same JobSet.
 */
def outputOpId: Int = {
  if (!isSet) {
    throw new IllegalStateException("Cannot access number before calling setId")
  }
  _outputOpId
}

 

DStream.getOrCompute

/**
 * Get the RDD corresponding to the given time; either retrieve it from cache
 * or compute-and-cache it.
 */
private[streaming] final def getOrCompute(time: Time): Option[RDD[T]] = {
  // If RDD was already generated, then retrieve it from HashMap,
  // or else compute the RDD
  generatedRDDs.get(time).orElse {
    // Compute the RDD if time is valid (e.g. correct time in a sliding window)
    // of RDD generation, else generate nothing.
    if (isTimeValid(time)) {

      val rddOption = createRDDWithLocalProperties(time, displayInnerRDDOps = false) {
        // Disable checks for existing output directories in jobs launched by the streaming
        // scheduler, since we may need to write output to an existing directory during checkpoint
        // recovery; see SPARK-4835 for more details. We need to have this call here because
        // compute() might cause Spark jobs to be launched.
        PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
          compute(time)  // generate the RDD for this batch time
        }
      }

      rddOption.foreach { case newRDD =>
        // Register the generated RDD for caching and checkpointing
        if (storageLevel != StorageLevel.NONE) {
          newRDD.persist(storageLevel)
          logDebug(s"Persisting RDD ${newRDD.id} for time $time to $storageLevel")
        }
        if (checkpointDuration != null && (time - zeroTime).isMultipleOf(checkpointDuration)) {
          newRDD.checkpoint()
          logInfo(s"Marking RDD ${newRDD.id} for time $time for checkpointing")
        }
        generatedRDDs.put(time, newRDD)
      }
      rddOption
    } else {
      None
    }
  }
}
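Stripped of the Spark-specific details, getOrCompute is a memoization pattern: look the batch time up in a map of already generated values, and only compute and register a new one on a miss. A minimal sketch of that pattern (MemoizedGenerator is a made-up name):

import scala.collection.mutable

class MemoizedGenerator[T](compute: Long => Option[T]) {
  private val generated = new mutable.HashMap[Long, T]()

  def getOrCompute(time: Long): Option[T] =
    generated.get(time).orElse {
      val value = compute(time)                    // e.g. build the RDD for this batch time
      value.foreach(v => generated.put(time, v))   // register it so later lookups reuse it
      value
    }
}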

JobGenerator.scala

startFirstTime

/** Starts the generator for the first time */
private def startFirstTime() {
  val startTime = new Time(timer.getStartTime())
  graph.start(startTime - graph.batchDuration)
  timer.start(startTime.milliseconds)
  logInfo("Started JobGenerator at " + startTime)
}

timer

private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
  longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")

RecurringTimer.start

/**
 * Start at the given start time.
 */
def start(startTime: Long): Long = synchronized {
  nextTime = startTime
  thread.start()
  logInfo("Started timer for " + name + " at time " + nextTime)
  nextTime
}

// The background thread that drives the timer
private val thread = new Thread("RecurringTimer - " + name) {
  setDaemon(true)
  override def run() { loop }
}


/**
 * Repeatedly call the callback every interval.
 */
private def loop() {
  try {
    while (!stopped) {
      triggerActionForNextInterval()
    }
    triggerActionForNextInterval()
  } catch {
    case e: InterruptedException =>
  }
}

 

private def triggerActionForNextInterval(): Unit = {
  clock.waitTillTime(nextTime)
  callback(nextTime)
  prevTime = nextTime
  nextTime += period
  logDebug("Callback for " + name + " called at time " + prevTime)
}

To see where callback comes from, we need to look at where RecurringTimer is instantiated.

class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String)
  extends Logging {

RecurringTimer is instantiated in JobGenerator.scala:

private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
  longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")
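The timer's behavior can be reproduced in a few lines (RecurringTimerSketch is a made-up name; the 1-second period and three iterations are arbitrary): wait until nextTime, fire the callback, then advance nextTime by one period, which is how one GenerateJobs event gets posted per batch interval.

object RecurringTimerSketch {
  def main(args: Array[String]): Unit = {
    val period = 1000L                             // a 1-second "batch interval"
    var nextTime = System.currentTimeMillis()
    val callback: Long => Unit =
      t => println(s"GenerateJobs would be posted for time $t")

    for (_ <- 1 to 3) {                            // three "batches", then exit
      val sleepMs = nextTime - System.currentTimeMillis()
      if (sleepMs > 0) Thread.sleep(sleepMs)       // clock.waitTillTime(nextTime)
      callback(nextTime)                           // eventLoop.post(GenerateJobs(new Time(nextTime)))
      nextTime += period                           // nextTime += period
    }
  }
}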

jobScheduler.submitJobSet

def submitJobSet(jobSet: JobSet) {
  if (jobSet.jobs.isEmpty) {
    logInfo("No jobs added for time " + jobSet.time)
  } else {
    listenerBus.post(StreamingListenerBatchSubmitted(jobSet.toBatchInfo))
    jobSets.put(jobSet.time, jobSet)
    jobSet.jobs.foreach(job => jobExecutor.execute(new JobHandler(job)))
    logInfo("Added jobs for time " + jobSet.time)
  }
}

private val jobSets: java.util.Map[Time, JobSet] = new ConcurrentHashMap[Time, JobSet]
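A side note based on the Spark source for this era of Spark Streaming (worth re-checking against your version): jobExecutor is a fixed-size daemon thread pool whose size comes from the spark.streaming.concurrentJobs setting, which defaults to 1, so the jobs of a JobSet normally execute one at a time. Raising it is an ordinary driver-side configuration entry, as in this hedged snippet:

import org.apache.spark.SparkConf

// Allow two streaming jobs (e.g. from two output operations) to run concurrently.
val conf = new SparkConf()
  .setAppName("ConcurrentJobsDemo")
  .set("spark.streaming.concurrentJobs", "2")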

 

JobHandler is simply a Runnable.

private class JobHandler(job: Job) extends Runnable with Logging {
  import JobScheduler._

  def run() {
    try {
      val formattedTime = UIUtils.formatBatchTime(
        job.time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false)
      val batchUrl = s"/streaming/batch/?id=${job.time.milliseconds}"
      val batchLinkText = s"[output operation ${job.outputOpId}, batch time ${formattedTime}]"

      ssc.sc.setJobDescription(
        s"""Streaming job from <a href="$batchUrl">$batchLinkText</a>""")
      ssc.sc.setLocalProperty(BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString)
      ssc.sc.setLocalProperty(OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString)

      // We need to assign `eventLoop` to a temp variable. Otherwise, because
      // `JobScheduler.stop(false)` may set `eventLoop` to null when this method is running, then
      // it's possible that when `post` is called, `eventLoop` happens to null.
      var _eventLoop = eventLoop
      if (_eventLoop != null) {
        _eventLoop.post(JobStarted(job, clock.getTimeMillis()))
        // Disable checks for existing output directories in jobs launched by the streaming
        // scheduler, since we may need to write output to an existing directory during checkpoint
        // recovery; see SPARK-4835 for more details.
        PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
          job.run()
        }
        _eventLoop = eventLoop
        if (_eventLoop != null) {
          _eventLoop.post(JobCompleted(job, clock.getTimeMillis()))
        }
      } else {
        // JobScheduler has been stopped.
      }
    } finally {
      ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, null)
      ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, null)
    }
  }
}

The Job invokes its wrapped function:

class Job(val time: Time, func: () => _) {
  private var _id: String = _
  private var _outputOpId: Int = _
  private var isSet = false
  private var _result: Try[_] = null
  private var _callSite: CallSite = null
  private var _startTime: Option[Long] = None
  private var _endTime: Option[Long] = None

  def run() {
    _result = Try(func())
  }
}

 

 

Blogger: 罗白莲
Source material: 王家林 (Spark customization course)
Sina Weibo: http://www.weibo.com/ilovepains

 
