Lesson 6: Spark Streaming Source Code Analysis: Dynamic Job Generation and In-Depth Thinking
1. Three types of DStreams:
1) Input DStreams: from sources such as Kafka, sockets, or Flume;
2) Output DStreams: each is a logical-level Action introduced by the Spark Streaming framework, and is ultimately translated into a physical-level Action, i.e., an RDD Action;
3) Intermediate Transformations: the business-logic processing in between.
2. A DStream is produced in one of two ways: from (1) a data source or (2) another DStream:
1) built directly on a data source;
2) produced by applying an operation to an existing DStream.
3. From this viewpoint, data that is not stream-processed, or that has no relation to stream processing, carries no value.
Spark Streaming is stream processing triggered by time:
val ssc = new StreamingContext(conf, Seconds(5))
Every 5 seconds the JobGenerator produces one Job, at the logical level.
A logical-level Job describes what to do, but nothing has been done yet; actual execution is triggered by a physical-level Action underneath.
A DStream Action operation is likewise logical-level: it is wrapped behind the Runnable interface and does not directly generate a physical-level Job, which gives the framework an opportunity to schedule and optimize.
DStream dependencies vs. RDD dependencies
The DStream dependency graph is translated into an RDD dependency graph. Because the last operation in a DStream lineage is always an Action, the last operation in the translated RDD lineage is also an Action. If that Action were executed directly during translation, a Job would be generated on the spot; there would be no job queue, and the Job would escape the scheduler's management. Instead, the translated RDD computation is placed inside a method: the RDDs are indeed produced by the translation, but only as part of a method body that has merely been defined, not executed, so none of those RDDs can run yet. Only when the JobScheduler schedules the Job does it take a thread from its thread pool and execute that wrapped method.
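The deferred-execution idea above can be sketched in Python (the names `Job` and `make_job` are illustrative, not Spark's API): the translated lineage is captured in a function that is only defined at translation time, and a scheduler thread pool invokes it later.

```python
from concurrent.futures import ThreadPoolExecutor

class Job:
    """A logical-level job: it knows *what* to do, but does nothing until run()."""
    def __init__(self, batch_time, job_func):
        self.batch_time = batch_time
        self.job_func = job_func  # the deferred "RDD action"

    def run(self):
        return self.job_func()

def make_job(batch_time, data):
    # "Translation": the body below is only defined here, not executed.
    # Nothing runs until a scheduler thread calls job.run().
    def job_func():
        return sum(x * 2 for x in data)  # stand-in for a map + action pipeline
    return Job(batch_time, job_func)

job = make_job(batch_time=5000, data=[1, 2, 3])
with ThreadPoolExecutor(max_workers=1) as pool:  # the scheduler's thread pool
    result = pool.submit(job.run).result()
print(result)  # 12
```

Defining the job and running it are deliberately separated, which is exactly what lets the real JobScheduler queue and manage jobs before any physical work happens.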
Source code
JobGenerator: responsible for generating Jobs;
JobScheduler: responsible for scheduling Jobs;
ReceiverTracker: responsible for receiving data;
JobGenerator and ReceiverTracker are in fact members of JobScheduler.
// JobScheduler's start method makes two calls:
receiverTracker.start()
jobGenerator.start()

// JobGenerator's start method
def start(): Unit = synchronized {
  if (eventLoop != null) return // generator has already been started
  ……
  // anonymous inner class
  eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") {
    // overrides EventLoop's methods
    override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event)
    override protected def onError(e: Throwable): Unit = {
      jobScheduler.reportError("Error in job generator", e)
    }
  }
  eventLoop.start() // start the eventLoop
  if (ssc.isCheckpointPresent) {
    restart()
  } else {
    startFirstTime()
  }
}

/** Starts the generator for the first time */
private def startFirstTime() {
val startTime = new Time(timer.getStartTime())
graph.start(startTime - graph.batchDuration)
timer.start(startTime.milliseconds)
logInfo("Started JobGenerator at " + startTime)
}

// the Timer's start method, which starts the timer thread
def start(startTime: Long): Long = synchronized {
nextTime = startTime
thread.start()
logInfo("Started timer for " + name + " at time " + nextTime)
nextTime
}

// Below: the definition of EventLoop and its start method
/**
 * An event loop to receive events from the caller and process all events in the event thread. It
 * will start an exclusive event thread to process all events.
 *
 * Note: The event queue will grow indefinitely. So subclasses should make sure `onReceive` can
 * handle events in time to avoid the potential OOM.
 */
private[spark] abstract class EventLoop[E](name: String) extends Logging {

  private val eventQueue: BlockingQueue[E] = new LinkedBlockingDeque[E]()

  private val stopped = new AtomicBoolean(false)

  private val eventThread = new Thread(name) {
    setDaemon(true) // run as a daemon (background) thread
    override def run(): Unit = {
      try {
        while (!stopped.get) {
          val event = eventQueue.take() // keep taking events off the queue
          try {
            onReceive(event)
          } catch {
            case NonFatal(e) =>
              try {
                onError(e)
              } catch {
                case NonFatal(e) => logError("Unexpected error in " + name, e)
              }
          }
        }
      } catch {
        case ie: InterruptedException => // exit even if eventQueue is not empty
        case NonFatal(e) => logError("Unexpected error in " + name, e)
      }
    }
  }

  def start(): Unit = {
    if (stopped.get) {
      throw new IllegalStateException(name + " has already been stopped")
    }
    // Call onStart before starting the event thread to make sure it happens before onReceive
    onStart()
    eventThread.start()
  }

  /**
   * Invoked in the event thread when polling events from the event queue.
   *
   * Note: Should avoid calling blocking actions in `onReceive`, or the event thread will be blocked
   * and cannot process events in time. If you want to call some blocking actions, run them in
   * another thread.
   */
  protected def onReceive(event: E): Unit
  ……
}
// JobGenerator's processEvent method
/** Processes all events */
private def processEvent(event: JobGeneratorEvent) {
logDebug("Got event " + event)
event match {
case GenerateJobs(time) => generateJobs(time) // with time as the argument
case ClearMetadata(time) => clearMetadata(time)
case DoCheckpoint(time, clearCheckpointDataLater) =>
doCheckpoint(time, clearCheckpointDataLater)
case ClearCheckpointData(time) => clearCheckpointData(time)
}
}
/** Generate jobs and perform checkpoint for the given `time`. */
private def generateJobs(time: Time) {
// Set the SparkEnv in this thread, so that job generation code can access the environment
// Example: BlockRDDs are created in this thread, and it needs to access BlockManager
// Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
SparkEnv.set(ssc.env)
Try {
jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
graph.generateJobs(time) // generate jobs using allocated block
} match {
case Success(jobs) =>
val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos)) //*** submitJobSet
case Failure(e) =>
jobScheduler.reportError("Error generating jobs for time " + time, e)
}
eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}
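The `Try { … } match { Success / Failure }` branching in generateJobs maps onto a plain try/except in a Python sketch (`allocate_blocks`, `submit_job_set`, and the `Recorder` stub are hypothetical stand-ins for the real tracker, graph, and scheduler):

```python
def generate_jobs(batch_time, tracker, graph, scheduler):
    """Sketch of JobGenerator.generateJobs: allocate, build, then submit or report."""
    try:
        tracker.allocate_blocks(batch_time)     # allocate received blocks to this batch
        jobs = graph.generate_jobs(batch_time)  # build one logical job per output stream
    except Exception as e:                      # the Failure(e) branch
        scheduler.report_error(f"Error generating jobs for time {batch_time}", e)
        return
    scheduler.submit_job_set(batch_time, jobs)  # the Success(jobs) branch

class Recorder:
    """Stub standing in for receiverTracker, graph, and jobScheduler at once."""
    def __init__(self):
        self.calls = []
    def allocate_blocks(self, t):
        self.calls.append(("allocate", t))
    def generate_jobs(self, t):
        return [f"job@{t}"]
    def report_error(self, msg, e):
        self.calls.append(("error", msg))
    def submit_job_set(self, t, jobs):
        self.calls.append(("submit", t, tuple(jobs)))

r = Recorder()
generate_jobs(5000, r, r, r)
print(r.calls)  # [('allocate', 5000), ('submit', 5000, ('job@5000',))]
```

Note that failure to generate jobs is reported to the scheduler rather than raised, so one bad batch does not kill the generator's event thread.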
DStreamGraph's generateJobs method
def generateJobs(time: Time): Seq[Job] = {
logDebug("Generating jobs for time " + time)
val jobs = this.synchronized {
outputStreams.flatMap { outputStream =>
val jobOption = outputStream.generateJob(time) // the last DStream in the whole lineage
jobOption.foreach(_.setCallSite(outputStream.creationSite))
jobOption
}
}
logDebug("Generated " + jobs.length + " jobs for time " + time)
jobs
}
DStream's generateJob method
/**
* Generate a SparkStreaming job for the given time. This is an internal method that
* should not be called directly. This default implementation creates a job
* that materializes the corresponding RDD. Subclasses of DStream may override this
* to generate their own jobs.
*/
private[streaming] def generateJob(time: Time): Option[Job] = {
getOrCompute(time) match {
case Some(rdd) => {
val jobFunc = () => {
val emptyFunc = { (iterator: Iterator[T]) => {} }
context.sparkContext.runJob(rdd, emptyFunc)
}
Some(new Job(time, jobFunc))
}
case None => None
}
}
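generateJob's Option[Job] result can be sketched in Python, with `None` in place of Scala's `None` (the helper names here are invented): a job is produced only if the batch actually has an RDD, and the job body does nothing more than force that RDD.

```python
def generate_job(batch_time, get_or_compute):
    """Return a deferred (time, job_func) pair if an RDD exists for this batch, else None."""
    rdd = get_or_compute(batch_time)  # plays the role of getOrCompute(time)
    if rdd is None:
        return None

    def job_func():
        # Stand-in for sparkContext.runJob(rdd, emptyFunc): just force the data.
        return list(rdd)

    return (batch_time, job_func)

# A batch that has data yields a deferred job; func() is what the scheduler runs later.
job = generate_job(5000, lambda t: [1, 2, 3])
time_ms, func = job
print(time_ms, func())  # 5000 [1, 2, 3]

# A batch with no RDD yields no job at all.
empty = generate_job(10000, lambda t: None)
print(empty)  # None
```

The `emptyFunc` in the real code is the same trick: the action exists only to materialize the RDD, while the output operation's side effects live in the lineage itself.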
The recurring timer in JobGenerator that posts a GenerateJobs event every batch interval:
private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")
Definition of RecurringTimer
class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String)
extends Logging {
private val thread = new Thread("RecurringTimer - " + name) {
setDaemon(true)
override def run() { loop }
}
/**
* Start at the given start time.
*/
def start(startTime: Long): Long = synchronized {
nextTime = startTime
thread.start()
logInfo("Started timer for " + name + " at time " + nextTime)
nextTime
}
/**
* Repeatedly call the callback every interval.
*/
private def loop() {
try {
while (!stopped) {
triggerActionForNextInterval()
}
triggerActionForNextInterval()
} catch {
case e: InterruptedException =>
}
}
}
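The timer loop can be sketched in Python (the class name `SimpleRecurringTimer` is made up). The key design point, shared with the real RecurringTimer, is scheduling by target time (`next_time += period`) rather than by sleep duration, so small delays do not accumulate into drift:

```python
import threading
import time

class SimpleRecurringTimer:
    """Fire callback(next_time) every `period_ms`, scheduling by target time."""
    def __init__(self, period_ms, callback):
        self.period = period_ms / 1000.0
        self.callback = callback
        self.stopped = threading.Event()
        self.thread = threading.Thread(target=self._loop, daemon=True)

    def start(self, start_time):
        self.next_time = start_time
        self.thread.start()

    def _loop(self):
        while not self.stopped.is_set():
            wait = self.next_time - time.monotonic()
            if wait > 0:
                time.sleep(wait)      # sleep until the target time, not for `period`
            self.callback(self.next_time)  # e.g. post GenerateJobs(next_time)
            self.next_time += self.period

    def stop(self):
        self.stopped.set()
        self.thread.join(timeout=1.0)

ticks = []
timer = SimpleRecurringTimer(50, ticks.append)  # a 50 ms "batch interval"
timer.start(time.monotonic())
time.sleep(0.18)
timer.stop()
print(len(ticks) >= 3)  # True: roughly one tick per 50 ms
```

If a callback runs long, `wait` goes negative and the timer fires immediately to catch up, just as late batches pile up behind a slow job in Spark Streaming.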
The submitJobSet method
// JobHandler implements the Runnable interface
def submitJobSet(jobSet: JobSet) {
if (jobSet.jobs.isEmpty) {
logInfo("No jobs added for time " + jobSet.time)
} else {
listenerBus.post(StreamingListenerBatchSubmitted(jobSet.toBatchInfo))
jobSets.put(jobSet.time, jobSet)
jobSet.jobs.foreach(job => jobExecutor.execute(new JobHandler(job)))
logInfo("Added jobs for time " + jobSet.time)
}
}
private class JobHandler(job: Job) extends Runnable with Logging {
import JobScheduler._
def run() {
try {
val formattedTime = UIUtils.formatBatchTime(
job.time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false)
val batchUrl = s"/streaming/batch/?id=${job.time.milliseconds}"
val batchLinkText = s"[output operation ${job.outputOpId}, batch time ${formattedTime}]"
ssc.sc.setJobDescription(
s"""Streaming job from <a href="$batchUrl">$batchLinkText</a>""")
ssc.sc.setLocalProperty(BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString)
ssc.sc.setLocalProperty(OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString)
// We need to assign `eventLoop` to a temp variable. Otherwise, because
// `JobScheduler.stop(false)` may set `eventLoop` to null when this method is running, then
// it's possible that when `post` is called, `eventLoop` happens to null.
var _eventLoop = eventLoop
if (_eventLoop != null) {
_eventLoop.post(JobStarted(job, clock.getTimeMillis()))
// Disable checks for existing output directories in jobs launched by the streaming
// scheduler, since we may need to write output to an existing directory during checkpoint
// recovery; see SPARK-4835 for more details.
PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
job.run()
}
_eventLoop = eventLoop
if (_eventLoop != null) {
_eventLoop.post(JobCompleted(job, clock.getTimeMillis()))
}
} else {
// JobScheduler has been stopped.
}
} finally {
ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, null)
ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, null)
}
}
}
}
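The JobHandler pattern above, running each job on the jobExecutor pool and bracketing it with started/completed events, can be sketched in Python (the event tuples and `job_handler` name are illustrative):

```python
from concurrent.futures import ThreadPoolExecutor

events = []

def job_handler(job_id, job_func, post):
    """Sketch of JobHandler.run(): bracket the job with lifecycle events."""
    post(("JobStarted", job_id))
    try:
        job_func()                    # the deferred logical job finally executes here
    finally:
        post(("JobCompleted", job_id))

results = []
# The pool plays the role of jobExecutor; its size caps concurrent batches.
with ThreadPoolExecutor(max_workers=2) as job_executor:
    for i in range(3):
        job_executor.submit(job_handler, i, lambda i=i: results.append(i), events.append)

print(sorted(results))                                    # [0, 1, 2]
print(sum(1 for e in events if e[0] == "JobStarted"))     # 3
print(sum(1 for e in events if e[0] == "JobCompleted"))   # 3
```

Posting JobStarted/JobCompleted back through the event loop is what lets the single-threaded scheduler observe jobs that actually run on pool threads.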