第7课:Spark Streaming源码解读之JobScheduler内幕实现和深度思考

每个线程都有自己的私有属性(线程本地属性,thread local properties),所以 StreamingContext 在一个新线程中启动调度器。源码注释如下:‘// Start the streaming scheduler in a new thread, so that thread local properties like call sites and job groups can be reset without affecting those of the current thread.’


/**
 * Start the execution of the streams.
 *
 * Behaviour depends on the current `state`:
 *  - INITIALIZED: validates the context, starts the scheduler in a separate thread, marks the
 *    context ACTIVE, then registers the shutdown hook, the metrics source and the UI tab.
 *  - ACTIVE: only logs a warning.
 *  - STOPPED: throws.
 *
 * @throws IllegalStateException if the StreamingContext is already stopped.
 */
def start(): Unit = synchronized {
  state match {
    case INITIALIZED =>
      startSite.set(DStream.getCreationSite())
      // The activation check, the scheduler start and setActiveContext all run while
      // holding ACTIVATION_LOCK.
      StreamingContext.ACTIVATION_LOCK.synchronized {
        StreamingContext.assertNoOtherContextIsActive()
        try {
          validate()

          // Start the streaming scheduler in a new thread, so that thread local properties
          // like call sites and job groups can be reset without affecting those of the
          // current thread.
          ThreadUtils.runInNewThread("streaming-start") {
            sparkContext.setCallSite(startSite.get)
            sparkContext.clearJobGroup()
            sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
            scheduler.start()
          }
          state = StreamingContextState.ACTIVE
        } catch {
          // On a non-fatal start-up failure: log it, stop the scheduler, mark the context
          // STOPPED and rethrow the original exception to the caller.
          case NonFatal(e) =>
            logError("Error starting the context, marking it as stopped", e)
            scheduler.stop(false)
            state = StreamingContextState.STOPPED
            throw e
        }
        StreamingContext.setActiveContext(this)
      }
      // The shutdown hook is registered only after the context has started successfully.
      shutdownHookRef = ShutdownHookManager.addShutdownHook(
        StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
      // Registering Streaming Metrics at the start of the StreamingContext
      assert(env.metricsSystem != null)
      env.metricsSystem.registerSource(streamingSource)
      uiTab.foreach(_.attach())
      logInfo("StreamingContext started")
    case ACTIVE =>
      logWarning("StreamingContext has already been started")
    case STOPPED =>
      throw new IllegalStateException("StreamingContext has already been stopped")
  }
}


  def runInNewThread[T](      threadName: String,      isDaemon: Boolean = true)(body: => T): T = {    @volatile var exception: Option[Throwable] = None    @volatile var result: T = null.asInstanceOf[T]    val thread = new Thread(threadName) {      override def run(): Unit = {        try {          result = body        } catch {          case NonFatal(e) =>            exception = Some(e)        }      }    }    thread.setDaemon(isDaemon)    thread.start()    thread.join()

JobScheduler 是在 StreamingContext 实例化时实例化的;JobScheduler 在实例化时又实例化了 JobGenerator。
JobScheduler 类的源码注释:/** This class schedules jobs to be run on Spark. It uses the JobGenerator to generate the jobs and runs them using a thread pool. */


  // Use of ConcurrentHashMap.keySet later causes an odd runtime problem due to Java 7/8 diff
  // https://gist.github.com/AlainODea/1375759b8720a3f9f094
  // (the field is therefore declared with the java.util.Map interface type)
  private val jobSets: java.util.Map[Time, JobSet] = new ConcurrentHashMap[Time, JobSet]
  // Number of threads in the job executor pool, read from "spark.streaming.concurrentJobs";
  // 1 is passed as the second argument (presumably the fallback when the key is unset).
  private val numConcurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1)
  private val jobExecutor =
    ThreadUtils.newDaemonFixedThreadPool(numConcurrentJobs, "streaming-job-executor")
  // The JobGenerator is constructed here, together with the scheduler, and is handed `this`.
  private val jobGenerator = new JobGenerator(this)
  val clock = jobGenerator.clock
  val listenerBus = new StreamingListenerBus()

  // These two are created only when scheduler starts.
  // eventLoop not being null means the scheduler has been started and not stopped
  var receiverTracker: ReceiverTracker = null
  // A tracker to track all the input stream information as well as processed record number
  var inputInfoTracker: InputInfoTracker = null

  // Starts out null; see the note above on what a non-null eventLoop means.
  private var eventLoop: EventLoop[JobSchedulerEvent] = null


  /**
   * Wrapper over newFixedThreadPool. Thread names are formatted as prefix-ID, where ID is a
   * unique, sequentially assigned integer.
   *
   * @param nThreads number of threads handed to `Executors.newFixedThreadPool`
   * @param prefix   prefix passed to `namedThreadFactory` to build the pool's thread factory
   * @return the created pool, cast to `ThreadPoolExecutor`
   */
  def newDaemonFixedThreadPool(nThreads: Int, prefix: String): ThreadPoolExecutor = {
    val pool = Executors.newFixedThreadPool(nThreads, namedThreadFactory(prefix))
    pool.asInstanceOf[ThreadPoolExecutor]
  }


    /**
     * Creates a thread pool that reuses a fixed number of threads
     * operating off a shared unbounded queue, using the provided
     * ThreadFactory to create new threads when needed.  At any point,
     * at most {@code nThreads} threads will be active processing
     * tasks.  If additional tasks are submitted when all threads are
     * active, they will wait in the queue until a thread is
     * available.  If any thread terminates due to a failure during
     * execution prior to shutdown, a new one will take its place if
     * needed to execute subsequent tasks.  The threads in the pool will
     * exist until it is explicitly {@link ExecutorService#shutdown
     * shutdown}.
     *
     * @param nThreads the number of threads in the pool
     * @param threadFactory the factory to use when creating new threads
     * @return the newly created thread pool
     * @throws NullPointerException if threadFactory is null
     * @throws IllegalArgumentException if {@code nThreads <= 0}
     */
    public static ExecutorService newFixedThreadPool(int nThreads, ThreadFactory threadFactory) {
        // Core and maximum pool size are both nThreads, so the pool size is fixed.
        final LinkedBlockingQueue<Runnable> workQueue = new LinkedBlockingQueue<Runnable>();
        final long keepAliveMillis = 0L;
        return new ThreadPoolExecutor(
            nThreads,
            nThreads,
            keepAliveMillis,
            TimeUnit.MILLISECONDS,
            workQueue,
            threadFactory);
    }

val listenerBus = new StreamingListenerBus()非常重要!

// These two are created only when scheduler starts.
// eventLoop not being null means the scheduler has been started and not stopped
// (eventLoop is a separate field of the scheduler; it is not one of the two declared here.)
var receiverTracker: ReceiverTracker = null
// A tracker to track all the input stream information as well as processed record number
var inputInfoTracker: InputInfoTracker = null


  /**
   * Starts the scheduler. Does nothing if `eventLoop` is already set, i.e. the scheduler
   * has already been started.
   *
   * In order, it: creates and starts the event loop, registers every input stream's rate
   * controller as a streaming listener, starts the listener bus, creates the receiver and
   * input-info trackers, starts the receiver tracker, and finally starts the job generator.
   */
  def start(): Unit = synchronized {
    if (eventLoop == null) {
      logDebug("Starting JobScheduler")

      // Events are handed to processEvent; failures are routed to reportError.
      eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
        override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)

        override protected def onError(e: Throwable): Unit =
          reportError("Error in job scheduler", e)
      }
      eventLoop.start()

      // attach rate controllers of input streams to receive batch completion updates
      ssc.graph.getInputStreams.foreach { stream =>
        stream.rateController.foreach { controller =>
          ssc.addStreamingListener(controller)
        }
      }

      listenerBus.start(ssc.sparkContext)
      receiverTracker = new ReceiverTracker(ssc)
      inputInfoTracker = new InputInfoTracker(ssc)
      receiverTracker.start()
      jobGenerator.start()
      logInfo("Started JobScheduler")
    }
  }



 /**
  * Apply a function to each RDD in this DStream. This is an output operator, so
  * 'this' DStream will be registered as an output stream and therefore materialized.
  *
  * The function is first passed through `SparkContext.clean`, then wrapped in a new
  * `ForEachDStream` on which `register()` is called.
  *
  * @param foreachFunc foreachRDD function
  * @param displayInnerRDDOps Whether the detailed callsites and scopes of the RDDs generated
  *                           in the `foreachFunc` to be displayed in the UI. If `false`, then
  *                           only the scopes and callsites of `foreachRDD` will override those
  *                           of the RDDs on the display.
  */
 private def foreachRDD(
     foreachFunc: (RDD[T], Time) => Unit,
     displayInnerRDDOps: Boolean): Unit = {
   val cleanedFunc = context.sparkContext.clean(foreachFunc, false)
   val outputStream = new ForEachDStream(this, cleanedFunc, displayInnerRDDOps)
   outputStream.register()
 }
/**
 * An internal DStream used to represent output operations like DStream.foreachRDD.
 *
 * It computes no RDDs itself (`compute` always returns None); for each batch time it asks
 * the parent for its RDD and, if one exists, wraps the call to `foreachFunc` in a Job.
 *
 * @param parent        Parent DStream
 * @param foreachFunc   Function to apply on each RDD generated by the parent DStream
 * @param displayInnerRDDOps Whether the detailed callsites and scopes of the RDDs generated
 *                           by `foreachFunc` will be displayed in the UI; otherwise only the
 *                           scope and callsite of `DStream.foreachRDD` will be displayed.
 */
private[streaming]
class ForEachDStream[T: ClassTag] (
    parent: DStream[T],
    foreachFunc: (RDD[T], Time) => Unit,
    displayInnerRDDOps: Boolean
  ) extends DStream[Unit](parent.ssc) {

  override def dependencies: List[DStream[_]] = parent :: Nil

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[Unit]] = None

  override def generateJob(time: Time): Option[Job] =
    parent.getOrCompute(time).map { batchRdd =>
      // The user function only runs when the Job's function is invoked, not here.
      val runForeach = () => createRDDWithLocalProperties(time, displayInnerRDDOps) {
        foreachFunc(batchRdd, time)
      }
      new Job(time, runForeach)
    }
}


