Lesson 19: A Big-Picture Summary of Spark Streaming Architecture Design and Runtime Mechanism


1. Architecture design and runtime mechanism of Spark Streaming

2. Deeper thinking about Spark Streaming

 

In essence, Spark Streaming adds a Timer on top of Spark's RDDs: the Timer fires again and again, continuously receiving data and generating Jobs that process it.

Because time keeps flowing, adding the time dimension makes the RDDs "flow" as well. RDD flow means that at every batch interval a new RDD instance is produced, and that instance comes from a DStream. Likewise, the template for the dependencies between RDDs (the DAG) is the set of dependencies between DStreams, namely the DStreamGraph.

A DStream is the template for RDDs, and the DStreamGraph is the template for the RDD DAG. Within one batch interval (Batch Duration), DStream and DStreamGraph account for only part of how the RDDs come into being, or put differently, only part of the overall functionality.

Spark Streaming introduces DStream and DStreamGraph simply to make it convenient, under the time dimension, to manage every aspect of the RDDs' lifecycle.
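A minimal sketch of that idea (the socket source, host/port, class name and batch interval below are illustrative, not from the original text): one DStream definition acts as the template, and the timer produces a fresh RDD instance from it for every batch interval.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DStreamAsTemplate {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("DStreamAsTemplate")
    // The batch duration is the time dimension: the timer fires every 5 seconds.
    val ssc = new StreamingContext(conf, Seconds(5))

    // One DStream definition, i.e. the template for the RDDs ...
    val words = ssc.socketTextStream("localhost", 9999).flatMap(_.split(" "))
    val counts = words.map((_, 1)).reduceByKey(_ + _)

    // ... but a new RDD instance is handed to this closure for every batch interval.
    counts.foreachRDD { (rdd, time) =>
      println(s"Batch $time produced RDD ${rdd.id} with ${rdd.getNumPartitions} partitions")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}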

ReceivedBlockTracker.scala

// Allocate all unallocated blocks to the given batch. This event will get written to the
// write ahead log (if enabled).
def allocateBlocksToBatch(batchTime: Time): Unit = synchronized {
  if (lastAllocatedBatchTime == null || batchTime > lastAllocatedBatchTime) {
    val streamIdToBlocks = streamIds.map { streamId =>
      (streamId, getReceivedBlockQueue(streamId).dequeueAll(x => true))
    }.toMap
    val allocatedBlocks = AllocatedBlocks(streamIdToBlocks)
    if (writeToLog(BatchAllocationEvent(batchTime, allocatedBlocks))) {
      // The received blocks are stored in timeToAllocatedBlocks, keyed by the batch time.
      timeToAllocatedBlocks.put(batchTime, allocatedBlocks)
      lastAllocatedBatchTime = batchTime
    } else {
      logInfo(s"Possibly processed batch $batchTime needs to be processed again in WAL recovery")
    }
  } else {
    // This situation occurs when:
    // 1. WAL is ended with BatchAllocationEvent, but without BatchCleanupEvent,
    // possibly processed batch job or half-processed batch job need to be processed again,
    // so the batchTime will be equal to lastAllocatedBatchTime.
    // 2. Slow checkpointing makes recovered batch time older than WAL recovered
    // lastAllocatedBatchTime.
    // This situation will only occurs in recovery time.
    logInfo(s"Possibly processed batch $batchTime needs to be processed again in WAL recovery")
  }
}
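For context, allocateBlocksToBatch is driven by the JobGenerator on the driver: before the jobs for a batch are generated, it asks the ReceiverTracker to bind all so-far-unallocated blocks to that batch time. A simplified excerpt from JobGenerator.scala (details vary across Spark versions):

// JobGenerator.generateJobs (simplified): first bind received blocks to the batch,
// then let the DStreamGraph generate the jobs for that batch time.
private def generateJobs(time: Time) {
  Try {
    jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
    graph.generateJobs(time)                                 // generate jobs using allocated blocks
  } match {
    case Success(jobs) =>
      val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
      jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
    case Failure(e) =>
      jobScheduler.reportError("Error generating jobs for time " + time, e)
  }
  eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}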

ReceiverSupervisor

How is the data produced? Through ReceiverSupervisor, ReceiverSupervisorImpl and the BlockGenerator.

/** Start receiver */
def startReceiver(): Unit = synchronized {
  try {
    if (onReceiverStart()) {
      logInfo("Starting receiver")
      receiverState = Started
      receiver.onStart()
      logInfo("Called receiver onStart")
    } else {
      // The driver refused us
      stop("Registered unsuccessfully because Driver refused to start receiver " + streamId, None)
    }
  } catch {
    case NonFatal(t) =>
      stop("Error starting receiver " + streamId, Some(t))
  }
}
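The receiver whose onStart is invoked above is user code. A minimal sketch of such a receiver, assuming a plain socket as the data source (the class name, host and port are illustrative): every store() call hands the record to the ReceiverSupervisor, which buffers it in a BlockGenerator.

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

// Hypothetical socket receiver: onStart (called from ReceiverSupervisor.startReceiver above)
// spawns a thread, and every store() call is routed through the supervisor into a BlockGenerator.
class SimpleSocketReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_SER) {

  override def onStart(): Unit = {
    new Thread("Simple Socket Receiver") {
      setDaemon(true)
      override def run(): Unit = receive()
    }.start()
  }

  override def onStop(): Unit = { /* the receiving thread stops itself once isStopped() is true */ }

  private def receive(): Unit = {
    try {
      val socket = new Socket(host, port)
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream, StandardCharsets.UTF_8))
      var line = reader.readLine()
      while (!isStopped() && line != null) {
        store(line) // buffered by the BlockGenerator, later pushed as blocks
        line = reader.readLine()
      }
      reader.close()
      socket.close()
      restart("Trying to connect again")
    } catch {
      case t: Throwable => restart("Error receiving data", t)
    }
  }
}

It would be wired into the graph with ssc.receiverStream(new SimpleSocketReceiver("localhost", 9999)), which yields a ReceiverInputDStream.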

ReceiverSupervisorImpl

// Data is written out through a ReceivedBlockHandler.
private val receivedBlockHandler: ReceivedBlockHandler = {
  if (WriteAheadLogUtils.enableReceiverLog(env.conf)) {
    if (checkpointDirOption.isEmpty) {
      throw new SparkException(
        "Cannot enable receiver write-ahead log without checkpoint directory set. " +
          "Please use streamingContext.checkpoint() to set the checkpoint directory. " +
          "See documentation for more details.")
    }
    new WriteAheadLogBasedBlockHandler(env.blockManager, receiver.streamId,
      receiver.storageLevel, env.conf, hadoopConf, checkpointDirOption.get)
  } else {
    new BlockManagerBasedBlockHandler(env.blockManager, receiver.storageLevel)
  }
}
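From the application side, the branch taken above is controlled by a configuration flag plus a checkpoint directory. A sketch using the standard configuration key (the checkpoint path is a placeholder):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf()
  .setAppName("ReceiverWALExample")
  // With this flag enabled, receivedBlockHandler becomes a WriteAheadLogBasedBlockHandler.
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")
val ssc = new StreamingContext(conf, Seconds(5))
// Required when the receiver WAL is enabled; otherwise the SparkException above is thrown.
ssc.checkpoint("hdfs://namenode:8020/streaming/checkpoint") // placeholder path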

/** Remote RpcEndpointRef for the ReceiverTracker */
private val trackerEndpoint = RpcUtils.makeDriverRef("ReceiverTracker", env.conf, env.rpcEnv)

/** RpcEndpointRef for receiving messages from the ReceiverTracker in the driver */
private val endpoint = env.rpcEnv.setupEndpoint(
  "Receiver-" + streamId + "-" + System.currentTimeMillis(), new ThreadSafeRpcEndpoint {
    override val rpcEnv: RpcEnv = env.rpcEnv

    override def receive: PartialFunction[Any, Unit] = {
      case StopReceiver =>
        logInfo("Received stop signal")
        ReceiverSupervisorImpl.this.stop("Stopped by driver", None)
      case CleanupOldBlocks(threshTime) =>
        logDebug("Received delete old batch signal")
        cleanupOldBlocks(threshTime)
      case UpdateRateLimit(eps) =>
        logInfo(s"Received a new rate limit: $eps.")
        registeredBlockGenerators.foreach { bg =>
          bg.updateRate(eps)
        }
    }
  })
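The UpdateRateLimit message handled above is how the driver throttles a receiver. On the user side the relevant knobs are the static per-receiver ceiling and the backpressure switch; a sketch (the rate value is arbitrary):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  // Static ceiling: at most 1000 records per second per receiver.
  .set("spark.streaming.receiver.maxRate", "1000")
  // Dynamic backpressure: the driver recomputes the rate after each batch and pushes it
  // to the receivers via the UpdateRateLimit RPC message handled above.
  .set("spark.streaming.backpressure.enabled", "true")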

BlockGenerator

private val blockIntervalTimer =
  new RecurringTimer(clock, blockIntervalMs, updateCurrentBuffer, "BlockGenerator")
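blockIntervalMs comes from spark.streaming.blockInterval (200 ms by default), and every block cut by this timer later becomes one partition of the batch's BlockRDD, so a single receiver produces roughly batchInterval / blockInterval partitions per batch. A sketch of tuning it (the value is illustrative):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  // Cut one block (and hence one BlockRDD partition) every 500 ms instead of the 200 ms default;
  // a 10-second batch from one receiver then yields about 20 partitions.
  .set("spark.streaming.blockInterval", "500ms")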

 

/** Change the buffer to which single records are added to. */
private def updateCurrentBuffer(time: Long): Unit = {
  try {
    var newBlock: Block = null
    synchronized {
      if (currentBuffer.nonEmpty) {
        val newBlockBuffer = currentBuffer
        currentBuffer = new ArrayBuffer[Any]
        val blockId = StreamBlockId(receiverId, time - blockIntervalMs)
        listener.onGenerateBlock(blockId)
        newBlock = new Block(blockId, newBlockBuffer)
      }
    }

    if (newBlock != null) {
      blocksForPushing.put(newBlock) // put is blocking when queue is full
    }
  } catch {
    case ie: InterruptedException =>
      logInfo("Block updating timer thread was interrupted")
    case e: Exception =>
      reportError("Error in block updating thread", e)
  }
}

/** Start block generating and pushing threads. */
def start(): Unit = synchronized {
  if (state == Initialized) {
    state = Active
    blockIntervalTimer.start()
    blockPushingThread.start()
    logInfo("Started BlockGenerator")
  } else {
    throw new SparkException(
      s"Cannot start BlockGenerator as its not in the Initialized state [state = $state]")
  }
}

blockIntervalTimer.start()

/**
 * Start at the earliest time it can start based on the period.
 */
def start(): Long = {
  start(getStartTime())
}

/**
 * Start at the given start time.
 */
def start(startTime: Long): Long = synchronized {
  nextTime = startTime  // the time at which the next callback fires
  thread.start()        // start the background thread that runs the loop
  logInfo("Started timer for " + name + " at time " + nextTime)
  nextTime
}

blockPushingThread

private val blockPushingThread = new Thread() { override def run() { keepPushingBlocks() } }

/** Keep pushing blocks to the BlockManager. */
private def keepPushingBlocks() {
  logInfo("Started block pushing thread")

  def areBlocksBeingGenerated: Boolean = synchronized {
    state != StoppedGeneratingBlocks
  }

  try {
    // While blocks are being generated, keep polling for to-be-pushed blocks and push them.
    while (areBlocksBeingGenerated) {
      Option(blocksForPushing.poll(10, TimeUnit.MILLISECONDS)) match {
        case Some(block) => pushBlock(block)
        case None =>
      }
    }

    // At this point, state is StoppedGeneratingBlock. So drain the queue of to-be-pushed blocks.
    logInfo("Pushing out the last " + blocksForPushing.size() + " blocks")
    while (!blocksForPushing.isEmpty) {
      val block = blocksForPushing.take()
      logDebug(s"Pushing block $block")
      pushBlock(block)
      logInfo("Blocks left to push " + blocksForPushing.size())
    }
    logInfo("Stopped block pushing thread")
  } catch {
    case ie: InterruptedException =>
      logInfo("Block pushing thread was interrupted")
    case e: Exception =>
      reportError("Error in block pushing thread", e)
  }
}

 

The RecurringTimer

// The background daemon thread that runs the loop.
private val thread = new Thread("RecurringTimer - " + name) {
  setDaemon(true)
  override def run() { loop } // loop
}

The whole engine is running at every moment.

/**
 * Repeatedly call the callback every interval.
 */
private def loop() {
  try {
    while (!stopped) {
      // The source of triggerActionForNextInterval is shown below.
      triggerActionForNextInterval()
    }
    triggerActionForNextInterval()
  } catch {
    case e: InterruptedException =>
  }
}

private def triggerActionForNextInterval(): Unit = {
  clock.waitTillTime(nextTime)
  callback(nextTime)
  prevTime = nextTime
  nextTime += period
  logDebug("Callback for " + name + " called at time " + prevTime)
}
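The same RecurringTimer also drives job generation on the driver side: the JobGenerator builds one whose period is the batch duration and whose callback posts a GenerateJobs event. A simplified excerpt from JobGenerator.scala (may differ slightly across Spark versions):

private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
  longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")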

 

ReceiverInputDStream

/**
 * Generates RDDs with blocks received by the receiver of this stream. */
override def compute(validTime: Time): Option[RDD[T]] = {
  val blockRDD = {

    if (validTime < graph.startTime) {
      // If this is called for any time before the start time of the context,
      // then this returns an empty RDD. This may happen when recovering from a
      // driver failure without any write ahead log to recover pre-failure data.
      new BlockRDD[T](ssc.sc, Array.empty)
    } else {
      // Otherwise, ask the tracker for all the blocks that have been allocated to this stream
      // for this batch
      val receiverTracker = ssc.scheduler.receiverTracker
      val blockInfos = receiverTracker.getBlocksOfBatch(validTime).getOrElse(id, Seq.empty)

      // Register the input blocks information into InputInfoTracker
      val inputInfo = StreamInputInfo(id, blockInfos.flatMap(_.numRecords).sum)
      ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)

      // Create the BlockRDD
      createBlockRDD(validTime, blockInfos)
    }
  }
  Some(blockRDD)
}
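For orientation, compute above is not called directly by user code. Each DStream caches the RDD it generates per batch time, and compute is only invoked when no RDD for that time exists yet. A heavily simplified paraphrase of DStream.getOrCompute (the real method also handles checkpointing, storage level and local properties):

// DStream.getOrCompute (heavily simplified): for each batch time, either reuse the RDD already
// generated for that time or call compute(time) and cache the result in generatedRDDs.
private[streaming] final def getOrCompute(time: Time): Option[RDD[T]] = {
  generatedRDDs.get(time).orElse {
    if (isTimeValid(time)) {
      val rddOption = compute(time)
      rddOption.foreach { newRDD => generatedRDDs.put(time, newRDD) }
      rddOption
    } else {
      None
    }
  }
}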

private[streaming] def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

  if (blockInfos.nonEmpty) {
    val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray

    // Are WAL record handles present with all the blocks
    val areWALRecordHandlesPresent = blockInfos.forall { _.walRecordHandleOption.nonEmpty }

    if (areWALRecordHandlesPresent) {
      // If all the blocks have WAL record handle, then create a WALBackedBlockRDD
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      val walRecordHandles = blockInfos.map { _.walRecordHandleOption.get }.toArray
      new WriteAheadLogBackedBlockRDD[T](
        ssc.sparkContext, blockIds, walRecordHandles, isBlockIdValid)
    } else {
      // Else, create a BlockRDD. However, if there are some blocks with WAL info but not
      // others then that is unexpected and log a warning accordingly.
      if (blockInfos.find(_.walRecordHandleOption.nonEmpty).nonEmpty) {
        if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) {
          logError("Some blocks do not have Write Ahead Log information; " +
            "this is unexpected and data may not be recoverable after driver failures")
        } else {
          logWarning("Some blocks have Write Ahead Log information; this is unexpected")
        }
      }
      val validBlockIds = blockIds.filter { id =>
        ssc.sparkContext.env.blockManager.master.contains(id)
      }
      if (validBlockIds.size != blockIds.size) {
        logWarning("Some blocks could not be recovered as they were not found in memory. " +
          "To prevent such data loss, enabled Write Ahead Log (see programming guide " +
          "for more details.")
      }
      new BlockRDD[T](ssc.sc, validBlockIds)
    }
  } else {
    // If no block is ready now, creating WriteAheadLogBackedBlockRDD or BlockRDD
    // according to the configuration
    if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) {
      new WriteAheadLogBackedBlockRDD[T](
        ssc.sparkContext, Array.empty, Array.empty, Array.empty)
    } else {
      new BlockRDD[T](ssc.sc, Array.empty)
    }
  }
}

BlockRDD.scala

override def getPartitions: Array[Partition] = {
  assertValid()
  (0 until blockIds.length).map(i => {
    new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition]
  }).toArray
}

 

 

Why can a partition exist yet contain no data?

By default the parallelism is inherited from the parent RDD: the level of parallelism equals the number of partitions, so even if a partition holds no data, the partition itself still exists.

For example: the parent RDD has a parallelism of 100, and the child RDD ends up with only a single record, yet its parallelism is still 100 and it still has 100 partitions, so only one partition holds data and the others are empty. How should such a case be handled? The partitions can be compacted, that is, handled with coalesce, as shown in the sketch below.
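A sketch of that coalesce handling; `processed` stands for any DStream built earlier in the application, and the threshold of 10 partitions is arbitrary:

// Collapse mostly-empty partitions inherited from the parent before running actions on the RDD.
val compacted = processed.transform { rdd =>
  if (rdd.getNumPartitions > 10) rdd.coalesce(10) else rdd
}
compacted.print()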

The DAG is static: when an RDD is generated, only its metadata is known; whether the RDD actually contains data is only known at compute time, so the RDD is always generated regardless.

 

Why does Spark Streaming generate an RDD for every interval, whether or not there is any data?

The reason: without an RDD no Job can be generated. Submitting a job requires an action, and the action operates on an RDD; if the RDD did not exist, the action could not run and the application's execution would simply fail.
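In practice an application can still skip the per-batch work when the RDD turns out to be empty, by checking inside the action itself. A minimal sketch (`wordCounts` is a hypothetical DStream and the output path is a placeholder):

wordCounts.foreachRDD { (rdd, time) =>
  // The RDD exists for every interval; only its contents may be empty.
  if (!rdd.isEmpty()) {
    rdd.saveAsTextFile(s"/tmp/wordcounts-${time.milliseconds}") // placeholder output path
  } else {
    println(s"Batch $time had no data, skipping output")
  }
}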

 

Spark Streaming runs an unbroken loop: whether or not there is data, whether or not there are tasks, it keeps cycling at its time interval. The whole engine is executing at every moment, useful work or not, around and around, in effect an endless loop.

 

 
