Lesson 19: A Big-Picture Summary of Spark Streaming Architecture Design and Runtime Mechanism


1. Architecture design and runtime mechanism of Spark Streaming

2. Deeper thinking about Spark Streaming

 

In essence, Spark Streaming adds a Timer on top of Spark's RDDs: the Timer fires again and again, continuously receiving data and generating Jobs that process it.

Because time keeps flowing, adding the time dimension makes the RDDs "flow" as well. RDD flow means that at every batch interval a new RDD instance is produced, and that instance comes from a DStream. Likewise, the template for the dependencies between RDDs (the DAG) is the set of dependencies between DStreams, namely the DStreamGraph.

A DStream is the template for RDDs, and the DStreamGraph is the template for the RDD DAG. Within one batch interval (Batch Duration), DStream and DStreamGraph account for only part of how the RDDs come into being, or put differently, only part of the overall functionality.

Spark Streaming introduces DStream and DStreamGraph simply to make it convenient, under the time dimension, to manage every aspect of the RDDs' lifecycle.
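A minimal sketch of that idea (the socket source, host/port, class name and batch interval below are illustrative, not from the original text): one DStream definition acts as the template, and the timer produces a fresh RDD instance from it for every batch interval.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DStreamAsTemplate {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("DStreamAsTemplate")
    // The batch duration is the time dimension: the timer fires every 5 seconds.
    val ssc = new StreamingContext(conf, Seconds(5))

    // One DStream definition, i.e. the template for the RDDs ...
    val words = ssc.socketTextStream("localhost", 9999).flatMap(_.split(" "))
    val counts = words.map((_, 1)).reduceByKey(_ + _)

    // ... but a new RDD instance is handed to this closure for every batch interval.
    counts.foreachRDD { (rdd, time) =>
      println(s"Batch $time produced RDD ${rdd.id} with ${rdd.getNumPartitions} partitions")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}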

ReceivedBlockTracker.scala

// Allocate all unallocated blocks to the given batch. This event will get written to the
// write ahead log (if enabled).
def allocateBlocksToBatch(batchTime: Time): Unit = synchronized {
  if (lastAllocatedBatchTime == null || batchTime > lastAllocatedBatchTime) {
    val streamIdToBlocks = streamIds.map { streamId =>
      (streamId, getReceivedBlockQueue(streamId).dequeueAll(x => true))
    }.toMap
    val allocatedBlocks = AllocatedBlocks(streamIdToBlocks)
    if (writeToLog(BatchAllocationEvent(batchTime, allocatedBlocks))) {
      // The received blocks are stored in timeToAllocatedBlocks, keyed by the batch time.
      timeToAllocatedBlocks.put(batchTime, allocatedBlocks)
      lastAllocatedBatchTime = batchTime
    } else {
      logInfo(s"Possibly processed batch $batchTime needs to be processed again in WAL recovery")
    }
  } else {
    // This situation occurs when:
    // 1. WAL is ended with BatchAllocationEvent, but without BatchCleanupEvent,
    // possibly processed batch job or half-processed batch job need to be processed again,
    // so the batchTime will be equal to lastAllocatedBatchTime.
    // 2. Slow checkpointing makes recovered batch time older than WAL recovered
    // lastAllocatedBatchTime.
    // This situation will only occurs in recovery time.
    logInfo(s"Possibly processed batch $batchTime needs to be processed again in WAL recovery")
  }
}
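For context, allocateBlocksToBatch is driven by the JobGenerator on the driver: before the jobs for a batch are generated, it asks the ReceiverTracker to bind all so-far-unallocated blocks to that batch time. A simplified excerpt from JobGenerator.scala (details vary across Spark versions):

// JobGenerator.generateJobs (simplified): first bind received blocks to the batch,
// then let the DStreamGraph generate the jobs for that batch time.
private def generateJobs(time: Time) {
  Try {
    jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
    graph.generateJobs(time)                                 // generate jobs using allocated blocks
  } match {
    case Success(jobs) =>
      val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
      jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
    case Failure(e) =>
      jobScheduler.reportError("Error generating jobs for time " + time, e)
  }
  eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}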

ReceiverSupervisor

How is the data produced? Through ReceiverSupervisor, ReceiverSupervisorImpl and the BlockGenerator.

/** Start receiver */
def startReceiver(): Unit = synchronized {
  try {
    if (onReceiverStart()) {
      logInfo("Starting receiver")
      receiverState = Started
      receiver.onStart()
      logInfo("Called receiver onStart")
    } else {
      // The driver refused us
      stop("Registered unsuccessfully because Driver refused to start receiver " + streamId, None)
    }
  } catch {
    case NonFatal(t) =>
      stop("Error starting receiver " + streamId, Some(t))
  }
}
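The receiver whose onStart is invoked above is user code. A minimal sketch of such a receiver, assuming a plain socket as the data source (the class name, host and port are illustrative): every store() call hands the record to the ReceiverSupervisor, which buffers it in a BlockGenerator.

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

// Hypothetical socket receiver: onStart (called from ReceiverSupervisor.startReceiver above)
// spawns a thread, and every store() call is routed through the supervisor into a BlockGenerator.
class SimpleSocketReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_SER) {

  override def onStart(): Unit = {
    new Thread("Simple Socket Receiver") {
      setDaemon(true)
      override def run(): Unit = receive()
    }.start()
  }

  override def onStop(): Unit = { /* the receiving thread stops itself once isStopped() is true */ }

  private def receive(): Unit = {
    try {
      val socket = new Socket(host, port)
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream, StandardCharsets.UTF_8))
      var line = reader.readLine()
      while (!isStopped() && line != null) {
        store(line) // buffered by the BlockGenerator, later pushed as blocks
        line = reader.readLine()
      }
      reader.close()
      socket.close()
      restart("Trying to connect again")
    } catch {
      case t: Throwable => restart("Error receiving data", t)
    }
  }
}

It would be wired into the graph with ssc.receiverStream(new SimpleSocketReceiver("localhost", 9999)), which yields a ReceiverInputDStream.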

ReceiverSupervisorImpl

// Data is written out through a ReceivedBlockHandler.
private val receivedBlockHandler: ReceivedBlockHandler = {
  if (WriteAheadLogUtils.enableReceiverLog(env.conf)) {
    if (checkpointDirOption.isEmpty) {
      throw new SparkException(
        "Cannot enable receiver write-ahead log without checkpoint directory set. " +
          "Please use streamingContext.checkpoint() to set the checkpoint directory. " +
          "See documentation for more details.")
    }
    new WriteAheadLogBasedBlockHandler(env.blockManager, receiver.streamId,
      receiver.storageLevel, env.conf, hadoopConf, checkpointDirOption.get)
  } else {
    new BlockManagerBasedBlockHandler(env.blockManager, receiver.storageLevel)
  }
}
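From the application side, the branch taken above is controlled by a configuration flag plus a checkpoint directory. A sketch using the standard configuration key (the checkpoint path is a placeholder):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf()
  .setAppName("ReceiverWALExample")
  // With this flag enabled, receivedBlockHandler becomes a WriteAheadLogBasedBlockHandler.
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")
val ssc = new StreamingContext(conf, Seconds(5))
// Required when the receiver WAL is enabled; otherwise the SparkException above is thrown.
ssc.checkpoint("hdfs://namenode:8020/streaming/checkpoint") // placeholder path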

/** Remote RpcEndpointRef for the ReceiverTracker */
private val trackerEndpoint = RpcUtils.makeDriverRef("ReceiverTracker", env.conf, env.rpcEnv)

/** RpcEndpointRef for receiving messages from the ReceiverTracker in the driver */
private val endpoint = env.rpcEnv.setupEndpoint(
  "Receiver-" + streamId + "-" + System.currentTimeMillis(), new ThreadSafeRpcEndpoint {
    override val rpcEnv: RpcEnv = env.rpcEnv

    override def receive: PartialFunction[Any, Unit] = {
      case StopReceiver =>
        logInfo("Received stop signal")
        ReceiverSupervisorImpl.this.stop("Stopped by driver", None)
      case CleanupOldBlocks(threshTime) =>
        logDebug("Received delete old batch signal")
        cleanupOldBlocks(threshTime)
      case UpdateRateLimit(eps) =>
        logInfo(s"Received a new rate limit: $eps.")
        registeredBlockGenerators.foreach { bg =>
          bg.updateRate(eps)
        }
    }
  })
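The UpdateRateLimit message handled above is how the driver throttles a receiver. On the user side the relevant knobs are the static per-receiver ceiling and the backpressure switch; a sketch (the rate value is arbitrary):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  // Static ceiling: at most 1000 records per second per receiver.
  .set("spark.streaming.receiver.maxRate", "1000")
  // Dynamic backpressure: the driver recomputes the rate after each batch and pushes it
  // to the receivers via the UpdateRateLimit RPC message handled above.
  .set("spark.streaming.backpressure.enabled", "true")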

BlockGenerator

private val blockIntervalTimer =
  new RecurringTimer(clock, blockIntervalMs, updateCurrentBuffer, "BlockGenerator")
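blockIntervalMs comes from spark.streaming.blockInterval (200 ms by default), and every block cut by this timer later becomes one partition of the batch's BlockRDD, so a single receiver produces roughly batchInterval / blockInterval partitions per batch. A sketch of tuning it (the value is illustrative):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  // Cut one block (and hence one BlockRDD partition) every 500 ms instead of the 200 ms default;
  // a 10-second batch from one receiver then yields about 20 partitions.
  .set("spark.streaming.blockInterval", "500ms")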

 

/** Change the buffer to which single records are added to. */
private def updateCurrentBuffer(time: Long): Unit = {
  try {
    var newBlock: Block = null
    synchronized {
      if (currentBuffer.nonEmpty) {
        val newBlockBuffer = currentBuffer
        currentBuffer = new ArrayBuffer[Any]
        val blockId = StreamBlockId(receiverId, time - blockIntervalMs)
        listener.onGenerateBlock(blockId)
        newBlock = new Block(blockId, newBlockBuffer)
      }
    }

    if (newBlock != null) {
      blocksForPushing.put(newBlock) // put is blocking when queue is full
    }
  } catch {
    case ie: InterruptedException =>
      logInfo("Block updating timer thread was interrupted")
    case e: Exception =>
      reportError("Error in block updating thread", e)
  }
}

/** Start block generating and pushing threads. */
def start(): Unit = synchronized {
  if (state == Initialized) {
    state = Active
    blockIntervalTimer.start()
    blockPushingThread.start()
    logInfo("Started BlockGenerator")
  } else {
    throw new SparkException(
      s"Cannot start BlockGenerator as its not in the Initialized state [state = $state]")
  }
}

blockIntervalTimer.start()

/**
 * Start at the earliest time it can start based on the period.
 */
def start(): Long = {
  start(getStartTime())
}

/**
 * Start at the given start time.
 */
def start(startTime: Long): Long = synchronized {
  nextTime = startTime  // the time at which the next callback fires
  thread.start()        // start the background thread that runs the loop
  logInfo("Started timer for " + name + " at time " + nextTime)
  nextTime
}

blockPushingThread

private val blockPushingThread = new Thread() { override def run() { keepPushingBlocks() } }

/** Keep pushing blocks to the BlockManager. */
private def keepPushingBlocks() {
  logInfo("Started block pushing thread")

  def areBlocksBeingGenerated: Boolean = synchronized {
    state != StoppedGeneratingBlocks
  }

  try {
    // While blocks are being generated, keep polling for to-be-pushed blocks and push them.
    while (areBlocksBeingGenerated) {
      Option(blocksForPushing.poll(10, TimeUnit.MILLISECONDS)) match {
        case Some(block) => pushBlock(block)
        case None =>
      }
    }

    // At this point, state is StoppedGeneratingBlock. So drain the queue of to-be-pushed blocks.
    logInfo("Pushing out the last " + blocksForPushing.size() + " blocks")
    while (!blocksForPushing.isEmpty) {
      val block = blocksForPushing.take()
      logDebug(s"Pushing block $block")
      pushBlock(block)
      logInfo("Blocks left to push " + blocksForPushing.size())
    }
    logInfo("Stopped block pushing thread")
  } catch {
    case ie: InterruptedException =>
      logInfo("Block pushing thread was interrupted")
    case e: Exception =>
      reportError("Error in block pushing thread", e)
  }
}

 

The RecurringTimer

// The background daemon thread that runs the loop.
private val thread = new Thread("RecurringTimer - " + name) {
  setDaemon(true)
  override def run() { loop } // loop
}

The whole engine is running at every moment.

/**
 * Repeatedly call the callback every interval.
 */
private def loop() {
  try {
    while (!stopped) {
      // The source of triggerActionForNextInterval is shown below.
      triggerActionForNextInterval()
    }
    triggerActionForNextInterval()
  } catch {
    case e: InterruptedException =>
  }
}

private def triggerActionForNextInterval(): Unit = {
  clock.waitTillTime(nextTime)
  callback(nextTime)
  prevTime = nextTime
  nextTime += period
  logDebug("Callback for " + name + " called at time " + prevTime)
}
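The same RecurringTimer also drives job generation on the driver side: the JobGenerator builds one whose period is the batch duration and whose callback posts a GenerateJobs event. A simplified excerpt from JobGenerator.scala (may differ slightly across Spark versions):

private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
  longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")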

 

ReceiverInputDStream

/**
 * Generates RDDs with blocks received by the receiver of this stream. */
override def compute(validTime: Time): Option[RDD[T]] = {
  val blockRDD = {

    if (validTime < graph.startTime) {
      // If this is called for any time before the start time of the context,
      // then this returns an empty RDD. This may happen when recovering from a
      // driver failure without any write ahead log to recover pre-failure data.
      new BlockRDD[T](ssc.sc, Array.empty)
    } else {
      // Otherwise, ask the tracker for all the blocks that have been allocated to this stream
      // for this batch
      val receiverTracker = ssc.scheduler.receiverTracker
      val blockInfos = receiverTracker.getBlocksOfBatch(validTime).getOrElse(id, Seq.empty)

      // Register the input blocks information into InputInfoTracker
      val inputInfo = StreamInputInfo(id, blockInfos.flatMap(_.numRecords).sum)
      ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)

      // Create the BlockRDD
      createBlockRDD(validTime, blockInfos)
    }
  }
  Some(blockRDD)
}
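For orientation, compute above is not called directly by user code. Each DStream caches the RDD it generates per batch time, and compute is only invoked when no RDD for that time exists yet. A heavily simplified paraphrase of DStream.getOrCompute (the real method also handles checkpointing, storage level and local properties):

// DStream.getOrCompute (heavily simplified): for each batch time, either reuse the RDD already
// generated for that time or call compute(time) and cache the result in generatedRDDs.
private[streaming] final def getOrCompute(time: Time): Option[RDD[T]] = {
  generatedRDDs.get(time).orElse {
    if (isTimeValid(time)) {
      val rddOption = compute(time)
      rddOption.foreach { newRDD => generatedRDDs.put(time, newRDD) }
      rddOption
    } else {
      None
    }
  }
}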

private[streaming] def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

  if (blockInfos.nonEmpty) {
    val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray

    // Are WAL record handles present with all the blocks
    val areWALRecordHandlesPresent = blockInfos.forall { _.walRecordHandleOption.nonEmpty }

    if (areWALRecordHandlesPresent) {
      // If all the blocks have WAL record handle, then create a WALBackedBlockRDD
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      val walRecordHandles = blockInfos.map { _.walRecordHandleOption.get }.toArray
      new WriteAheadLogBackedBlockRDD[T](
        ssc.sparkContext, blockIds, walRecordHandles, isBlockIdValid)
    } else {
      // Else, create a BlockRDD. However, if there are some blocks with WAL info but not
      // others then that is unexpected and log a warning accordingly.
      if (blockInfos.find(_.walRecordHandleOption.nonEmpty).nonEmpty) {
        if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) {
          logError("Some blocks do not have Write Ahead Log information; " +
            "this is unexpected and data may not be recoverable after driver failures")
        } else {
          logWarning("Some blocks have Write Ahead Log information; this is unexpected")
        }
      }
      val validBlockIds = blockIds.filter { id =>
        ssc.sparkContext.env.blockManager.master.contains(id)
      }
      if (validBlockIds.size != blockIds.size) {
        logWarning("Some blocks could not be recovered as they were not found in memory. " +
          "To prevent such data loss, enabled Write Ahead Log (see programming guide " +
          "for more details.")
      }
      new BlockRDD[T](ssc.sc, validBlockIds)
    }
  } else {
    // If no block is ready now, creating WriteAheadLogBackedBlockRDD or BlockRDD
    // according to the configuration
    if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) {
      new WriteAheadLogBackedBlockRDD[T](
        ssc.sparkContext, Array.empty, Array.empty, Array.empty)
    } else {
      new BlockRDD[T](ssc.sc, Array.empty)
    }
  }
}

BlockRDD.scala

override def getPartitions: Array[Partition] = {
  assertValid()
  (0 until blockIds.length).map(i => {
    new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition]
  }).toArray
}

 

 

Why can a partition exist yet contain no data?

By default the parallelism is inherited from the parent RDD: the level of parallelism equals the number of partitions, so even if a partition holds no data, the partition itself still exists.

For example: the parent RDD has a parallelism of 100, and the child RDD ends up with only a single record, yet its parallelism is still 100 and it still has 100 partitions, so only one partition holds data and the others are empty. How should such a case be handled? The partitions can be compacted, that is, handled with coalesce, as shown in the sketch below.
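A sketch of that coalesce handling; `processed` stands for any DStream built earlier in the application, and the threshold of 10 partitions is arbitrary:

// Collapse mostly-empty partitions inherited from the parent before running actions on the RDD.
val compacted = processed.transform { rdd =>
  if (rdd.getNumPartitions > 10) rdd.coalesce(10) else rdd
}
compacted.print()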

The DAG is static: when an RDD is generated, only its metadata is known; whether the RDD actually contains data is only known at compute time, so the RDD is always generated regardless.

 

Why does Spark Streaming generate an RDD for every interval, whether or not there is any data?

The reason: without an RDD no Job can be generated. Submitting a job requires an action, and the action operates on an RDD; if the RDD did not exist, the action could not run and the application's execution would simply fail.
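In practice an application can still skip the per-batch work when the RDD turns out to be empty, by checking inside the action itself. A minimal sketch (`wordCounts` is a hypothetical DStream and the output path is a placeholder):

wordCounts.foreachRDD { (rdd, time) =>
  // The RDD exists for every interval; only its contents may be empty.
  if (!rdd.isEmpty()) {
    rdd.saveAsTextFile(s"/tmp/wordcounts-${time.milliseconds}") // placeholder output path
  } else {
    println(s"Batch $time had no data, skipping output")
  }
}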

 

Spark Streaming runs an unbroken loop: whether or not there is data, whether or not there are tasks, it keeps cycling at its time interval. The whole engine is executing at every moment, useful work or not, around and around, in effect an endless loop.

 

 
