Spark 1.2.0 Source Code Analysis: How Spark Streaming Saves Received Data


This post walks through the full flow of how data received by SocketReceiver gets saved.
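For context, the path analyzed below is what a plain socket stream created on the driver side goes through. A minimal sketch of such a job (the host, port, and word-count logic are placeholder choices, not from the original post):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SocketWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SocketWordCount")
    val ssc = new StreamingContext(conf, Seconds(2))  // 2-second batch interval

    // socketTextStream creates a SocketInputDStream whose receiver is the
    // SocketReceiver examined in this post
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()

    ssc.start()
    ssc.awaitTermination()
  }
}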

Let's start with SocketReceiver's receive() method:

  /** Create a socket connection and receive data until receiver is stopped */
  def receive() {
    var socket: Socket = null
    try {
      logInfo("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      logInfo("Connected to " + host + ":" + port)
      val iterator = bytesToObjects(socket.getInputStream())  // receive the input data
      while(!isStopped && iterator.hasNext) {
        store(iterator.next)  // save each received item
      }
      logInfo("Stopped receiving")
      restart("Retrying connecting to " + host + ":" + port)
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    } finally {
      if (socket != null) {
        socket.close()
        logInfo("Closed socket to " + host + ":" + port)
      }
    }
  }
The key call is store(iterator.next), which is implemented in the parent class Receiver:

  /**
   * Store a single item of received data to Spark's memory.
   * These single items will be aggregated together into data blocks before
   * being pushed into Spark's memory.
   */
  def store(dataItem: T) {
    executor.pushSingle(dataItem)
  }
Here executor is a ReceiverSupervisor, whose concrete implementation is ReceiverSupervisorImpl, so look at its pushSingle() method:
  /** Push a single record of received data into block generator. */
  def pushSingle(data: Any) {
    blockGenerator.addData(data)
  }
Tracing one level deeper:

  /**
   * Push a single data item into the buffer. All received data items
   * will be periodically pushed into BlockManager.
   */
  def addData(data: Any): Unit = synchronized {
    waitToPush()
    currentBuffer += data
  }
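A side note: the waitToPush() call at the top of addData() is where receiver-side throttling happens. If I remember correctly, BlockGenerator inherits this from a rate limiter that honors spark.streaming.receiver.maxRate (records per second, unlimited by default); the exact behavior in 1.2.0 should be checked against the source. A hedged sketch of capping the ingest rate:

// Assumption: spark.streaming.receiver.maxRate is honored by waitToPush() in this version;
// the default of 0 means no limit.
val conf = new SparkConf()
  .setAppName("RateLimitedReceiver")
  .set("spark.streaming.receiver.maxRate", "10000")  // at most ~10,000 records/sec per receiver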

So the real work happens in blockGenerator, an instance of BlockGenerator. As discussed in the previous post, the block generator is started (blockGenerator.start()) before the receiver itself is started, so let's look at its start() method first:

  /** Start block generating and pushing threads. */
  def start() {
    blockIntervalTimer.start()
    blockPushingThread.start()
    logInfo("Started BlockGenerator")
  }
First, the blockIntervalTimer.start() method:

  /**
   * Start at the earliest time it can start based on the period.
   */
  def start(): Long = {
    start(getStartTime())  // getStartTime() returns the start time of the next period
  }
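For reference, getStartTime() simply rounds the current time up to the next multiple of the period; its implementation looks roughly like this (a sketch from memory):

  // The earliest future timestamp that is an exact multiple of the period
  private def getStartTime(): Long = {
    (math.floor(clock.currentTime.toDouble / period) + 1).toLong * period
  }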
  /**
   * Start at the given start time.
   */
  def start(startTime: Long): Long = synchronized {
    nextTime = startTime
    thread.start()  // start the timer thread
    logInfo("Started timer for " + name + " at time " + nextTime)
    nextTime
  }
  private val thread = new Thread("RecurringTimer - " + name) {
    setDaemon(true)
    override def run() { loop }
  }
What ultimately runs is the loop method:

  /**
   * Repeatedly call the callback every interval.
   */
  private def loop() {
    try {
      while (!stopped) {
        clock.waitTillTime(nextTime)  // sleep until the next period arrives
        callback(nextTime)            // invoke the callback
        prevTime = nextTime
        nextTime += period
        logDebug("Callback for " + name + " called at time " + prevTime)
      }
    } catch {
      case e: InterruptedException =>
    }
  }
Now look at callback(nextTime). The callback is the updateCurrentBuffer function that was passed in when the RecurringTimer was constructed:

  /** Change the buffer to which single records are added to. */
  private def updateCurrentBuffer(time: Long): Unit = synchronized {
    try {
      val newBlockBuffer = currentBuffer    // currentBuffer is what addData() appends to
      currentBuffer = new ArrayBuffer[Any]  // swap in a fresh, empty buffer
      if (newBlockBuffer.size > 0) {
        // blockInterval is the block generation interval, 200 ms by default
        val blockId = StreamBlockId(receiverId, time - blockInterval)
        val newBlock = new Block(blockId, newBlockBuffer)
        listener.onGenerateBlock(blockId)
        // blocksForPushing is a blocking queue that holds 10 blocks by default
        blocksForPushing.put(newBlock)  // put is blocking when queue is full
        logDebug("Last element in " + blockId + " is " + newBlockBuffer.last)
      }
    } catch {
      case ie: InterruptedException =>
        logInfo("Block updating timer thread was interrupted")
      case e: Exception =>
        reportError("Error in block updating thread", e)
    }
  }
At this point the block has been placed into the queue.
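Both the block interval and the queue capacity are configurable. Since each block generated here ends up as one partition of the batch's RDD, the number of partitions per batch is roughly the batch interval divided by the block interval. A tuning sketch (assuming these are the keys read by BlockGenerator in 1.2.0, with blockInterval expressed in milliseconds):

val conf = new SparkConf()
  .set("spark.streaming.blockInterval", "200")  // how often currentBuffer is turned into a block
  .set("spark.streaming.blockQueueSize", "10")  // capacity of the blocksForPushing queue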

Next, the second call inside blockGenerator.start(): blockPushingThread.start(). The thread is defined as follows:

  private val blockPushingThread = new Thread() {
    override def run() { keepPushingBlocks() }
  }
Following keepPushingBlocks():

  /** Keep pushing blocks to the BlockManager. */
  private def keepPushingBlocks() {
    logInfo("Started block pushing thread")
    try {
      while(!stopped) {
        Option(blocksForPushing.poll(100, TimeUnit.MILLISECONDS)) match {  // poll the queue for the next block
          case Some(block) => pushBlock(block)
          case None =>
        }
      }
      // Push out the blocks that are still left
      logInfo("Pushing out the last " + blocksForPushing.size() + " blocks")
      while (!blocksForPushing.isEmpty) {  // on stop, drain the remaining blocks from the queue
        logDebug("Getting block ")
        val block = blocksForPushing.take()
        pushBlock(block)
        logInfo("Blocks left to push " + blocksForPushing.size())
      }
      logInfo("Stopped block pushing thread")
    } catch {
      case ie: InterruptedException =>
        logInfo("Block pushing thread was interrupted")
      case e: Exception =>
        reportError("Error in block pushing thread", e)
    }
  }

In both cases (regular polling and the final drain) pushBlock(block) is eventually called. Tracing further:

  private def pushBlock(block: Block) {
    listener.onPushBlock(block.id, block.buffer)
    logInfo("Pushed block " + block.id)
  }
The listener was passed in when the BlockGenerator instance was constructed; it is defined in ReceiverSupervisorImpl:

    def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]) {
      pushArrayBuffer(arrayBuffer, None, Some(blockId))
    }
  /** Store an ArrayBuffer of received data as a data block into Spark's memory. */
  def pushArrayBuffer(
      arrayBuffer: ArrayBuffer[_],
      metadataOption: Option[Any],
      blockIdOption: Option[StreamBlockId]
    ) {
    pushAndReportBlock(ArrayBufferBlock(arrayBuffer), metadataOption, blockIdOption)
  }
  def pushAndReportBlock(
      receivedBlock: ReceivedBlock,
      metadataOption: Option[Any],
      blockIdOption: Option[StreamBlockId]
    ) {
    val blockId = blockIdOption.getOrElse(nextBlockId)
    val numRecords = receivedBlock match {
      case ArrayBufferBlock(arrayBuffer) => arrayBuffer.size
      case _ => -1
    }
    val time = System.currentTimeMillis
    val blockStoreResult = receivedBlockHandler.storeBlock(blockId, receivedBlock)  // store the block
    logDebug(s"Pushed block $blockId in ${(System.currentTimeMillis - time)} ms")
    val blockInfo = ReceivedBlockInfo(streamId, numRecords, blockStoreResult)
    val future = trackerActor.ask(AddBlock(blockInfo))(askTimeout)  // report the block to the driver
    Await.result(future, askTimeout)
    logDebug(s"Reported block $blockId")
  }
The code that actually stores the block:

  def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {
    val putResult: Seq[(BlockId, BlockStatus)] = block match {
      case ArrayBufferBlock(arrayBuffer) =>
        blockManager.putIterator(blockId, arrayBuffer.iterator, storageLevel, tellMaster = true)  // store according to storageLevel
      case IteratorBlock(iterator) =>
        blockManager.putIterator(blockId, iterator, storageLevel, tellMaster = true)
      case ByteBufferBlock(byteBuffer) =>
        blockManager.putBytes(blockId, byteBuffer, storageLevel, tellMaster = true)
      case o =>
        throw new SparkException(
          s"Could not store $blockId to block manager, unexpected block type ${o.getClass.getName}")
    }
    if (!putResult.map { _._1 }.contains(blockId)) {
      throw new SparkException(
        s"Could not store $blockId to block manager with storage level $storageLevel")
    }
    BlockManagerBasedStoreResult(blockId)
  }
Finally, the actual storage is carried out by BlockManager. The default storage level for socketTextStream() is MEMORY_AND_DISK_SER_2: serialized data that spills to disk when memory is insufficient, replicated to two nodes.
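If the default level is not what you want, socketTextStream() also accepts an explicit StorageLevel. For example, to keep a single serialized replica instead of two (a sketch using the standard overload):

import org.apache.spark.storage.StorageLevel

// Single serialized copy that spills to disk, instead of the default MEMORY_AND_DISK_SER_2
val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)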

  ***********  The End  ***********
