Spark 2.1 DiskBlockObjectWriter


call stack

A DiskBlockObjectWriter is created by the BlockManager.getDiskWriter method:

  def getDiskWriter(
      blockId: BlockId,
      file: File,
      serializerInstance: SerializerInstance,
      bufferSize: Int,
      writeMetrics: ShuffleWriteMetrics): DiskBlockObjectWriter = {
    val syncWrites = conf.getBoolean("spark.shuffle.sync", false)
    new DiskBlockObjectWriter(file, serializerManager, serializerInstance, bufferSize,
      syncWrites, writeMetrics, blockId)
  }
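
The syncWrites flag comes from the spark.shuffle.sync configuration key (default false); when enabled, every commit forces an fsync of the file descriptor. A minimal sketch of flipping it on; the SparkConf wiring here is illustrative, but the key itself is the one read above:

  val conf = new org.apache.spark.SparkConf()
    .set("spark.shuffle.sync", "true") // commitAndGet() will call fos.getFD.sync()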

trait ManualCloseOutputStream

  /**
   * Guards against close calls, e.g. from a wrapping stream.
   * Call manualClose to close the stream that was extended by this trait.
   * Commit uses this trait to close object streams without paying the
   * cost of closing and opening the underlying file.
   */
  private trait ManualCloseOutputStream extends OutputStream {
    abstract override def close(): Unit = {
      flush()
    }

    def manualClose(): Unit = {
      super.close()
    }
  }
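
The abstract override here is Scala's stackable-trait pattern: mixed into a concrete OutputStream, the trait intercepts close() and downgrades it to a flush(), while manualClose() still reaches the real close(). A minimal, self-contained sketch of the same trick; the ManualClose and Demo names are illustrative, not Spark code:

  import java.io.{ByteArrayOutputStream, OutputStream}

  trait ManualClose extends OutputStream {
    abstract override def close(): Unit = flush() // wrappers calling close() only flush
    def manualClose(): Unit = super.close()       // the real close
  }

  object Demo extends App {
    val out = new ByteArrayOutputStream with ManualClose
    out.write("hello".getBytes)
    out.close()              // swallowed: the stream stays writable
    out.write(" world".getBytes)
    out.manualClose()        // actually closes (a no-op for ByteArrayOutputStream)
    println(out.toString)    // prints: hello world
  }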

fields

  /** The file channel, used for repositioning / truncating the file. */
  private var channel: FileChannel = null
  private var mcs: ManualCloseOutputStream = null
  private var bs: OutputStream = null
  private var fos: FileOutputStream = null
  private var ts: TimeTrackingOutputStream = null
  private var objOut: SerializationStream = null
  private var initialized = false
  private var streamOpen = false
  private var hasBeenClosed = false

  /**
   * Cursors used to represent positions in the file.
   *
   * xxxxxxxxxx|----------|-----|
   *           ^          ^     ^
   *           |          |    channel.position()
   *           |        reportedPosition
   *         committedPosition
   *
   * reportedPosition: Position at the time of the last update to the write metrics.
   * committedPosition: Offset after last committed write.
   * -----: Current writes to the underlying file.
   * xxxxx: Committed contents of the file.
   */
  private var committedPosition = file.length()
  private var reportedPosition = committedPosition

  /**
   * Keep track of number of records written and also use this to periodically
   * output bytes written since the latter is expensive to do for each record.
   */
  private var numRecordsWritten = 0
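
A hypothetical trace of how the cursors move, assuming a fresh (empty) file and illustrative byte counts:

  // file.length() == 0     => committedPosition = 0, reportedPosition = 0
  // write ~300 bytes       => channel.position() == 300, both cursors unchanged
  // updateBytesWritten()   => reportedPosition = 300 (metrics incremented by 300)
  // commitAndGet()         => committedPosition = 300, returns segment (offset 0, length 300)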

initialize

  private def initialize(): Unit = {
    fos = new FileOutputStream(file, true)
    channel = fos.getChannel()
    ts = new TimeTrackingOutputStream(writeMetrics, fos)
    class ManualCloseBufferedOutputStream
      extends BufferedOutputStream(ts, bufferSize) with ManualCloseOutputStream
    mcs = new ManualCloseBufferedOutputStream
  }

open

  def open(): DiskBlockObjectWriter = {
    if (hasBeenClosed) {
      throw new IllegalStateException("Writer already closed. Cannot be reopened.")
    }
    if (!initialized) {
      initialize()
      initialized = true
    }
    bs = serializerManager.wrapStream(blockId, mcs)
    objOut = serializerInstance.serializeStream(bs)
    streamOpen = true
    this
  }
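
After open(), writes pass through a stack of five streams before reaching the file. A sketch of the layering, outermost first:

  // objOut : SerializationStream               -- Java/Kryo serialization
  //   bs    : OutputStream                     -- compression/encryption via serializerManager.wrapStream
  //     mcs : ManualCloseBufferedOutputStream  -- buffering (bufferSize); close() downgraded to flush()
  //       ts : TimeTrackingOutputStream        -- adds time spent writing to writeMetrics
  //         fos : FileOutputStream(file, true) -- appends to the underlying file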

closeResources

  /**
   * Close and cleanup all resources.
   * Should call after committing or reverting partial writes.
   */
  private def closeResources(): Unit = {
    if (initialized) {
      mcs.manualClose()
      channel = null
      mcs = null
      bs = null
      fos = null
      ts = null
      objOut = null
      initialized = false
      streamOpen = false
      hasBeenClosed = true
    }
  }

close

  /**
   * Commits any remaining partial writes and closes resources.
   */
  override def close() {
    if (initialized) {
      Utils.tryWithSafeFinally {
        commitAndGet()
      } {
        closeResources()
      }
    }
  }
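
Utils.tryWithSafeFinally ensures closeResources() runs even when commitAndGet() throws, without letting a secondary exception from the cleanup mask the original one. A minimal sketch of the same idea in plain Scala; tryWithSafeFinallySketch is an illustrative name, not the Spark implementation:

  def tryWithSafeFinallySketch[T](body: => T)(finallyBlock: => Unit): T = {
    var thrown: Throwable = null
    try {
      body
    } catch {
      case t: Throwable =>
        thrown = t
        throw t
    } finally {
      try {
        finallyBlock
      } catch {
        // keep the original exception; attach the cleanup failure to it
        case t: Throwable =>
          if (thrown != null) thrown.addSuppressed(t) else throw t
      }
    }
  }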

commitAndGet

  /**
   * Flush the partial writes and commit them as a single atomic block.
   * A commit may write additional bytes to frame the atomic block.
   *
   * @return file segment with previous offset and length committed on this call.
   */
  def commitAndGet(): FileSegment = {
    if (streamOpen) {
      // NOTE: Because Kryo doesn't flush the underlying stream we explicitly flush both the
      //       serializer stream and the lower level stream.
      objOut.flush()
      bs.flush()
      objOut.close()
      streamOpen = false

      if (syncWrites) {
        // Force outstanding writes to disk and track how long it takes
        val start = System.nanoTime()
        fos.getFD.sync()
        writeMetrics.incWriteTime(System.nanoTime() - start)
      }

      val pos = channel.position()
      val fileSegment = new FileSegment(file, committedPosition, pos - committedPosition)
      committedPosition = pos
      // In certain compression codecs, more bytes are written after streams are closed
      writeMetrics.incBytesWritten(committedPosition - reportedPosition)
      reportedPosition = committedPosition
      fileSegment
    } else {
      new FileSegment(file, committedPosition, 0)
    }
  }
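
A worked example of two consecutive commits, assuming the writer appends to an initially empty file (byte counts are illustrative):

  // open(); write 100 bytes
  // commitAndGet() => FileSegment(file, offset = 0,   length = 100); committedPosition = 100
  // open(); write 250 bytes
  // commitAndGet() => FileSegment(file, offset = 100, length = 250); committedPosition = 350
  // commitAndGet() with no open stream => FileSegment(file, offset = 350, length = 0)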

revertPartialWritesAndClose

  /**
   * Reverts writes that haven't been committed yet. Callers should invoke this function
   * when there are runtime exceptions. This method will not throw, though it may be
   * unsuccessful in truncating written data.
   *
   * @return the file that this DiskBlockObjectWriter wrote to.
   */
  def revertPartialWritesAndClose(): File = {
    // Discard current writes. We do this by flushing the outstanding writes and then
    // truncating the file to its initial position.
    try {
      if (initialized) {
        writeMetrics.decBytesWritten(reportedPosition - committedPosition)
        writeMetrics.decRecordsWritten(numRecordsWritten)
        streamOpen = false
        closeResources()
      }

      val truncateStream = new FileOutputStream(file, true)
      try {
        truncateStream.getChannel.truncate(committedPosition)
        file
      } finally {
        truncateStream.close()
      }
    } catch {
      case e: Exception =>
        logError("Uncaught exception while reverting partial writes to file " + file, e)
        file
    }
  }
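
The revert relies on FileChannel.truncate to drop everything past committedPosition. A runnable sketch of the same operation on a scratch file; the path and sizes are illustrative:

  import java.io.{File, FileOutputStream}

  val f = File.createTempFile("segment", ".dat")
  val w = new FileOutputStream(f)
  w.write(new Array[Byte](10)) // 10 bytes, of which only 4 are "committed" below
  w.close()

  val committedPosition = 4L
  val truncateStream = new FileOutputStream(f, true) // append mode keeps existing bytes
  try {
    truncateStream.getChannel.truncate(committedPosition)
  } finally {
    truncateStream.close()
  }
  println(f.length()) // 4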

write

  /**
   * Writes a key-value pair.
   */
  def write(key: Any, value: Any) {
    if (!streamOpen) {
      open()
    }

    objOut.writeKey(key)
    objOut.writeValue(value)
    recordWritten()
  }

  override def write(kvBytes: Array[Byte], offs: Int, len: Int): Unit = {
    if (!streamOpen) {
      open()
    }

    bs.write(kvBytes, offs, len)
  }
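
Putting it together, a hypothetical caller drives the writer the way a shuffle writer would; writer, records, and the surrounding values are assumptions for illustration:

  val writer = blockManager.getDiskWriter(blockId, file, serializerInstance,
    bufferSize = 32 * 1024, writeMetrics)
  for ((key, value) <- records) {
    writer.write(key, value)          // the first write lazily calls open()
  }
  val segment = writer.commitAndGet() // offset/length of the bytes just written
  writer.close()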

  /**
   * Notify the writer that a record worth of bytes has been written with OutputStream#write.
   */
  def recordWritten(): Unit = {
    numRecordsWritten += 1
    writeMetrics.incRecordsWritten(1)

    if (numRecordsWritten % 16384 == 0) {
      updateBytesWritten()
    }
  }
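
Per the comment above, querying the file position for every record would be expensive, so the byte metrics are sampled once every 16384 records; a task writing a million records triggers only about 61 updates, and commitAndGet() folds in the final delta.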

  /**
   * Report the number of bytes written in this writer's shuffle write metrics.
   * Note that this is only valid before the underlying streams are closed.
   */
  private def updateBytesWritten() {
    val pos = channel.position()
    writeMetrics.incBytesWritten(pos - reportedPosition)
    reportedPosition = pos
  }

  // For testing
  private[spark] override def flush() {
    objOut.flush()
    bs.flush()
  }
}