Lesson 12: Spark Streaming source code walkthrough: executor fault tolerance


Topics in this session:

1. The executor-side WAL (write-ahead log)

2. Message replay

3. Other notes

 

StorageLevel.scala

Disk is considered only when memory is insufficient; the StorageLevel flags below control this behaviour (a small sketch follows the class signature).

class StorageLevel private(
    private var _useDisk: Boolean,
    private var _useMemory: Boolean,
    private var _useOffHeap: Boolean,
    private var _deserialized: Boolean,
    private var _replication: Int = 1)
  extends Externalizable {
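As a quick, hedged illustration (not part of the Spark source above), the common StorageLevel constants are simply presets of these flags. MEMORY_AND_DISK_SER_2, for example, prefers memory, spills to disk only when memory runs out, keeps data serialized, and replicates to two executors:

import org.apache.spark.storage.StorageLevel

// MEMORY_AND_DISK_SER_2 = (useDisk = true, useMemory = true, useOffHeap = false,
// deserialized = false, replication = 2): memory first, disk only as a fallback.
val level = StorageLevel.MEMORY_AND_DISK_SER_2
println((level.useMemory, level.useDisk, level.deserialized, level.replication))
// (true, true, false, 2)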

ReceiverSupervisorImpl.scala

/** Store the bytes of received data as a data block into Spark's memory. */
def pushBytes(
    bytes: ByteBuffer,
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ) {
  pushAndReportBlock(ByteBufferBlock(bytes), metadataOption, blockIdOption)
}

/** Store block and report it to driver */
def pushAndReportBlock(
    receivedBlock: ReceivedBlock,
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ) {
  val blockId = blockIdOption.getOrElse(nextBlockId)
  val time = System.currentTimeMillis
  val blockStoreResult = receivedBlockHandler.storeBlock(blockId, receivedBlock)
  logDebug(s"Pushed block $blockId in ${(System.currentTimeMillis - time)} ms")
  val numRecords = blockStoreResult.numRecords
  val blockInfo = ReceivedBlockInfo(streamId, numRecords, metadataOption, blockStoreResult)
  trackerEndpoint.askWithRetry[Boolean](AddBlock(blockInfo))
  logDebug(s"Reported block $blockId")
}
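A hedged sketch of how data reaches pushBytes/pushAndReportBlock: a user-defined receiver calls the store(...) methods of the Receiver API, and ReceiverSupervisorImpl turns each variant into the corresponding ReceivedBlock subtype. The receiver class and its data below are hypothetical; a real receiver would produce data from a background thread started in onStart().

import java.nio.ByteBuffer
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

class SketchReceiver extends Receiver[String](StorageLevel.MEMORY_AND_DISK_SER_2) {
  override def onStart(): Unit = {
    store("one record")                              // single items are buffered and later pushed as an ArrayBufferBlock
    store(Iterator("a", "b", "c"))                   // pushed as an IteratorBlock
    store(ByteBuffer.wrap("raw".getBytes("UTF-8")))  // reaches pushBytes as a ByteBufferBlock
  }
  override def onStop(): Unit = {}
}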

receivedBlockHandler (ReceiverSupervisorImpl.scala): a WAL-based handler is chosen when the receiver write-ahead log is enabled, otherwise a plain BlockManager-based handler.

private val receivedBlockHandler: ReceivedBlockHandler = {
  if (WriteAheadLogUtils.enableReceiverLog(env.conf)) {
    if (checkpointDirOption.isEmpty) {
      throw new SparkException(
        "Cannot enable receiver write-ahead log without checkpoint directory set. " +
          "Please use streamingContext.checkpoint() to set the checkpoint directory. " +
          "See documentation for more details.")
    }
    new WriteAheadLogBasedBlockHandler(env.blockManager, receiver.streamId,
      receiver.storageLevel, env.conf, hadoopConf, checkpointDirOption.get)
  } else {
    new BlockManagerBasedBlockHandler(env.blockManager, receiver.storageLevel)
  }
}

def enableReceiverLog(conf: SparkConf): Boolean = {
  conf.getBoolean(RECEIVER_WAL_ENABLE_CONF_KEY, false)
}
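A hedged sketch of enabling the receiver WAL from user code. The key "spark.streaming.receiver.writeAheadLog.enable" is what RECEIVER_WAL_ENABLE_CONF_KEY resolves to, and the checkpoint directory is required, otherwise receivedBlockHandler above throws the SparkException shown. App name, path, and batch interval are placeholders.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf()
  .setAppName("WALDemo")
  .set("spark.streaming.receiver.writeAheadLog.enable", "true")
val ssc = new StreamingContext(conf, Seconds(5))
ssc.checkpoint("hdfs://namenode:8020/checkpoints/wal-demo")  // placeholder path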

ReceivedBlockHandler.scala: BlockManagerBasedBlockHandler.storeBlock

def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {

  var numRecords = None: Option[Long]

  val putResult: Seq[(BlockId, BlockStatus)] = block match {
    case ArrayBufferBlock(arrayBuffer) =>
      numRecords = Some(arrayBuffer.size.toLong)
      blockManager.putIterator(blockId, arrayBuffer.iterator, storageLevel,
        tellMaster = true)
    case IteratorBlock(iterator) =>
      val countIterator = new CountingIterator(iterator)
      val putResult = blockManager.putIterator(blockId, countIterator, storageLevel,
        tellMaster = true)
      numRecords = countIterator.count
      putResult
    case ByteBufferBlock(byteBuffer) =>
      blockManager.putBytes(blockId, byteBuffer, storageLevel, tellMaster = true)
    case o =>
      throw new SparkException(
        s"Could not store $blockId to block manager, unexpected block type ${o.getClass.getName}")
  }
  if (!putResult.map { _._1 }.contains(blockId)) {
    throw new SparkException(
      s"Could not store $blockId to block manager with storage level $storageLevel")
  }
  BlockManagerBasedStoreResult(blockId, numRecords)
}

// BlockManager.putIterator (called by BlockManagerBasedBlockHandler.storeBlock above)
def putIterator(
    blockId: BlockId,
    values: Iterator[Any],
    level: StorageLevel,
    tellMaster: Boolean = true,
    effectiveStorageLevel: Option[StorageLevel] = None): Seq[(BlockId, BlockStatus)] = {
  require(values != null, "Values is null")
  doPut(blockId, IteratorValues(values), level, tellMaster, effectiveStorageLevel)
}

FileBasedWriteAheadLog

write

/**
 * Write a byte buffer to the log file. This method synchronously writes the data in the
 * ByteBuffer to HDFS. When this method returns, the data is guaranteed to have been flushed
 * to HDFS, and will be available for readers to read.
 */
def write(byteBuffer: ByteBuffer, time: Long): FileBasedWriteAheadLogSegment = synchronized {
  var fileSegment: FileBasedWriteAheadLogSegment = null
  var failures = 0
  var lastException: Exception = null
  var succeeded = false
  while (!succeeded && failures < maxFailures) {
    try {
      fileSegment = getLogWriter(time).write(byteBuffer)
      if (closeFileAfterWrite) {
        resetWriter()
      }
      succeeded = true
    } catch {
      case ex: Exception =>
        lastException = ex
        logWarning("Failed to write to write ahead log")
        resetWriter()
        failures += 1
    }
  }
  if (fileSegment == null) {
    logError(s"Failed to write to write ahead log after $failures failures")
    throw lastException
  }
  fileSegment
}

/** Get the current log writer while taking care of rotation */
private def getLogWriter(currentTime: Long): FileBasedWriteAheadLogWriter = synchronized {
  if (currentLogWriter == null || currentTime > currentLogWriterStopTime) {
    resetWriter()
    currentLogPath.foreach {
      pastLogs += LogInfo(currentLogWriterStartTime, currentLogWriterStopTime, _)
    }
    currentLogWriterStartTime = currentTime
    currentLogWriterStopTime = currentTime + (rollingIntervalSecs * 1000)
    val newLogPath = new Path(logDirectory,
      timeToLogFile(currentLogWriterStartTime, currentLogWriterStopTime))
    currentLogPath = Some(newLogPath.toString)
    currentLogWriter = new FileBasedWriteAheadLogWriter(currentLogPath.get, hadoopConf)
  }
  currentLogWriter
}
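The rotation interval and the retry/close behaviour seen in write() and getLogWriter() are configurable. A hedged sketch, assuming the receiver-side WriteAheadLogUtils keys below map to rollingIntervalSecs, maxFailures and closeFileAfterWrite; the values are placeholders:

import org.apache.spark.SparkConf

val walConf = new SparkConf()
  .set("spark.streaming.receiver.writeAheadLog.rollingIntervalSecs", "60")    // roll a new log file every 60s
  .set("spark.streaming.receiver.writeAheadLog.maxFailures", "3")             // write() retries before giving up
  .set("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite", "false") // close the file after each write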

readAll

def readAll(): JIterator[ByteBuffer] = synchronized {
  val logFilesToRead = pastLogs.map { _.path } ++ currentLogPath
  logInfo("Reading from the logs:\n" + logFilesToRead.mkString("\n"))
  def readFile(file: String): Iterator[ByteBuffer] = {
    logDebug(s"Creating log reader with $file")
    val reader = new FileBasedWriteAheadLogReader(file, hadoopConf)
    CompletionIterator[ByteBuffer, Iterator[ByteBuffer]](reader, reader.close _)
  }
  if (!closeFileAfterWrite) {
    logFilesToRead.iterator.map(readFile).flatten.asJava
  } else {
    // For performance gains, it makes sense to parallelize the recovery if
    // closeFileAfterWrite = true
    seqToParIterator(threadpool, logFilesToRead, readFile).asJava
  }
}

FileBasedWriteAheadLogReader

private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration)
  extends Iterator[ByteBuffer] with Closeable with Logging {

  private val instream = HdfsUtils.getInputStream(path, conf)
  private var closed = (instream == null) // the file may be deleted as we're opening the stream
  private var nextItem: Option[ByteBuffer] = None

  override def hasNext: Boolean = synchronized {
    if (closed) {
      return false
    }

    if (nextItem.isDefined) { // handle the case where hasNext is called without calling next
      true
    } else {
      try {
        val length = instream.readInt()
        val buffer = new Array[Byte](length)
        instream.readFully(buffer)
        nextItem = Some(ByteBuffer.wrap(buffer))
        logTrace("Read next item " + nextItem.get)
        true
      } catch {
        case e: EOFException =>
          logDebug("Error reading next item, EOF reached", e)
          close()
          false
        case e: IOException =>
          logWarning("Error while trying to read data. If the file was deleted, " +
            "this should be okay.", e)
          close()
          if (HdfsUtils.checkFileExists(path, conf)) {
            // If file exists, this could be a legitimate error
            throw e
          } else {
            // File was deleted. This can occur when the daemon cleanup thread takes time to
            // delete the file during recovery.
            false
          }

        case e: Exception =>
          logWarning("Error while trying to read data from HDFS.", e)
          close()
          throw e
      }
    }
  }

HdfsUtils.getInputStream

def getInputStream(path: String, conf: Configuration): FSDataInputStream = {
  val dfsPath = new Path(path)
  val dfs = getFileSystemForPath(dfsPath, conf)
  if (dfs.isFile(dfsPath)) {
    try {
      dfs.open(dfsPath)
    } catch {
      case e: IOException =>
        // If we are really unlucky, the file may be deleted as we're opening the stream.
        // This can happen as clean up is performed by daemon threads that may be left over from
        // previous runs.
        if (!dfs.isFile(dfsPath)) null else throw e
    }
  } else {
    null
  }
}

DirectKafkaInputDStream

@tailrec
protected final def latestLeaderOffsets(retries: Int): Map[TopicAndPartition, LeaderOffset] = {
  val o = kc.getLatestLeaderOffsets(currentOffsets.keySet)
  // Either.fold would confuse @tailrec, do it manually
  if (o.isLeft) {
    val err = o.left.get.toString
    if (retries <= 0) {
      throw new SparkException(err)
    } else {
      log.error(err)
      Thread.sleep(kc.config.refreshLeaderBackoffMs)
      latestLeaderOffsets(retries - 1)
    }
  } else {
    o.right.get
  }
}
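For message replay, the direct approach skips the receiver and the WAL entirely: the driver computes the offset range of each batch (using latestLeaderOffsets above), and a lost batch is recomputed by reading the same offsets from Kafka again. A hedged sketch of creating such a stream; the broker address and topic name are placeholders, and ssc is the StreamingContext from the WAL sketch earlier.

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

val kafkaParams = Map("metadata.broker.list" -> "broker1:9092")  // placeholder broker
val topics = Set("logs")                                         // placeholder topic
val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
  ssc, kafkaParams, topics)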

KafkaRDD

private[kafka]
class KafkaRDD[
  K: ClassTag,
  V: ClassTag,
  U <: Decoder[_]: ClassTag,
  T <: Decoder[_]: ClassTag,
  R: ClassTag] private[spark] (
    sc: SparkContext,
    kafkaParams: Map[String, String],
    val offsetRanges: Array[OffsetRange],
    leaders: Map[TopicAndPartition, (String, Int)],
    messageHandler: MessageAndMetadata[K, V] => R
  ) extends RDD[R](sc, Nil) with Logging with HasOffsetRanges {

  override def getPartitions: Array[Partition] = {
    offsetRanges.zipWithIndex.map { case (o, i) =>
      val (host, port) = leaders(TopicAndPartition(o.topic, o.partition))
      new KafkaRDDPartition(i, o.topic, o.partition, o.fromOffset, o.untilOffset, host, port)
    }.toArray
  }
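Because a KafkaRDD partition is fully described by its offset range, replaying data is just rebuilding an RDD from saved offsets. A hedged sketch using KafkaUtils.createRDD; topic, partition and offsets are placeholders, sc is a SparkContext, and kafkaParams is as in the previous sketch.

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}

val ranges = Array(OffsetRange("logs", 0, fromOffset = 0L, untilOffset = 100L))  // placeholder range
val replayed = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
  sc, kafkaParams, ranges)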

getPreferredLocations

override def getPreferredLocations(thePart: Partition): Seq[String] = {
  val part = thePart.asInstanceOf[KafkaRDDPartition]
  // TODO is additional hostname resolution necessary here
  Seq(part.host)
}

connectLeader

// The idea is to use the provided preferred host, except on task retry attempts,
// to minimize number of kafka metadata requests
private def connectLeader: SimpleConsumer = {
  if (context.attemptNumber > 0) {
    kc.connectLeader(part.topic, part.partition).fold(
      errs => throw new SparkException(
        s"Couldn't connect to leader for topic ${part.topic} ${part.partition}: " +
          errs.mkString("\n")),
      consumer => consumer
    )
  } else {
    kc.connect(part.host, part.port)
  }
}

 
