第12课:Spark Streaming源码解读之Executor容错安全性
来源:互联网 发布:淘宝能不能换身份证 编辑:程序博客网 时间:2024/05/22 15:52
本期内容:
1.Executor的WAL
2.消息重放
3.其它
StorageLevel.scala
Memory不够的时候才考虑disk
classStorageLevelprivate(
private var _useDisk: Boolean,
private var _useMemory: Boolean,
private var _useOffHeap: Boolean,
private var _deserialized: Boolean,
private var _replication: Int =1)
extends Externalizable {
ReceiverSupervisorImpl.scala
/** Store the bytes of received data as a data block into Spark's memory. */
def pushBytes(
    bytes: ByteBuffer,
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ) {
  // Wrap the raw buffer and delegate to the common store-and-report path.
  val block = ByteBufferBlock(bytes)
  pushAndReportBlock(block, metadataOption, blockIdOption)
}
/**
 * Store a received block through the configured [[ReceivedBlockHandler]] and
 * report it to the driver's ReceiverTracker so it can be allocated to a batch.
 */
def pushAndReportBlock(
    receivedBlock: ReceivedBlock,
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ) {
  // Generate a block id when the receiver did not supply one.
  val blockId = blockIdOption.getOrElse(nextBlockId)
  val time = System.currentTimeMillis
  // Store via BlockManager only, or BlockManager + write-ahead log,
  // depending on which handler was configured (see receivedBlockHandler).
  val blockStoreResult = receivedBlockHandler.storeBlock(blockId, receivedBlock)
  logDebug(s"Pushed block $blockId in ${(System.currentTimeMillis - time)} ms")
  val numRecords = blockStoreResult.numRecords
  val blockInfo = ReceivedBlockInfo(streamId, numRecords, metadataOption, blockStoreResult)
  // Synchronously notify the driver of the new block; retried on RPC failure.
  trackerEndpoint.askWithRetry[Boolean](AddBlock(blockInfo))
  logDebug(s"Reported block $blockId")
}
receivedBlockHandler
// Chooses the block handler: when the receiver write-ahead log is enabled,
// blocks are written both to the BlockManager and to a WAL under the checkpoint
// directory; otherwise they are stored through the BlockManager alone.
private val receivedBlockHandler: ReceivedBlockHandler = {
  if (WriteAheadLogUtils.enableReceiverLog(env.conf)) {
    if (checkpointDirOption.isEmpty) {
      // The WAL lives under the checkpoint directory, so one must be set.
      throw new SparkException(
        "Cannot enable receiver write-ahead log without checkpoint directory set. " +
          "Please use streamingContext.checkpoint() to set the checkpoint directory. " +
          "See documentation for more details.")
    }
    new WriteAheadLogBasedBlockHandler(env.blockManager, receiver.streamId,
      receiver.storageLevel, env.conf, hadoopConf, checkpointDirOption.get)
  } else {
    new BlockManagerBasedBlockHandler(env.blockManager, receiver.storageLevel)
  }
}
/** Whether the receiver-side write-ahead log is turned on in the given conf. */
def enableReceiverLog(conf: SparkConf): Boolean = {
  // Defaults to false when the configuration key is absent.
  val walEnabled = conf.getBoolean(RECEIVER_WAL_ENABLE_CONF_KEY, false)
  walEnabled
}
ReceivedBlockHandler
/**
 * Store the given block through the BlockManager, counting records where the
 * block type makes that possible, and return a result describing the store.
 *
 * Throws SparkException for an unknown block type or when the BlockManager
 * did not actually store the block (e.g. insufficient memory, no disk fallback).
 */
def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {
  var numRecords = None: Option[Long]
  val putResult: Seq[(BlockId, BlockStatus)] = block match {
    case ArrayBufferBlock(arrayBuffer) =>
      // Record count is known up front for buffered records.
      numRecords = Some(arrayBuffer.size.toLong)
      blockManager.putIterator(blockId, arrayBuffer.iterator, storageLevel,
        tellMaster = true)
    case IteratorBlock(iterator) =>
      // Wrap the iterator so the record count is available once it is drained.
      val countIterator = new CountingIterator(iterator)
      val putResult = blockManager.putIterator(blockId, countIterator, storageLevel,
        tellMaster = true)
      numRecords = countIterator.count
      putResult
    case ByteBufferBlock(byteBuffer) =>
      // Raw bytes: the record count cannot be determined.
      blockManager.putBytes(blockId, byteBuffer, storageLevel, tellMaster = true)
    case o =>
      throw new SparkException(
        s"Could not store $blockId to block manager, unexpected block type ${o.getClass.getName}")
  }
  // The put result lists the blocks actually stored; if ours is missing, the
  // store failed and the receiver must not report the block to the driver.
  if (!putResult.map { _._1 }.contains(blockId)) {
    throw new SparkException(
      s"Could not store $blockId to block manager with storage level $storageLevel")
  }
  BlockManagerBasedStoreResult(blockId, numRecords)
}
/**
 * Put a block of values, supplied as an iterator, into the block manager.
 *
 * @param effectiveStorageLevel optional level to actually persist at, when it
 *                              differs from the level reported to the master.
 */
def putIterator(
    blockId: BlockId,
    values: Iterator[Any],
    level: StorageLevel,
    tellMaster: Boolean = true,
    effectiveStorageLevel: Option[StorageLevel] = None): Seq[(BlockId, BlockStatus)] = {
  require(values != null, "Values is null")
  // Wrap the iterator in the tagged value type expected by doPut.
  val iteratorValues = IteratorValues(values)
  doPut(blockId, iteratorValues, level, tellMaster, effectiveStorageLevel)
}
FileBasedWriteAheadLog
write
/**
 * Write a byte buffer to the log file. This method synchronously writes the data in the
 * ByteBuffer to HDFS. When this method returns, the data is guaranteed to have been flushed
 * to HDFS, and will be available for readers to read.
 */
def write(byteBuffer: ByteBuffer, time: Long): FileBasedWriteAheadLogSegment = synchronized {
  var fileSegment: FileBasedWriteAheadLogSegment = null
  var failures = 0
  var lastException: Exception = null
  var succeeded = false
  // Retry the write up to maxFailures times, resetting the writer after each
  // failure so a fresh log file/stream is used on the next attempt.
  while (!succeeded && failures < maxFailures) {
    try {
      fileSegment = getLogWriter(time).write(byteBuffer)
      if (closeFileAfterWrite) {
        resetWriter()
      }
      succeeded = true
    } catch {
      case ex: Exception =>
        lastException = ex
        logWarning("Failed to write to write ahead log")
        resetWriter()
        failures += 1
    }
  }
  if (fileSegment == null) {
    // All attempts failed; surface the last error to the caller.
    logError(s"Failed to write to write ahead log after $failures failures")
    throw lastException
  }
  fileSegment
}
/** Get the current log writer while taking care of rotation. */
private def getLogWriter(currentTime: Long): FileBasedWriteAheadLogWriter = synchronized {
  // Roll to a new log file when there is no writer yet or the current file's
  // time window has elapsed.
  if (currentLogWriter == null || currentTime > currentLogWriterStopTime) {
    resetWriter()
    // Remember the file just closed so readAll/cleanup can find it later.
    currentLogPath.foreach {
      pastLogs += LogInfo(currentLogWriterStartTime, currentLogWriterStopTime, _)
    }
    currentLogWriterStartTime = currentTime
    currentLogWriterStopTime = currentTime + (rollingIntervalSecs * 1000)
    // Encode the [start, stop] window into the new log file's name.
    val newLogPath = new Path(logDirectory,
      timeToLogFile(currentLogWriterStartTime, currentLogWriterStopTime))
    currentLogPath = Some(newLogPath.toString)
    currentLogWriter = new FileBasedWriteAheadLogWriter(currentLogPath.get, hadoopConf)
  }
  currentLogWriter
}
readAll
/**
 * Read all written records from every past log file plus the current one, in
 * order, as a single Java iterator of byte buffers (used for WAL recovery).
 */
def readAll(): JIterator[ByteBuffer] = synchronized {
  val logFilesToRead = pastLogs.map { _.path } ++ currentLogPath
  logInfo("Reading from the logs:\n" + logFilesToRead.mkString("\n"))
  def readFile(file: String): Iterator[ByteBuffer] = {
    logDebug(s"Creating log reader with $file")
    val reader = new FileBasedWriteAheadLogReader(file, hadoopConf)
    // Close the reader automatically once its records are exhausted.
    CompletionIterator[ByteBuffer, Iterator[ByteBuffer]](reader, reader.close _)
  }
  if (!closeFileAfterWrite) {
    logFilesToRead.iterator.map(readFile).flatten.asJava
  } else {
    // For performance gains, it makes sense to parallelize the recovery if
    // closeFileAfterWrite = true
    seqToParIterator(threadpool, logFilesToRead, readFile).asJava
  }
}
FileBasedWriteAheadLogReader
private[streaming]class FileBasedWriteAheadLogReader(path:String, conf: Configuration)
extends Iterator[ByteBuffer]with Closeablewith Logging {
private val instream= HdfsUtils.getInputStream(path, conf)
private var closed= (instream== null)// the file may be deleted as we're opening the stream
private var nextItem: Option[ByteBuffer] =None
/**
 * Advance to the next length-prefixed record if needed, returning whether one
 * is available. EOF closes the reader normally; an IOException on a file that
 * no longer exists is treated as a benign concurrent deletion.
 */
override def hasNext: Boolean = synchronized {
  if (closed) {
    return false
  }
  if (nextItem.isDefined) { // handle the case where hasNext is called without calling next
    true
  } else {
    try {
      // Records are stored as a 4-byte length followed by that many bytes.
      val length = instream.readInt()
      val buffer = new Array[Byte](length)
      instream.readFully(buffer)
      nextItem = Some(ByteBuffer.wrap(buffer))
      logTrace("Read next item " + nextItem.get)
      true
    } catch {
      case e: EOFException =>
        // Normal end of the log file.
        logDebug("Error reading next item, EOF reached", e)
        close()
        false
      case e: IOException =>
        logWarning("Error while trying to read data. If the file was deleted, " +
          "this should be okay.", e)
        close()
        if (HdfsUtils.checkFileExists(path, conf)) {
          // If file exists, this could be a legitimate error
          throw e
        } else {
          // File was deleted. This can occur when the daemon cleanup thread takes time to
          // delete the file during recovery.
          false
        }
      case e: Exception =>
        logWarning("Error while trying to read data from HDFS.", e)
        close()
        throw e
    }
  }
}
getInputStream
/**
 * Open an FSDataInputStream for the given path, returning null when the path
 * is not (or no longer) a file — the WAL cleanup daemon may delete log files
 * concurrently with readers opening them.
 */
def getInputStream(path: String, conf: Configuration): FSDataInputStream = {
  val dfsPath = new Path(path)
  val dfs = getFileSystemForPath(dfsPath, conf)
  if (dfs.isFile(dfsPath)) {
    try {
      dfs.open(dfsPath)
    } catch {
      case e: IOException =>
        // If we are really unlucky, the file may be deleted as we're opening the stream.
        // This can happen as clean up is performed by daemon threads that may be left over from
        // previous runs.
        if (!dfs.isFile(dfsPath)) null else throw e
    }
  } else {
    null
  }
}
DirectKafkaInputDStream
/**
 * Ask Kafka for the latest leader offsets of all currently-tracked partitions,
 * retrying up to `retries` more times with a backoff sleep when lookup fails.
 * Throws SparkException once the retries are exhausted.
 */
@tailrec
protected final def latestLeaderOffsets(retries: Int): Map[TopicAndPartition, LeaderOffset] = {
  val o = kc.getLatestLeaderOffsets(currentOffsets.keySet)
  // Either.fold would confuse @tailrec, do it manually
  if (o.isLeft) {
    val err = o.left.get.toString
    if (retries <= 0) {
      // Out of retries: give up and propagate the error.
      throw new SparkException(err)
    } else {
      log.error(err)
      // Give the cluster time to elect a new leader before retrying.
      Thread.sleep(kc.config.refreshLeaderBackoffMs)
      latestLeaderOffsets(retries - 1)
    }
  } else {
    o.right.get
  }
}
KafkaRDD
private[kafka]
class KafkaRDD[
K: ClassTag,
V: ClassTag,
U <: Decoder[_]: ClassTag,
T <: Decoder[_]: ClassTag,
R: ClassTag] private[spark] (
sc: SparkContext,
kafkaParams: Map[String, String],
val offsetRanges: Array[OffsetRange],
leaders: Map[TopicAndPartition, (String, Int)],
messageHandler: MessageAndMetadata[K, V] => R
) extends RDD[R](sc, Nil)with Loggingwith HasOffsetRanges {
/** One RDD partition per Kafka offset range, pinned to that range's leader. */
override def getPartitions: Array[Partition] = {
  offsetRanges.zipWithIndex.map { case (range, index) =>
    val (host, port) = leaders(TopicAndPartition(range.topic, range.partition))
    new KafkaRDDPartition(
      index, range.topic, range.partition, range.fromOffset, range.untilOffset, host, port)
  }.toArray
}
getPreferredLocations
/** Prefer scheduling the task on the Kafka leader host for this partition. */
override def getPreferredLocations(thePart: Partition): Seq[String] = {
  val part = thePart.asInstanceOf[KafkaRDDPartition]
  // TODO is additional hostname resolution necessary here
  Seq(part.host)
}
connectLeader
// The idea is to use the provided preferred host, except on task retry attempts,
// to minimize number of kafka metadata requests
private def connectLeader: SimpleConsumer = {
  if (context.attemptNumber > 0) {
    // On a retry the cached leader may be stale; look the leader up again.
    kc.connectLeader(part.topic, part.partition).fold(
      errs => throw new SparkException(
        s"Couldn't connect to leader for topic ${part.topic} ${part.partition}: " +
          errs.mkString("\n")),
      consumer => consumer
    )
  } else {
    kc.connect(part.host, part.port)
  }
}
- Spark定制班第12课:Spark Streaming源码解读之Executor容错安全性
- 第12课:Spark Streaming源码解读之Executor容错安全性
- 第12课 :Spark Streaming源码解读之Executor容错安全性
- 第12课:Spark Streaming源码解读之executor容错安全性
- 第12课:Spark Streaming源码解读之Executor容错安全性
- 12、Spark Streaming源码解读之Executor容错安全性
- Spark Streaming源码解读之Executor容错安全性
- Spark Streaming源码解读之Executor容错安全性
- Spark 定制版:012~Spark Streaming源码解读之Executor容错安全性
- Spark定制班第13课:Spark Streaming源码解读之Driver容错安全性
- 第13课:Spark Streaming源码解读之Driver容错安全性
- 第13课:Spark Streaming 源码解读之Driver 容错安全性
- 第13课:Spark Streaming源码解读之Driver容错安全性
- 第13课:Spark Streaming源码解读之Driver容错安全性
- 第13课:Spark Streaming源码解读之Driver容错安全性
- Spark Streaming之Executor容错安全性
- Spark Streaming源码解读之Driver容错安全性
- Spark Streaming源码解读之Driver容错安全性
- ASP.Net原理
- 第11课:Spark Streaming源码解读之Driver中的ReceiverTracker架构设计以及具体实现彻底研究
- 百度之星 初赛2 瞬间转移 [杨辉三角]
- Arch LInux 系统迁移
- innodb_flush_log_at_trx_commit不同参数值下的性能测试
- 第12课:Spark Streaming源码解读之executor容错安全性
- listView 点击事件实现方法
- 动态规划:最长公共子串长度
- Hdu5696 区间的价值(花式水)
- HTTP协议
- CodeForces 23A-You're Given a String...
- vim 格式化文本,调整缩进
- 第一篇文章
- 页面未随软键盘上升及android隐藏软键盘总结