第11课:Spark Streaming源码解读之Driver中的ReceiverTracker架构设计以及具体实现彻底研究
来源:互联网 发布:淘宝能不能换身份证 编辑:程序博客网 时间:2024/05/17 15:04
本期内容:
1.ReceiverTracker的架构设计
2.消息循环系统
3.ReceiverTracker具体实现
启动Receiver的方式:
1.把每个Receiver都封装成为task,这个task是这个job中唯一的task,实质上讲ReceiverTracker启动Receiver的方式就是封装成一个一个的job,有多少个job就会启动多少Receiver。每个task就一条数据,就是Receiver的数据。
2.ReceiverTracker在启动Receiver的时候有一个ReceiverSupervisor,ReceiverSupervisorImpl作为ReceiverSupervisor的实现。ReceiverSupervisor在启动的时候会启动Receiver,然后Receiver不断地接收数据,并通过BlockGenerator把自己接收的数据变成一个一个的block,背后自己有个定时器,这个定时器会不断地存储数据。存储方式有两种:一种是直接通过BlockManager存储(BlockManagerBasedBlockHandler),一种是先写日志WAL(WriteAheadLogBasedBlockHandler)。ReceiverSupervisorImpl会把存储的元数据汇报给ReceiverTracker(实际上是ReceiverTracker中的RPC通信消息实体),后面再进行下一步的数据管理工作。
数据的大小一般从多少记录考虑,例如10亿级别的
源码
ReceivedBlockHandler
// Received data is written through a ReceivedBlockHandler.
/**
 * Handler used to store received blocks.
 *
 * When the receiver write-ahead log is enabled in the configuration, a
 * WriteAheadLogBasedBlockHandler is created; this requires a checkpoint directory and a
 * SparkException is thrown if none is set. Otherwise a BlockManagerBasedBlockHandler is used.
 */
private val receivedBlockHandler: ReceivedBlockHandler =
  if (!WriteAheadLogUtils.enableReceiverLog(env.conf)) {
    new BlockManagerBasedBlockHandler(env.blockManager, receiver.storageLevel)
  } else {
    // The write-ahead log cannot be used without a checkpoint directory.
    val checkpointDir = checkpointDirOption.getOrElse {
      throw new SparkException(
        "Cannot enable receiver write-ahead log without checkpoint directory set. " +
          "Please use streamingContext.checkpoint() to set the checkpoint directory. " +
          "See documentation for more details.")
    }
    new WriteAheadLogBasedBlockHandler(env.blockManager, receiver.streamId,
      receiver.storageLevel, env.conf, hadoopConf, checkpointDir)
  }
pushAndReportBlock
/**
 * Store a received block and report it to the driver.
 *
 * The block is stored through `receivedBlockHandler`; a `ReceivedBlockInfo` describing the
 * stored block is then sent to the tracker endpoint in an `AddBlock` message.
 *
 * @param receivedBlock  the block to store
 * @param metadataOption optional metadata carried along in the reported block info
 * @param blockIdOption  block id to use; when empty, `nextBlockId` supplies one
 */
def pushAndReportBlock(
    receivedBlock: ReceivedBlock,
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ): Unit = {
  val blockId = blockIdOption.getOrElse(nextBlockId)
  val time = System.currentTimeMillis
  val blockStoreResult = receivedBlockHandler.storeBlock(blockId, receivedBlock)
  logDebug(s"Pushed block $blockId in ${(System.currentTimeMillis - time)} ms")
  val numRecords = blockStoreResult.numRecords
  val blockInfo = ReceivedBlockInfo(streamId, numRecords, metadataOption, blockStoreResult)
  // The Boolean reply from the tracker is not inspected here.
  trackerEndpoint.askWithRetry[Boolean](AddBlock(blockInfo))
  logDebug(s"Reported block $blockId")
}
private[streaming]case class ReceivedBlockInfo(
streamId: Int,
numRecords: Option[Long],
metadataOption: Option[Any],
blockStoreResult: ReceivedBlockStoreResult
) {
/**
 * Remote RpcEndpointRef for the ReceiverTracker.
 *
 * Looked up through `RpcUtils.makeDriverRef` under the endpoint name "ReceiverTracker",
 * using this environment's configuration and RPC environment.
 */
private val trackerEndpoint= RpcUtils.makeDriverRef("ReceiverTracker", env.conf, env.rpcEnv)
ReceiverTracker是整个block管理的中心
//RPC消息循环体接收来自receiver的消息
private class ReceiverTrackerEndpoint(override valrpcEnv: RpcEnv) extendsThreadSafeRpcEndpoint {
// `sealed` means every subtype of this message trait must be declared in this same file.
/** Base type of the messages exchanged with the ReceiverTracker endpoint. */
private[streaming]sealed trait ReceiverTrackerMessage
/** Registration request carrying the stream id, receiver type, host, executor id and the receiver's endpoint. */
private[streaming]case class RegisterReceiver(
streamId: Int,
typ: String,
host: String,
executorId: String,
receiverEndpoint: RpcEndpointRef
) extends ReceiverTrackerMessage
/** Reports a newly stored block, described by `receivedBlockInfo`. */
private[streaming]case class AddBlock(receivedBlockInfo: ReceivedBlockInfo)
extends ReceiverTrackerMessage
// NOTE(review): unlike its siblings, ReportError does not extend ReceiverTrackerMessage
// here — confirm this is intentional.
/** Error report for a stream, with a message and an error string. */
private[streaming]case class ReportError(streamId: Int, message:String, error:String)
/** Request to deregister the receiver of a stream, with a message and an error string. */
private[streaming]case class DeregisterReceiver(streamId: Int, msg:String, error:String)
extends ReceiverTrackerMessage
receiveAndReply
/**
 * Handles messages that expect a reply.
 *
 * - `RegisterReceiver`: registers the receiver and replies with the outcome.
 * - `AddBlock`: replies with the result of `addBlock`. When write-ahead-log batching is
 *   enabled, the work is handed to `walBatchingThreadPool`. If the tracker is no longer
 *   active by the time the task runs, the caller is sent a failure instead of being left
 *   without any reply.
 * - `DeregisterReceiver`: deregisters the receiver and replies `true`.
 * - `AllReceiverIds`: replies with the ids of all receivers whose state is not INACTIVE.
 * - `StopAllReceivers`: stops the receivers and replies `true`; asserts the tracker is
 *   stopping or stopped.
 */
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
  // Remote messages
  case RegisterReceiver(streamId, typ, host, executorId, receiverEndpoint) =>
    val successful =
      registerReceiver(streamId, typ, host, executorId, receiverEndpoint, context.senderAddress)
    context.reply(successful)
  case AddBlock(receivedBlockInfo) =>
    if (WriteAheadLogUtils.isBatchingEnabled(ssc.conf, isDriver = true)) {
      walBatchingThreadPool.execute(new Runnable {
        override def run(): Unit = Utils.tryLogNonFatalError {
          if (active) {
            context.reply(addBlock(receivedBlockInfo))
          } else {
            // Throwing here would only be logged by tryLogNonFatalError and the sender
            // would never get an answer; report the failure to the sender explicitly.
            context.sendFailure(
              new IllegalStateException("ReceiverTracker RpcEndpoint shut down."))
          }
        }
      })
    } else {
      context.reply(addBlock(receivedBlockInfo))
    }
  case DeregisterReceiver(streamId, message, error) =>
    deregisterReceiver(streamId, message, error)
    context.reply(true)
  // Local messages
  case AllReceiverIds =>
    context.reply(receiverTrackingInfos.filter(_._2.state != ReceiverState.INACTIVE).keys.toSeq)
  case StopAllReceivers =>
    assert(isTrackerStopping || isTrackerStopped)
    stopReceivers()
    context.reply(true)
}
addBlock
/**
 * Record a newly received block by delegating to `receivedBlockTracker`.
 *
 * @return the Boolean result returned by the block tracker
 */
private def addBlock(receivedBlockInfo: ReceivedBlockInfo): Boolean =
  receivedBlockTracker.addBlock(receivedBlockInfo)
/**
 * Add received block. This event will get written to the write ahead log (if enabled).
 *
 * The block is appended to the unallocated queue of its stream only when the log write
 * succeeded.
 *
 * @return `true` if the event was logged and the block queued; `false` if the log write
 *         failed or a non-fatal exception was thrown
 */
def addBlock(receivedBlockInfo: ReceivedBlockInfo): Boolean = {
  try {
    val writeResult = writeToLog(BlockAdditionEvent(receivedBlockInfo))
    if (writeResult) {
      synchronized {
        getReceivedBlockQueue(receivedBlockInfo.streamId) += receivedBlockInfo
      }
      logDebug(s"Stream ${receivedBlockInfo.streamId} received " +
        s"block ${receivedBlockInfo.blockStoreResult.blockId}")
    } else {
      logDebug(s"Failed to acknowledge stream ${receivedBlockInfo.streamId} receiving " +
        s"block ${receivedBlockInfo.blockStoreResult.blockId} in the Write Ahead Log.")
    }
    writeResult
  } catch {
    case NonFatal(e) =>
      logError(s"Error adding block $receivedBlockInfo", e)
      false
  }
}
/**
 * Write an update to the tracker to the write ahead log.
 *
 * @param record the event to serialize and append
 * @return `true` if the write-ahead log is disabled or the write succeeded; `false` if a
 *         non-fatal exception was thrown while writing
 */
private def writeToLog(record: ReceivedBlockTrackerLogEvent): Boolean = {
  if (isWriteAheadLogEnabled) {
    logTrace(s"Writing record: $record")
    try {
      // NOTE(review): `.get` presumes writeAheadLogOption is defined whenever
      // isWriteAheadLogEnabled is true — confirm against their definitions.
      writeAheadLogOption.get.write(ByteBuffer.wrap(Utils.serialize(record)),
        clock.getTimeMillis())
      true
    } catch {
      case NonFatal(e) =>
        logWarning(s"Exception thrown while writing record: $record to the WriteAheadLog.", e)
        false
    }
  } else {
    true
  }
}
/**
 * Look up the queue of received blocks for `streamId`, creating and registering an empty
 * queue on first access.
 */
private def getReceivedBlockQueue(streamId: Int): ReceivedBlockQueue = {
  val queuesByStream = streamIdToUnallocatedBlockQueues
  queuesByStream.getOrElseUpdate(streamId, new ReceivedBlockQueue)
}
// Blocks held here have not been allocated to a batch yet; they are only recorded.
/** Per-stream queues of received blocks, keyed by stream id. */
private val streamIdToUnallocatedBlockQueues = new mutable.HashMap[Int, ReceivedBlockQueue]
/**
 * Hand all unallocated blocks over to the batch at `batchTime`.
 *
 * Does nothing when there are no receiver input streams; otherwise delegates to
 * `receivedBlockTracker`.
 */
def allocateBlocksToBatch(batchTime: Time): Unit =
  if (receiverInputStreams.nonEmpty) receivedBlockTracker.allocateBlocksToBatch(batchTime)
/**
 * Allocate every currently queued block to the batch at `batchTime`.
 *
 * Allocation only happens when no batch has been allocated yet or `batchTime` is later than
 * the last allocated batch time. The queues of all streams are drained, the allocation is
 * written to the write ahead log, and only if that write succeeds is the allocation recorded
 * and `lastAllocatedBatchTime` advanced.
 */
def allocateBlocksToBatch(batchTime: Time): Unit = synchronized {
  // lastAllocatedBatchTime may be null before the first allocation.
  if (Option(lastAllocatedBatchTime).forall(last => batchTime > last)) {
    val streamIdToBlocks = streamIds.map { streamId =>
      (streamId, getReceivedBlockQueue(streamId).dequeueAll(_ => true))
    }.toMap
    val allocatedBlocks = AllocatedBlocks(streamIdToBlocks)
    if (writeToLog(BatchAllocationEvent(batchTime, allocatedBlocks))) {
      timeToAllocatedBlocks.put(batchTime, allocatedBlocks)
      lastAllocatedBatchTime = batchTime
    } else {
      logInfo(s"Possibly processed batch $batchTime need to be processed again in WAL recovery")
    }
  } else {
    // This situation occurs when:
    // 1. WAL is ended with BatchAllocationEvent, but without BatchCleanupEvent,
    // possibly processed batch job or half-processed batch job need to be processed again,
    // so the batchTime will be equal to lastAllocatedBatchTime.
    // 2. Slow checkpointing makes recovered batch time older than WAL recovered
    // lastAllocatedBatchTime.
    // This situation will only occurs in recovery time.
    logInfo(s"Possibly processed batch $batchTime need to be processed again in WAL recovery")
  }
}
ReceiverSupervisorImpl.scala
private valendpoint = env.rpcEnv.setupEndpoint(
"Receiver-" +streamId +"-" + System.currentTimeMillis(),new ThreadSafeRpcEndpoint {
override val rpcEnv: RpcEnv = env.rpcEnv
receive
/**
 * Handles one-way messages sent to this receiver endpoint.
 *
 * - `StopReceiver`: stops the enclosing supervisor with the message "Stopped by driver".
 * - `CleanupOldBlocks`: calls `cleanupOldBlocks` with the given threshold time.
 * - `UpdateRateLimit`: forwards the new rate to every registered block generator.
 */
override def receive: PartialFunction[Any, Unit] = {
  case StopReceiver =>
    logInfo("Received stop signal")
    ReceiverSupervisorImpl.this.stop("Stopped by driver", None)
  case CleanupOldBlocks(threshTime) =>
    logDebug("Received delete old batch signal")
    cleanupOldBlocks(threshTime)
  case UpdateRateLimit(eps) =>
    logInfo(s"Received a new rate limit: $eps.")
    registeredBlockGenerators.foreach { bg =>
      bg.updateRate(eps)
    }
}
/**
 * Apply a new rate to `rateLimiter`.
 *
 * Non-positive rates are ignored. When `maxRateLimit` is positive, the applied rate is the
 * smaller of `newRate` and `maxRateLimit`; otherwise `newRate` is applied as is.
 */
private[receiver] def updateRate(newRate: Long): Unit = {
  if (newRate > 0) {
    // Limits the speed at which records flow in.
    val effectiveRate = if (maxRateLimit > 0) newRate.min(maxRateLimit) else newRate
    rateLimiter.setRate(effectiveRate)
  }
}
public final voidsetRate(doublepermitsPerSecond) {
Preconditions.checkArgument(permitsPerSecond > 0.0D && !Double.isNaN(permitsPerSecond), "rate must be positive");
Object var3 = this.mutex;
synchronized(this.mutex) {
this.resync(this.readSafeMicros());
double stableIntervalMicros = (double)TimeUnit.SECONDS.toMicros(1L) / permitsPerSecond;
this.stableIntervalMicros = stableIntervalMicros;
this.doSetRate(permitsPerSecond, stableIntervalMicros);
}
}
ReceiverSupervisor
/**
 * Mark the supervisor and the receiver for stopping.
 *
 * Records the error (or null when none), stops the receiver, invokes `onStop`, shuts down
 * `futureExecutionContext` and finally releases `stopLatch`.
 *
 * @param message reason for stopping, passed on to `stopReceiver` and `onStop`
 * @param error   optional error that caused the stop
 */
def stop(message: String, error: Option[Throwable]): Unit = {
  stoppingError = error.orNull
  stopReceiver(message, error)
  onStop(message, error)
  futureExecutionContext.shutdownNow()
  stopLatch.countDown()
}
/**
 * Stop receiver.
 *
 * Only a receiver in the `Started` state is actually stopped: its state becomes `Stopped`,
 * `receiver.onStop()` is called and then `onReceiverStop`. In the `Initialized` and
 * `Stopped` states a warning is logged and nothing else happens. Non-fatal exceptions are
 * logged and not rethrown.
 *
 * @param message reason for stopping, included in the log and passed to `onReceiverStop`
 * @param error   optional error that caused the stop
 */
def stopReceiver(message: String, error: Option[Throwable]): Unit = synchronized {
  try {
    logInfo("Stopping receiver with message: " + message + ": " + error.getOrElse(""))
    receiverState match {
      case Initialized =>
        logWarning("Skip stopping receiver because it has not yet started")
      case Started =>
        receiverState = Stopped
        receiver.onStop()
        logInfo("Called receiver onStop")
        onReceiverStop(message, error)
      case Stopped =>
        logWarning("Receiver has been stopped")
    }
  } catch {
    case NonFatal(t) =>
      // Pass the throwable to the logger rather than gluing its stack trace onto the
      // stream id with the deprecated getStackTraceString.
      logError(s"Error stopping receiver $streamId", t)
  }
}
子类实现
/**
 * Called when the receiver is stopped: sends a `DeregisterReceiver` message for this stream
 * to the tracker endpoint.
 *
 * @param message reason for stopping, forwarded in the message
 * @param error   optional error; its stack trace is forwarded as a string, or "" when absent
 */
override protected def onReceiverStop(message: String, error: Option[Throwable]): Unit = {
  logInfo("Deregistering receiver " + streamId)
  val errorString = error.map(Throwables.getStackTraceAsString).getOrElse("")
  trackerEndpoint.askWithRetry[Boolean](DeregisterReceiver(streamId, message, errorString))
  logInfo("Stopped receiver " + streamId)
}
- 第11课:Spark Streaming源码解读之Driver中的ReceiverTracker架构设计以及具体实现彻底研究
- 第11课:Spark Streaming源码解读之Driver中的ReceiverTracker架构设计以及具体实现彻底研究
- 第11课:Spark Streaming源码解读之Driver中的ReceiverTracker架构设计以及具体实现彻底研究
- 第11课:Spark Streaming源码解读之Driver中的ReceiverTracker架构设计以及具体实现彻底研究
- Spark Streaming源码解读之Driver中的ReceiverTracker架构设计以及具体实现彻底研究
- 11.Spark Streaming源码解读之Driver中的ReceiverTracker架构设计以及具体实现彻底研究
- Spark Streaming源码解读之Driver中的ReceiverTracker架构设计以及具体实现彻底研究
- 第11课:Spark Streaming 源码解读之Driver中ReceiverTracker架构设计及具体实现彻底研究
- Spark 定制版:011~Spark Streaming源码解读之Driver中的ReceiverTracker架构设计以及具体实现彻底研究
- Spark定制班第11课:Spark Streaming源码解读之Driver中的ReceiverTracker架构
- Spark Streaming源码解读之Driver中的ReceiverTracker详解
- 第11课:Spark Driver中的ReceiverTracker架构设计
- 11 Spark Streaming Driver中的ReceiverTracker架构
- 第9课:Spark Streaming源码解读之Receiver在Driver的精妙实现全生命周期彻底研究和思考
- 第9课:Spark Streaming源码解读之Receiver在Driver的精妙实现全生命周期彻底研究和思考
- 第9课:Spark Streaming源码解读之Receiver在Driver的精妙实现全生命周期彻底研究和思考
- 第9课:Spark Streaming源码解读之Receiver在Driver的精妙实现全生命周期彻底研究和思考
- 第9课:Spark Streaming源码解读之Receiver在Driver的精妙实现全生命周期彻底研究和思考
- binlog server搭建实验
- Tomcat管理平台
- 第10课:Spark Streaming源码解读之流数据不断接收全生命周期彻底研究和思考
- LaTeX新人教程,30分钟从完全陌生到基本入门
- ASP.Net原理
- 第11课:Spark Streaming源码解读之Driver中的ReceiverTracker架构设计以及具体实现彻底研究
- 百度之星 初赛2 瞬间转移 [杨辉三角]
- Arch LInux 系统迁移
- innodb_flush_log_at_trx_commit不同参数值下的性能测试
- 第12课:Spark Streaming源码解读之executor容错安全性
- listView 点击事件实现方法
- 动态规划:最长公共子串长度
- Hdu5696 区间的价值(花式水)
- HTTP协议