spark学习-33-Spark的SerializerManager序列化管理器
来源:互联网 发布:二手玫瑰知乎 编辑:程序博客网 时间:2024/06/05 05:42
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.serializer

import java.io.{BufferedInputStream, BufferedOutputStream, InputStream, OutputStream}
import java.nio.ByteBuffer

import scala.reflect.ClassTag

import org.apache.spark.SparkConf
import org.apache.spark.io.CompressionCodec
import org.apache.spark.security.CryptoStreamUtils
import org.apache.spark.storage._
import org.apache.spark.util.io.{ChunkedByteBuffer, ChunkedByteBufferOutputStream}

/**
 * Component which configures serialization, compression and encryption for various Spark
 * components, including automatic selection of which [[Serializer]] to use for shuffles.
 *
 * @param defaultSerializer serializer used when Kryo auto-pick does not apply
 * @param conf Spark configuration (compression flags are read eagerly, codec lazily)
 * @param encryptionKey optional key; when defined, shuffle/spill streams are encrypted
 */
private[spark] class SerializerManager(
    defaultSerializer: Serializer,
    conf: SparkConf,
    encryptionKey: Option[Array[Byte]]) {

  def this(defaultSerializer: Serializer, conf: SparkConf) = this(defaultSerializer, conf, None)

  private[this] val kryoSerializer = new KryoSerializer(conf)

  private[this] val stringClassTag: ClassTag[String] = implicitly[ClassTag[String]]

  // ClassTags for which Kryo is known to be safe and fast: all JVM primitives,
  // their boxed-array forms (via .wrap), and Null.
  private[this] val primitiveAndPrimitiveArrayClassTags: Set[ClassTag[_]] = {
    val primitiveClassTags = Set[ClassTag[_]](
      ClassTag.Boolean,
      ClassTag.Byte,
      ClassTag.Char,
      ClassTag.Double,
      ClassTag.Float,
      ClassTag.Int,
      ClassTag.Long,
      ClassTag.Null,
      ClassTag.Short
    )
    val arrayClassTags = primitiveClassTags.map(_.wrap)
    primitiveClassTags ++ arrayClassTags
  }

  // Whether to compress broadcast variables that are stored
  private[this] val compressBroadcast = conf.getBoolean("spark.broadcast.compress", true)
  // Whether to compress shuffle output that are stored
  private[this] val compressShuffle = conf.getBoolean("spark.shuffle.compress", true)
  // Whether to compress RDD partitions that are stored serialized
  private[this] val compressRdds = conf.getBoolean("spark.rdd.compress", false)
  // Whether to compress shuffle output temporarily spilled to disk
  private[this] val compressShuffleSpill = conf.getBoolean("spark.shuffle.spill.compress", true)

  /* The compression codec to use. Note that the "lazy" val is necessary because we want to delay
   * the initialization of the compression codec until it is first used. The reason is that a Spark
   * program could be using a user-defined codec in a third party jar, which is loaded in
   * Executor.updateDependencies. When the BlockManager is initialized, user level jars hasn't been
   * loaded yet. */
  private lazy val compressionCodec: CompressionCodec = CompressionCodec.createCodec(conf)

  def encryptionEnabled: Boolean = encryptionKey.isDefined

  /** Whether Kryo can safely serialize values of this ClassTag (primitives, their arrays, String). */
  def canUseKryo(ct: ClassTag[_]): Boolean = {
    primitiveAndPrimitiveArrayClassTags.contains(ct) || ct == stringClassTag
  }

  // SPARK-18617: As feature in SPARK-13990 can not be applied to Spark Streaming now. The worst
  // result is streaming job based on `Receiver` mode can not run on Spark 2.x properly. It may be
  // a rational choice to close `kryo auto pick` feature for streaming in the first step.
  def getSerializer(ct: ClassTag[_], autoPick: Boolean): Serializer = {
    if (autoPick && canUseKryo(ct)) {
      kryoSerializer
    } else {
      defaultSerializer
    }
  }

  /**
   * Pick the best serializer for shuffling an RDD of key-value pairs.
   * Kryo is used only when both key and value types are known Kryo-safe.
   */
  def getSerializer(keyClassTag: ClassTag[_], valueClassTag: ClassTag[_]): Serializer = {
    if (canUseKryo(keyClassTag) && canUseKryo(valueClassTag)) {
      kryoSerializer
    } else {
      defaultSerializer
    }
  }

  /** Whether blocks of this type should be compressed, per the corresponding config flag. */
  private def shouldCompress(blockId: BlockId): Boolean = {
    blockId match {
      case _: ShuffleBlockId => compressShuffle
      case _: BroadcastBlockId => compressBroadcast
      case _: RDDBlockId => compressRdds
      case _: TempLocalBlockId => compressShuffleSpill
      case _: TempShuffleBlockId => compressShuffle
      case _ => false
    }
  }

  /**
   * Wrap an input stream for encryption and compression.
   * Decryption is applied first (innermost), then decompression — the reverse of the
   * output-side wrapping order, so the layers unwind correctly.
   */
  def wrapStream(blockId: BlockId, s: InputStream): InputStream = {
    wrapForCompression(blockId, wrapForEncryption(s))
  }

  /** Wrap an output stream for encryption and compression. */
  def wrapStream(blockId: BlockId, s: OutputStream): OutputStream = {
    wrapForCompression(blockId, wrapForEncryption(s))
  }

  /** Wrap an input stream for encryption if shuffle encryption is enabled. */
  def wrapForEncryption(s: InputStream): InputStream = {
    encryptionKey
      .map { key => CryptoStreamUtils.createCryptoInputStream(s, conf, key) }
      .getOrElse(s)
  }

  /** Wrap an output stream for encryption if shuffle encryption is enabled. */
  def wrapForEncryption(s: OutputStream): OutputStream = {
    encryptionKey
      .map { key => CryptoStreamUtils.createCryptoOutputStream(s, conf, key) }
      .getOrElse(s)
  }

  /** Wrap an output stream for compression if block compression is enabled for its block type. */
  def wrapForCompression(blockId: BlockId, s: OutputStream): OutputStream = {
    if (shouldCompress(blockId)) compressionCodec.compressedOutputStream(s) else s
  }

  /** Wrap an input stream for compression if block compression is enabled for its block type. */
  def wrapForCompression(blockId: BlockId, s: InputStream): InputStream = {
    if (shouldCompress(blockId)) compressionCodec.compressedInputStream(s) else s
  }

  /**
   * Serializes into a stream.
   *
   * The output is buffered, optionally compressed (per block type), and serialized with the
   * serializer chosen for T's ClassTag. Kryo auto-pick is disabled for StreamBlockIds
   * (see SPARK-18617 note above).
   */
  def dataSerializeStream[T: ClassTag](
      blockId: BlockId,
      outputStream: OutputStream,
      values: Iterator[T]): Unit = {
    val byteStream = new BufferedOutputStream(outputStream)
    val autoPick = !blockId.isInstanceOf[StreamBlockId]
    val ser = getSerializer(implicitly[ClassTag[T]], autoPick).newInstance()
    ser.serializeStream(wrapForCompression(blockId, byteStream)).writeAll(values).close()
  }

  /** Serializes into a chunked byte buffer. */
  def dataSerialize[T: ClassTag](
      blockId: BlockId,
      values: Iterator[T]): ChunkedByteBuffer = {
    dataSerializeWithExplicitClassTag(blockId, values, implicitly[ClassTag[T]])
  }

  /** Serializes into a chunked byte buffer, using an explicitly supplied ClassTag. */
  def dataSerializeWithExplicitClassTag(
      blockId: BlockId,
      values: Iterator[_],
      classTag: ClassTag[_]): ChunkedByteBuffer = {
    // 4 MB chunks; on-heap ByteBuffers.
    val bbos = new ChunkedByteBufferOutputStream(1024 * 1024 * 4, ByteBuffer.allocate)
    val byteStream = new BufferedOutputStream(bbos)
    val autoPick = !blockId.isInstanceOf[StreamBlockId]
    val ser = getSerializer(classTag, autoPick).newInstance()
    ser.serializeStream(wrapForCompression(blockId, byteStream)).writeAll(values).close()
    bbos.toChunkedByteBuffer
  }

  /**
   * Deserializes an InputStream into an iterator of values and disposes of it when the end of
   * the iterator is reached.
   */
  def dataDeserializeStream[T](
      blockId: BlockId,
      inputStream: InputStream)
      (classTag: ClassTag[T]): Iterator[T] = {
    val stream = new BufferedInputStream(inputStream)
    val autoPick = !blockId.isInstanceOf[StreamBlockId]
    getSerializer(classTag, autoPick)
      .newInstance()
      // FIX: the original passed the raw `inputStream` here, leaving the
      // BufferedInputStream unused and reading the underlying stream unbuffered.
      .deserializeStream(wrapForCompression(blockId, stream))
      .asIterator.asInstanceOf[Iterator[T]]
  }
}
阅读全文
0 0
- spark学习-33-Spark的SerializerManager序列化管理器
- spark学习-34-Spark的BroadcastManager广播管理器
- spark的集群管理器
- spark 未序列化
- Spark 中的序列化
- spark序列化问题解决
- spark序列化溢出
- Spark的Kryo序列化注册
- spark学习-33-Spark的RPC通信源码分析
- Spark中的序列化机制
- spark Task序列化问题
- Spark之 KryoSerializer序列化
- spark未序列化问题
- spark中的序列化器
- spark学习十五 spark的容错分析
- spark-02-学习spark需要的阶段
- 关于Spark和Spark的学习资料
- spark学习-18-Spark的Core理解
- Android Studio中ButterKnife的使用
- 用房地产数据可视化分析软件实现智慧人居
- LIS--最长不下降子序列
- Error:Failed to open zip file.Gradle's dependency cache may be corrupt 【Mac系统 AS】
- java连接HDFS+Kerberos配置参数示例
- spark学习-33-Spark的SerializerManager序列化管理器
- 算法—直接插入排序
- Eclipse打开报错:failed to load the jni shared library
- 关于助力砍价及微信公众号评论刷点赞及精选文章留言评论区点赞刷赞方法
- Oracle表空间管理
- leetcode: 71. Simplify Path
- python糗事百科爬虫
- EditPlus格式化XML
- 选择小程序的8大理由,让你拒绝说No