Spark Core 2.0: Partition and HadoopPartition


In Spark, Partition is a trait: it is the identifier for a single partition within an RDD.

/**
 * An identifier for a partition in an RDD.
 */
trait Partition extends Serializable {
  /**
   * Get the partition's index within its parent RDD
   */
  def index: Int

  // A better default implementation of HashCode
  override def hashCode(): Int = index

  override def equals(other: Any): Boolean = super.equals(other)
}
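Every RDD implementation returns its own Partition objects from getPartitions, and each task later receives one of those objects in compute. As a minimal sketch (RangePartition and RangeRDD are hypothetical names for illustration, not part of Spark), a custom RDD that splits a numeric range into slices could look like this:

import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD

// Hypothetical Partition implementation: besides the mandatory index,
// each partition remembers the sub-range [start, end) it is responsible for.
class RangePartition(override val index: Int, val start: Long, val end: Long)
  extends Partition

// Hypothetical RDD that materializes 0 until n, split into numSlices partitions.
class RangeRDD(sc: SparkContext, n: Long, numSlices: Int)
  extends RDD[Long](sc, Nil) {

  // Spark calls getPartitions once to learn how this RDD is partitioned.
  override protected def getPartitions: Array[Partition] =
    Array.tabulate[Partition](numSlices) { i =>
      new RangePartition(i, i * n / numSlices, (i + 1) * n / numSlices)
    }

  // Each task gets back one of the Partition objects created above.
  override def compute(split: Partition, context: TaskContext): Iterator[Long] = {
    val p = split.asInstanceOf[RangePartition]
    (p.start until p.end).iterator
  }
}

One concrete implementation inside Spark itself is HadoopPartition, used by HadoopRDD for data read through the Hadoop InputFormat API: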

/**
 * A Spark split class that wraps around a Hadoop InputSplit.
 */
private[spark] class HadoopPartition(rddId: Int, override val index: Int, s: InputSplit)
  extends Partition {

  val inputSplit = new SerializableWritable[InputSplit](s)

  override def hashCode(): Int = 31 * (31 + rddId) + index

  override def equals(other: Any): Boolean = super.equals(other)

  /**
   * Get any environment variables that should be added to the users environment when running pipes
   * @return a Map with the environment variables and corresponding values, it could be empty
   */
  def getPipeEnvVars(): Map[String, String] = {
    val envVars: Map[String, String] = if (inputSplit.value.isInstanceOf[FileSplit]) {
      val is: FileSplit = inputSplit.value.asInstanceOf[FileSplit]
      // map_input_file is deprecated in favor of mapreduce_map_input_file but set both
      // since it's not removed yet
      Map("map_input_file" -> is.getPath().toString(),
        "mapreduce_map_input_file" -> is.getPath().toString())
    } else {
      Map()
    }
    envVars
  }
}
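getPipeEnvVars() comes into play with RDD.pipe(): when the partition being piped is a HadoopPartition, PipedRDD copies these variables into the child process environment, mirroring the map_input_file / mapreduce_map_input_file variables that Hadoop streaming jobs see. A small sketch of observing this (the input path and shell command are made up for illustration, and the job is assumed to run in local mode):

import org.apache.spark.{SparkConf, SparkContext}

object PipeEnvDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("pipe-env-demo").setMaster("local[2]"))

    // textFile is backed by a HadoopRDD, so each partition is a HadoopPartition
    // and its FileSplit path is exposed to piped processes via getPipeEnvVars().
    val lines = sc.textFile("/tmp/input.txt")  // hypothetical input path

    // The shell command drains stdin, then prints the variable set from getPipeEnvVars(),
    // so each partition reports the file (and split) it was read from.
    val sources = lines.pipe(Seq("sh", "-c", "cat > /dev/null; echo $mapreduce_map_input_file"))
    sources.collect().foreach(println)

    sc.stop()
  }
}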

