Spark Streaming 2: ParallelCollectionRDD

来源:互联网 发布:2016nba总决赛数据 编辑:程序博客网 时间:2024/05/27 10:43


private object ParallelCollectionRDD {
  /**
   * Slice a collection into numSlices sub-collections. One extra thing we do here is to treat
   * Range collections specially, encoding the slices as other Ranges to minimize memory cost.
   * This makes it efficient to run over collections representing large sets of numbers. And if
   * the collection is an inclusive Range, the last slice keeps the original inclusive end.
   *
   * Slice boundaries depend only on the collection length and `numSlices`, so two collections
   * of equal length are always cut at the same index positions.
   *
   * @param seq the collection to split
   * @param numSlices number of sub-collections to produce; must be at least 1
   * @tparam T element type (a ClassTag is needed to copy generic sequences into an array)
   * @return `numSlices` sub-sequences, in order; some may be empty when the collection has
   *         fewer elements than `numSlices`
   * @throws IllegalArgumentException if `numSlices` is less than 1
   */
  def slice[T: ClassTag](seq: Seq[T], numSlices: Int): Seq[Seq[T]] = {
    if (numSlices < 1) {
      throw new IllegalArgumentException("Positive number of slices required")
    }
    // Sequences need to be sliced at the same set of index positions for operations
    // like RDD.zip() to behave as expected
    def positions(length: Long, numSlices: Int): Iterator[(Int, Int)] = {
      (0 until numSlices).iterator.map { i =>
        // `length` is a Long, so the multiplications below are done in 64-bit arithmetic.
        val start = ((i * length) / numSlices).toInt
        val end = (((i + 1) * length) / numSlices).toInt
        (start, end)
      }
    }
    seq match {
      case r: Range =>
        positions(r.length, numSlices).zipWithIndex.map { case ((start, end), index) =>
          val first = r.start + start * r.step
          if (r.isInclusive && index == numSlices - 1) {
            // If the range is inclusive, use inclusive range for the last slice
            Range.inclusive(first, r.end, r.step)
          } else if (start == end) {
            // Empty slice: identical bounds give an empty exclusive range no matter what
            // value `first` has (it may have wrapped when start == r.length).
            Range(first, first, r.step)
          } else {
            // Bound the slice by its last real element rather than by the one-past-the-end
            // value: `r.start + end * r.step` can overflow Int even though every element
            // of the range fits, which used to silently produce an empty slice.
            Range.inclusive(first, r.start + (end - 1) * r.step, r.step)
          }
        }.toSeq.asInstanceOf[Seq[Seq[T]]]
      case nr: NumericRange[_] =>
        // For ranges of Long, Double, BigInteger, etc
        val slices = new ArrayBuffer[Seq[T]](numSlices)
        var r = nr
        for ((start, end) <- positions(nr.length, numSlices)) {
          val sliceSize = end - start
          slices += r.take(sliceSize).asInstanceOf[Seq[T]]
          r = r.drop(sliceSize)
        }
        // toSeq keeps this compiling where scala.Seq is immutable.Seq (Scala 2.13+).
        slices.toSeq
      case _ =>
        val array = seq.toArray // To prevent O(n^2) operations for List etc
        positions(array.length, numSlices).map { case (start, end) =>
          array.slice(start, end).toSeq
        }.toSeq
    }
  }
}




0 0
原创粉丝点击