Spark 之 RDD API 大全

来源:互联网 发布:minitab比较多组数据 编辑:程序博客网 时间:2024/05/21 18:40
package scala

import org.apache.spark.{SparkConf, SparkContext}

/**
  * A catalogue of small, independent examples of the Spark RDD API.
  *
  * Every `demo*` method builds its own tiny RDDs from the shared [[SparkAPI.sc]] context and
  * invokes one family of RDD operations. The `// resN: ...` comments record the value that the
  * preceding expression produced when it was evaluated in a Spark shell; the values are
  * discarded here, so the methods are meant to be read (or stepped through), not asserted on.
  * Results of sampling / approximate operations and the ordering of shuffled output may differ
  * between Spark versions.
  *
  * The object defines an explicit `main` instead of extending `scala.App`: Spark's documentation
  * warns that subclasses of `scala.App` may not work correctly, and with `App` the `conf` / `sc`
  * fields stay uninitialised until `main` has run.
  */
object SparkAPI {

  /** Spark configuration: application name "SparkTransformationTest", master "local". */
  lazy val conf: SparkConf =
    new SparkConf().setAppName("SparkTransformationTest").setMaster("local")

  /** The context shared by all demos; created on first use. */
  lazy val sc: SparkContext = new SparkContext(conf)

  /**
    * Entry point. Starts the Spark context and shuts it down again on exit.
    *
    * No demo is run automatically; call the demo methods of interest inside the `try` block.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val context = sc
    try {
      // e.g. demoAggregate(); demo_join()
    } finally {
      context.stop()
    }
  }

  /**
    * Renders every element of a partition as `[partID:<index>, val: <element>]`.
    *
    * Used with `mapPartitionsWithIndex` to show how the elements of an RDD are spread
    * over its partitions.
    *
    * @param index the partition index
    * @param iter  the elements of that partition
    * @return one formatted string per element, in partition order
    */
  private def describePartition[T](index: Int, iter: Iterator[T]): Iterator[String] =
    iter.map(x => s"[partID:$index, val: $x]")

  /**
    * aggregate
    *
    * The zero value is applied once per partition and once more when the partition
    * results are combined, which is why a non-neutral zero value shows up several times.
    */
  def demoAggregate(): Unit = {
    val z = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)

    z.mapPartitionsWithIndex(describePartition[Int]).collect()
    // res28: Array[String] = Array([partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3],
    //                              [partID:1, val: 4], [partID:1, val: 5], [partID:1, val: 6])

    z.aggregate(0)(math.max(_, _), _ + _)
    // res40: Int = 9

    z.aggregate(5)(math.max(_, _), _ + _)
    // res29: Int = 16

    val z2 = sc.parallelize(List("a", "b", "c", "d", "e", "f"), 2)

    z2.mapPartitionsWithIndex(describePartition[String]).collect()
    // res31: Array[String] = Array([partID:0, val: a], [partID:0, val: b], [partID:0, val: c],
    //                              [partID:1, val: d], [partID:1, val: e], [partID:1, val: f])

    z2.aggregate("")(_ + _, _ + _)
    // res115: String = abcdef

    z2.aggregate("x")(_ + _, _ + _)
    // res116: String = xxdefxabc

    val z3 = sc.parallelize(List("12", "23", "345", "4567"), 2)
    z3.aggregate("")((x, y) => math.max(x.length, y.length).toString, (x, y) => x + y)
    // res141: String = 42

    z3.aggregate("")((x, y) => math.min(x.length, y.length).toString, (x, y) => x + y)
    // res142: String = 11

    val z4 = sc.parallelize(List("12", "23", "345", ""), 2)
    z4.aggregate("")((x, y) => math.min(x.length, y.length).toString, (x, y) => x + y)
    // res143: String = 10
  }

  /**
    * aggregateByKey [Pair]
    */
  def demoAggregateByKey(): Unit = {
    val pairRDD = sc.parallelize(
      List(("cat", 2), ("cat", 5), ("mouse", 4), ("cat", 12), ("dog", 12), ("mouse", 2)), 2)

    pairRDD.mapPartitionsWithIndex(describePartition[(String, Int)]).collect()
    // res2: Array[String] = Array([partID:0, val: (cat,2)], [partID:0, val: (cat,5)],
    //                             [partID:0, val: (mouse,4)], [partID:1, val: (cat,12)],
    //                             [partID:1, val: (dog,12)], [partID:1, val: (mouse,2)])

    pairRDD.aggregateByKey(0)(math.max(_, _), _ + _).collect()
    // res3: Array[(String, Int)] = Array((dog,12), (cat,17), (mouse,6))

    pairRDD.aggregateByKey(100)(math.max(_, _), _ + _).collect()
    // res4: Array[(String, Int)] = Array((dog,100), (cat,200), (mouse,200))
  }

  /**
    * cartesian
    */
  def demo_cartesian(): Unit = {
    val x = sc.parallelize(List(1, 2, 3, 4, 5))
    val y = sc.parallelize(List(6, 7, 8, 9, 10))
    x.cartesian(y).collect()
    // res0: Array[(Int, Int)] = Array((1,6), (1,7), (1,8), (1,9), (1,10), (2,6), (2,7), (2,8),
    //   (2,9), (2,10), (3,6), (3,7), (3,8), (3,9), (3,10), (4,6), (5,6), (4,7), (5,7), (4,8),
    //   (5,8), (4,9), (4,10), (5,9), (5,10))
  }

  /**
    * checkpoint
    */
  def demo_checkpoint(): Unit = {
    sc.setCheckpointDir("/tmp")
    val a = sc.parallelize(1 to 4)
    // checkpoint() returns Unit, so there is nothing meaningful to print for it;
    // it only marks the RDD, and the following action triggers the computation.
    a.checkpoint()
    println(a.count())
  }

  /**
    * coalesce, repartition
    */
  def demo_coalesce(): Unit = {
    val y = sc.parallelize(1 to 10, 10)
    val z = y.coalesce(2, shuffle = false)
    z.partitions.length
    // res9: Int = 2
  }

  /**
    * cogroup [Pair], groupWith [Pair]
    */
  def demo_cogroup(): Unit = {
    val a = sc.parallelize(List(1, 2, 1, 3), 1)
    val b = a.map((_, "b"))
    val c = a.map((_, "c"))
    b.cogroup(c).collect()
    // res7: Array[(Int, (Iterable[String], Iterable[String]))] = Array(
    //   (2,(ArrayBuffer(b),ArrayBuffer(c))),
    //   (3,(ArrayBuffer(b),ArrayBuffer(c))),
    //   (1,(ArrayBuffer(b, b),ArrayBuffer(c, c)))
    // )

    val d = a.map((_, "d"))
    b.cogroup(c, d).collect()
    // res9: Array[(Int, (Iterable[String], Iterable[String], Iterable[String]))] = Array(
    //   (2,(ArrayBuffer(b),ArrayBuffer(c),ArrayBuffer(d))),
    //   (3,(ArrayBuffer(b),ArrayBuffer(c),ArrayBuffer(d))),
    //   (1,(ArrayBuffer(b, b),ArrayBuffer(c, c),ArrayBuffer(d, d)))
    // )

    val x = sc.parallelize(List((1, "apple"), (2, "banana"), (3, "orange"), (4, "kiwi")), 2)
    val y = sc.parallelize(List((5, "computer"), (1, "laptop"), (1, "desktop"), (4, "iPad")), 2)
    x.cogroup(y).collect()
    // res23: Array[(Int, (Iterable[String], Iterable[String]))] = Array(
    //   (4,(ArrayBuffer(kiwi),ArrayBuffer(iPad))),
    //   (2,(ArrayBuffer(banana),ArrayBuffer())),
    //   (3,(ArrayBuffer(orange),ArrayBuffer())),
    //   (1,(ArrayBuffer(apple),ArrayBuffer(laptop, desktop))),
    //   (5,(ArrayBuffer(),ArrayBuffer(computer))))
  }

  /**
    * collect, toArray
    */
  def demo_collect(): Unit = {
    val c = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"), 2)
    c.collect()
    // res29: Array[String] = Array(Gnu, Cat, Rat, Dog, Gnu, Rat)
  }

  /**
    * collectAsMap [Pair]
    */
  def demo_collectAsMap(): Unit = {
    val a = sc.parallelize(List(1, 2, 1, 3), 1)
    val b = a.zip(a)
    b.collectAsMap()
    // res1: scala.collection.Map[Int,Int] = Map(2 -> 2, 1 -> 1, 3 -> 3)
  }

  /**
    * combineByKey [Pair]
    */
  def demo_combineByKey(): Unit = {
    val a = sc.parallelize(
      List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
    val b = sc.parallelize(List(1, 1, 2, 2, 2, 1, 2, 2, 2), 3)
    val c = b.zip(a)
    val d = c.combineByKey(
      List(_),
      (x: List[String], y: String) => y :: x,
      (x: List[String], y: List[String]) => x ::: y)
    d.collect()
    // res16: Array[(Int, List[String])] =
    //   Array((1,List(cat, dog, turkey)), (2,List(gnu, rabbit, salmon, bee, bear, wolf)))
  }

  /**
    * countApproxDistinct
    */
  def demo_countApproxDistinct(): Unit = {
    val a = sc.parallelize(1 to 10000, 20)
    val b = a ++ a ++ a ++ a ++ a
    b.countApproxDistinct(0.1)
    // res14: Long = 8224
    b.countApproxDistinct(0.05)
    // res15: Long = 9750
    b.countApproxDistinct(0.01)
    // res16: Long = 9947
    b.countApproxDistinct(0.001)
    // res0: Long = 10000
  }

  /**
    * countApproxDistinctByKey [Pair]
    */
  def demo_countApproxDistinctByKey(): Unit = {
    val a = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog"), 2)
    val b = sc.parallelize(a.takeSample(withReplacement = true, num = 10000, seed = 0), 20)
    val c = sc.parallelize(1 to b.count().toInt, 20)
    val d = b.zip(c)
    d.countApproxDistinctByKey(0.1).collect()
    // res15: Array[(String, Long)] = Array((Rat,2567), (Cat,3357), (Dog,2414), (Gnu,2494))
    d.countApproxDistinctByKey(0.01).collect()
    // res16: Array[(String, Long)] = Array((Rat,2555), (Cat,2455), (Dog,2425), (Gnu,2513))
    d.countApproxDistinctByKey(0.001).collect()
    // res0: Array[(String, Long)] = Array((Rat,2562), (Cat,2464), (Dog,2451), (Gnu,2521))
  }

  /**
    * countByKey [Pair]
    */
  def demo_countByKey(): Unit = {
    val c = sc.parallelize(List((3, "Gnu"), (3, "Yak"), (5, "Mouse"), (3, "Dog")), 2)
    c.countByKey()
    // res3: scala.collection.Map[Int,Long] = Map(3 -> 3, 5 -> 1)
  }

  /**
    * countByValue
    */
  def demo_countByValue(): Unit = {
    val b = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 2, 4, 2, 1, 1, 1, 1, 1))
    b.countByValue()
    // res27: scala.collection.Map[Int,Long] =
    //   Map(5 -> 1, 8 -> 1, 3 -> 1, 6 -> 1, 1 -> 6, 2 -> 3, 4 -> 2, 7 -> 1)
  }

  /**
    * distinct
    */
  def demo_distinct(): Unit = {
    val c = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"), 2)
    c.distinct().collect()
    // res6: Array[String] = Array(Dog, Gnu, Cat, Rat)

    val a = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    a.distinct(2).partitions.length
    // res16: Int = 2
    a.distinct(3).partitions.length
    // res17: Int = 3
  }

  /**
    * filter (plus the partial-function overload of collect, which filters and maps in one step)
    */
  def demo_filter(): Unit = {
    val a1 = sc.parallelize(1 to 10, 3)
    val b1 = a1.filter(_ % 2 == 0)
    b1.collect()
    // res3: Array[Int] = Array(2, 4, 6, 8, 10)

    //---------------------------------------------------------------------
    // The element type is deliberately Any: the list mixes strings, doubles and an int.
    val a = sc.parallelize(List[Any]("cat", "horse", 4.0, 3.5, 2, "dog"))
    a.collect({
      case _: Int => "is integer"
      case _: String => "is string"
    }).collect()
    // res17: Array[String] = Array(is string, is string, is integer, is string)

    val myfunc: PartialFunction[Any, Any] = {
      case _: Int => "is integer"
      case _: String => "is string"
    }
    myfunc.isDefinedAt("")
    // res21: Boolean = true
    myfunc.isDefinedAt(1)
    // res22: Boolean = true
    myfunc.isDefinedAt(1.5)
    // res23: Boolean = false
  }

  /**
    * filterByRange [Ordered]
    */
  def demo_filterByRange(): Unit = {
    val randRDD = sc.parallelize(
      List((2, "cat"), (6, "mouse"), (7, "cup"), (3, "book"), (4, "tv"), (1, "screen"),
        (5, "heater")), 3)
    val sortedRDD = randRDD.sortByKey()
    sortedRDD.filterByRange(1, 3).collect().foreach { case (key, value) => println(s"$key:$value") }
    // res66: Array[(Int, String)] = Array((1,screen), (2,cat), (3,book))
  }

  /**
    * flatMap
    */
  def demo_flatMap(): Unit = {
    val a = sc.parallelize(1 to 10, 5)
    a.flatMap(1 to _).collect()
    // res47: Array[Int] = Array(1, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6,
    //   1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9,
    //   1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

    sc.parallelize(List(1, 2, 3), 2).flatMap(x => List(x, x, x)).collect()
    // res85: Array[Int] = Array(1, 1, 1, 2, 2, 2, 3, 3, 3)

    // Each element is repeated a random number of times (0 to 9), so the output varies per run.
    val x = sc.parallelize(1 to 10, 3)
    x.flatMap(List.fill(scala.util.Random.nextInt(10))(_)).collect()
    // res1: Array[Int] = Array(1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6,
    //   6, 6, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10)
  }

  /**
    * flatMapValues [Pair]
    *
    * The mapped value is a String, which is treated as a sequence of Char and flattened.
    */
  def demo_flatMapValues(): Unit = {
    val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val b = a.map(x => (x.length, x))
    b.flatMapValues("x" + _ + "x").collect()
    // res6: Array[(Int, Char)] = Array((3,x), (3,d), (3,o), (3,g), (3,x), (5,x), (5,t), (5,i),
    //   (5,g), (5,e), (5,r), (5,x), (4,x), (4,l), (4,i), (4,o), (4,n), (4,x), (3,x), (3,c),
    //   (3,a), (3,t), (3,x), (7,x), (7,p), (7,a), (7,n), (7,t), (7,h), (7,e), (7,r), (7,x),
    //   (5,x), (5,e), (5,a), (5,g), (5,l), (5,e), (5,x))
  }

  /**
    * fold
    * def fold(zeroValue: T)(op: (T, T) => T): T
    */
  def demo_fold(): Unit = {
    val a = sc.parallelize(List(1, 2, 3), 3)
    a.fold(0)(_ + _)
    // res59: Int = 6
  }

  /**
    * foldByKey [Pair]
    * def foldByKey(zeroValue: V)(func: (V, V) => V): RDD[(K, V)]
    * def foldByKey(zeroValue: V, numPartitions: Int)(func: (V, V) => V): RDD[(K, V)]
    * def foldByKey(zeroValue: V, partitioner: Partitioner)(func: (V, V) => V): RDD[(K, V)]
    */
  def demo_foldByKey(): Unit = {
    val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
    val b = a.map(x => (x.length, x))
    b.foldByKey("")(_ + _).collect()
    // res84: Array[(Int, String)] = Array((3,dogcatowlgnuant))

    val a2 = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val b2 = a2.map(x => (x.length, x))
    b2.foldByKey("")(_ + _).collect()
    // res85: Array[(Int, String)] = Array((4,lion), (3,dogcat), (7,panther), (5,tigereagle))
  }

  /**
    * foreachPartition
    * def foreachPartition(f: Iterator[T] => Unit)
    */
  def demo_foreachPartition(): Unit = {
    val b = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9), 3)
    b.foreachPartition(x => println(x.reduce(_ + _)))
    // 6
    // 15
    // 24
  }

  /**
    * fullOuterJoin [Pair]
    * def fullOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Option[V], Option[W]))]
    * def fullOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (Option[V], Option[W]))]
    * def fullOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (Option[V], Option[W]))]
    */
  def demo_fullOuterJoin(): Unit = {
    val pairRDD1 = sc.parallelize(List(("cat", 2), ("cat", 5), ("book", 4), ("cat", 12)))
    val pairRDD2 = sc.parallelize(List(("cat", 2), ("cup", 5), ("mouse", 4), ("cat", 12)))
    pairRDD1.fullOuterJoin(pairRDD2).collect()
    // res5: Array[(String, (Option[Int], Option[Int]))] = Array((book,(Some(4),None)),
    //   (mouse,(None,Some(4))), (cup,(None,Some(5))), (cat,(Some(2),Some(2))),
    //   (cat,(Some(2),Some(12))), (cat,(Some(5),Some(2))), (cat,(Some(5),Some(12))),
    //   (cat,(Some(12),Some(2))), (cat,(Some(12),Some(12))))
  }

  /**
    * groupBy
    *
    * def groupBy[K: ClassTag](f: T => K): RDD[(K, Iterable[T])]
    * def groupBy[K: ClassTag](f: T => K, numPartitions: Int): RDD[(K, Iterable[T])]
    * def groupBy[K: ClassTag](f: T => K, p: Partitioner): RDD[(K, Iterable[T])]
    */
  def demo_groupBy(): Unit = {
    val a = sc.parallelize(1 to 9, 3)
    a.groupBy(x => if (x % 2 == 0) "even" else "odd").collect()
    // res42: Array[(String, Seq[Int])] =
    //   Array((even,ArrayBuffer(2, 4, 6, 8)), (odd,ArrayBuffer(1, 3, 5, 7, 9)))

    def myfunc(aa: Int): Int = aa % 2

    a.groupBy(myfunc).collect()
    // res3: Array[(Int, Seq[Int])] =
    //   Array((0,ArrayBuffer(2, 4, 6, 8)), (1,ArrayBuffer(1, 3, 5, 7, 9)))

    // a.groupBy(x => myfunc(x), 3).collect
    a.groupBy(myfunc(_), 1).collect()
    // res7: Array[(Int, Seq[Int])] =
    //   Array((0,ArrayBuffer(2, 4, 6, 8)), (1,ArrayBuffer(1, 3, 5, 7, 9)))
  }

  /**
    * groupByKey [Pair]
    * def groupByKey(): RDD[(K, Iterable[V])]
    * def groupByKey(numPartitions: Int): RDD[(K, Iterable[V])]
    * def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])]
    */
  def demo_groupByKey(): Unit = {
    val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "spider", "eagle"), 2)
    val b = a.keyBy(_.length)
    b.groupByKey().collect()
    // res11: Array[(Int, Seq[String])] = Array((4,ArrayBuffer(lion)), (6,ArrayBuffer(spider)),
    //   (3,ArrayBuffer(dog, cat)), (5,ArrayBuffer(tiger, eagle)))
  }

  /**
    * histogram [Double]
    * def histogram(bucketCount: Int): (Array[Double], Array[Long])
    * def histogram(buckets: Array[Double], evenBuckets: Boolean = false): Array[Long]
    */
  def demo_histogram(): Unit = {
    val a1 = sc.parallelize(List(1.1, 1.2, 1.3, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 9.0), 3)
    a1.histogram(5)
    // res11: (Array[Double], Array[Long]) =
    //   (Array(1.1, 2.68, 4.26, 5.84, 7.42, 9.0),Array(5, 0, 0, 1, 4))

    val a2 = sc.parallelize(
      List(9.1, 1.0, 1.2, 2.1, 1.3, 5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
    a2.histogram(6)
    // res18: (Array[Double], Array[Long]) =
    //   (Array(1.0, 2.5, 4.0, 5.5, 7.0, 8.5, 10.0),Array(6, 0, 1, 1, 3, 4))

    val b1 = sc.parallelize(List(1.1, 1.2, 1.3, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 9.0), 3)
    b1.histogram(Array(0.0, 3.0, 8.0))
    // res14: Array[Long] = Array(5, 3)

    val b2 = sc.parallelize(
      List(9.1, 1.0, 1.2, 2.1, 1.3, 5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
    b2.histogram(Array(0.0, 5.0, 10.0))
    // res1: Array[Long] = Array(6, 9)
    b2.histogram(Array(0.0, 5.0, 10.0, 15.0))
    // res1: Array[Long] = Array(6, 8, 1)
  }

  /**
    * intersection
    */
  def demo_intersection(): Unit = {
    val x = sc.parallelize(1 to 20)
    val y = sc.parallelize(10 to 30)
    val z = x.intersection(y)
    z.collect()
    // res74: Array[Int] = Array(16, 12, 20, 13, 17, 14, 18, 10, 19, 15, 11)
  }

  /**
    * join [Pair]
    */
  def demo_join(): Unit = {
    val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
    val b = a.keyBy(_.length)
    val c = sc.parallelize(
      List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
    val d = c.keyBy(_.length)
    b.join(d).collect()
    // res0: Array[(Int, (String, String))] = Array((6,(salmon,salmon)), (6,(salmon,rabbit)),
    //   (6,(salmon,turkey)), (6,(salmon,salmon)), (6,(salmon,rabbit)), (6,(salmon,turkey)),
    //   (3,(dog,dog)), (3,(dog,cat)), (3,(dog,gnu)), (3,(dog,bee)), (3,(rat,dog)),
    //   (3,(rat,cat)), (3,(rat,gnu)), (3,(rat,bee)))
  }

  /**
    * lookup [Pair]
    */
  def demo_lookup(): Unit = {
    val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val b = a.map(x => (x.length, x))
    b.lookup(5)
    // res0: Seq[String] = WrappedArray(tiger, eagle)
  }

  /**
    * map
    */
  def demo_map(): Unit = {
    val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
    val b = a.map(_.length)
    val c = a.zip(b)
    c.collect()
    // res0: Array[(String, Int)] = Array((dog,3), (salmon,6), (salmon,6), (rat,3), (elephant,8))
  }

  /**
    * mapPartitions
    */
  def demo_mapPartitions(): Unit = {
    val a = sc.parallelize(1 to 9, 3)

    // Pairs every element with its successor inside one partition. The pairs are emitted
    // last-to-first (as in the recorded result below). Partitions with fewer than two
    // elements, including empty ones, yield nothing instead of failing on `next()`.
    def myfunc[T](iter: Iterator[T]): Iterator[(T, T)] =
      iter.sliding(2).collect { case Seq(pre, cur) => (pre, cur) }.toList.reverse.iterator

    a.mapPartitions(myfunc[Int]).collect()
    // res0: Array[(Int, Int)] = Array((2,3), (1,2), (5,6), (4,5), (8,9), (7,8))
  }

  /**
    * mapPartitionsWithIndex
    */
  def demo_mapPartitionsWithIndex(): Unit = {
    val x = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3)

    // Prefixes each element with the index of the partition that holds it.
    def myfunc(index: Int, iter: Iterator[Int]): Iterator[String] =
      iter.map(elem => s"$index,$elem")

    x.mapPartitionsWithIndex(myfunc).collect()
    // res10: Array[String] = Array(0,1, 0,2, 0,3, 1,4, 1,5, 1,6, 2,7, 2,8, 2,9, 2,10)
  }

  /**
    * mapValues [Pair]
    */
  def demo_mapValues(): Unit = {
    val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val b = a.map(x => (x.length, x))
    b.mapValues("x" + _ + "x").collect()
    // res5: Array[(Int, String)] = Array((3,xdogx), (5,xtigerx), (4,xlionx), (3,xcatx),
    //   (7,xpantherx), (5,xeaglex))
  }

  /**
    * max
    */
  def demo_max(): Unit = {
    val y = sc.parallelize(10 to 30)
    y.max()
    // res75: Int = 30

    val a = sc.parallelize(List((10, "dog"), (3, "tiger"), (9, "lion"), (18, "cat")))
    a.max()
    // res6: (Int, String) = (18,cat)
  }

  /**
    * min
    */
  def demo_min(): Unit = {
    val y = sc.parallelize(10 to 30)
    y.min()
    // res75: Int = 10

    val a = sc.parallelize(List((10, "dog"), (3, "tiger"), (9, "lion"), (8, "cat")))
    a.min()
    // res4: (Int, String) = (3,tiger)
  }

  /**
    * mean [Double], meanApprox [Double]
    */
  def demo_mean(): Unit = {
    val a = sc.parallelize(
      List(9.1, 1.0, 1.2, 2.1, 1.3, 5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
    a.mean()
    // res0: Double = 5.3
  }

  /**
    * pipe
    *
    * Runs an external command once per partition; `head` must be available on the host.
    */
  def demo_pipe(): Unit = {
    val a = sc.parallelize(1 to 9, 3)
    a.pipe("head -n 1").collect()
    // res2: Array[String] = Array(1, 4, 7)
  }

  /**
    * randomSplit
    */
  def demo_randomSplit(): Unit = {
    val y = sc.parallelize(1 to 10)
    val splits = y.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0)
    val test = splits(1)
    training.collect()
    // res85: Array[Int] = Array(1, 4, 5, 6, 8, 10)
    test.collect()
    // res86: Array[Int] = Array(2, 3, 7, 9)

    //--------------------------------------
    // No seed is given here, so the split differs from run to run.
    val y2 = sc.parallelize(1 to 10)
    val splits2 = y2.randomSplit(Array(0.1, 0.3, 0.6))
    val rdd1 = splits2(0)
    val rdd2 = splits2(1)
    val rdd3 = splits2(2)
    rdd1.collect()
    // res87: Array[Int] = Array(4, 10)
    rdd2.collect()
    // res88: Array[Int] = Array(1, 3, 5, 8)
    rdd3.collect()
    // res91: Array[Int] = Array(2, 6, 7, 9)
  }

  /**
    * reduce
    */
  def demo_reduce(): Unit = {
    val a = sc.parallelize(1 to 100, 3)
    a.reduce(_ + _)
    // res41: Int = 5050
  }

  /**
    * reduceByKey [Pair]
    * def reduceByKey(func: (V, V) => V): RDD[(K, V)]
    * def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)]
    * def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)]
    * def reduceByKeyLocally(func: (V, V) => V): Map[K, V]
    * def reduceByKeyToDriver(func: (V, V) => V): Map[K, V]
    */
  def demo_reduceByKey(): Unit = {
    val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
    val b = a.map(x => (x.length, x))
    b.reduceByKey(_ + _).collect()
    // res86: Array[(Int, String)] = Array((3,dogcatowlgnuant))

    val a2 = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val b2 = a2.map(x => (x.length, x))
    b2.reduceByKey(_ + _).collect()
    // res87: Array[(Int, String)] = Array((4,lion), (3,dogcat), (7,panther), (5,tigereagle))
  }

  /**
    * repartition
    */
  def demo_repartition(): Unit = {
    val rdd = sc.parallelize(List(1, 2, 10, 4, 5, 2, 1, 1, 1), 3)
    rdd.partitions.length
    // res2: Int = 3
    val rdd2 = rdd.repartition(5)
    rdd2.partitions.length
    // res6: Int = 5
  }

  /**
    * rightOuterJoin [Pair]
    */
  def demo_rightOuterJoin(): Unit = {
    val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
    val b = a.keyBy(_.length)
    val c = sc.parallelize(
      List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
    val d = c.keyBy(_.length)
    b.rightOuterJoin(d).collect()
    // res2: Array[(Int, (Option[String], String))] = Array((6,(Some(salmon),salmon)),
    //   (6,(Some(salmon),rabbit)), (6,(Some(salmon),turkey)), (6,(Some(salmon),salmon)),
    //   (6,(Some(salmon),rabbit)), (6,(Some(salmon),turkey)), (3,(Some(dog),dog)),
    //   (3,(Some(dog),cat)), (3,(Some(dog),gnu)), (3,(Some(dog),bee)), (3,(Some(rat),dog)),
    //   (3,(Some(rat),cat)), (3,(Some(rat),gnu)), (3,(Some(rat),bee)), (4,(None,wolf)),
    //   (4,(None,bear)))
  }

  /**
    * sample
    */
  def demo_sample(): Unit = {
    val a = sc.parallelize(1 to 10000, 3)
    a.sample(withReplacement = false, fraction = 0.1, seed = 0).count()
    // res24: Long = 960
    a.sample(withReplacement = true, fraction = 0.3, seed = 0).count()
    // res25: Long = 2888
    a.sample(withReplacement = true, fraction = 0.3, seed = 13).count()
    // res26: Long = 2985
  }

  /**
    * sampleByKey [Pair]
    */
  def demo_sampleByKey(): Unit = {
    val randRDD = sc.parallelize(
      List((7, "cat"), (6, "mouse"), (7, "cup"), (6, "book"), (7, "tv"), (6, "screen"),
        (7, "heater")))
    // Sampling fraction per key.
    val sampleMap = Map(7 -> 0.4, 6 -> 0.6)
    randRDD.sampleByKey(withReplacement = false, fractions = sampleMap, seed = 42).collect()
    // res6: Array[(Int, String)] = Array((7,cat), (6,mouse), (6,book), (6,screen), (7,heater))
  }

  /**
    * saveAsHadoopFile [Pair],
    * saveAsHadoopDataset [Pair],
    * saveAsNewAPIHadoopFile [Pair]
    *
    * Only saveAsTextFile is actually exercised; it writes to "/tmp/listRDD".
    */
  def demo_saveAsHadoopFile(): Unit = {
    // The values mix String and Int, so the value type is stated explicitly as Any.
    val listRDD = sc.parallelize(
      List[(String, Any)](("name", "zhangsan"), ("age", 20), ("address", "wuhan")))
    listRDD.saveAsTextFile("/tmp/listRDD")
    // listRDD.saveAsHadoopFile("/tmp/listRDD.txt");
  }

  /**
    * saveAsObjectFile
    */
  def demo_saveAsObjectFile(): Unit = {
    val x = sc.parallelize(1 to 10, 3)
    x.saveAsObjectFile("objFile")
    val y = sc.objectFile[Int]("objFile")
    y.collect()
    // res52: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
  }

  /**
    * saveAsSequenceFile [Pair]
    */
  def demo_saveAsSequenceFile(): Unit = {
    val v = sc.parallelize(Array(("owl", 3), ("gnu", 4), ("dog", 1), ("cat", 2), ("ant", 5)), 2)
    v.saveAsSequenceFile("hd_seq_file")
    // 14/04/19 05:45:43 INFO FileOutputCommitter: Saved output of task
    //   'attempt_201404190545_0000_m_000001_191' to file:/home/cloudera/hd_seq_file
    //
    // [cloudera@localhost ~]$ ll ~/hd_seq_file
    // total 8
    // -rwxr-xr-x 1 cloudera cloudera 117 Apr 19 05:45 part-00000
    // -rwxr-xr-x 1 cloudera cloudera 133 Apr 19 05:45 part-00001
    // -rwxr-xr-x 1 cloudera cloudera   0 Apr 19 05:45 _SUCCESS
  }

  /**
    * sortBy
    */
  def demo_sortBy(): Unit = {
    val y = sc.parallelize(Array(5, 7, 1, 3, 2, 1))
    y.sortBy(c => c, ascending = true).collect()
    // res101: Array[Int] = Array(1, 1, 2, 3, 5, 7)
    y.sortBy(c => c, ascending = false).collect()
    // res102: Array[Int] = Array(7, 5, 3, 2, 1, 1)

    val z = sc.parallelize(Array(("H", 10), ("A", 26), ("Z", 1), ("L", 5)))
    z.sortBy(c => c._1, ascending = true).collect()
    // res109: Array[(String, Int)] = Array((A,26), (H,10), (L,5), (Z,1))
    z.sortBy(c => c._2, ascending = true).collect()
    // res108: Array[(String, Int)] = Array((Z,1), (L,5), (H,10), (A,26))
  }

  /**
    * sortByKey [Ordered]
    */
  def demo_sortByKey(): Unit = {
    val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
    val b = sc.parallelize(1 to a.count().toInt, 2)
    val c = a.zip(b)
    c.sortByKey(ascending = true).collect()
    // res74: Array[(String, Int)] = Array((ant,5), (cat,2), (dog,1), (gnu,4), (owl,3))
    c.sortByKey(ascending = false).collect()
    // res75: Array[(String, Int)] = Array((owl,3), (gnu,4), (dog,1), (cat,2), (ant,5))

    //=====================================================
    val aa = sc.parallelize(1 to 100, 5)
    // Int x Int pairs, matching the recorded Array[(Int, Int)] result below
    // (the String RDD `a` was crossed here by mistake).
    val bb = aa.cartesian(aa)
    val cc = sc.parallelize(bb.takeSample(withReplacement = true, num = 5, seed = 13), 2)
    val dd = cc.sortByKey(ascending = false)
    dd.collect()
    // res56: Array[(Int, Int)] = Array((96,9), (84,76), (59,59), (53,65), (52,4))
  }

  /**
    * subtract
    */
  def demo_subtract(): Unit = {
    val a = sc.parallelize(1 to 9, 3)
    val b = sc.parallelize(1 to 3, 3)
    val c = a.subtract(b)
    c.collect()
    // res3: Array[Int] = Array(6, 9, 4, 7, 5, 8)
  }

  /**
    * subtractByKey [Pair]
    */
  def demo_subtractByKey(): Unit = {
    val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "spider", "eagle"), 2)
    val b = a.keyBy(_.length)
    val c = sc.parallelize(List("ant", "falcon", "squid"), 2)
    val d = c.keyBy(_.length)
    b.subtractByKey(d).collect()
    // res15: Array[(Int, String)] = Array((4,lion))
  }

  /**
    * sum [Double]
    */
  def demo_sum(): Unit = {
    val x = sc.parallelize(List(1.0, 2.0, 3.0, 5.0, 20.0, 19.02, 19.29, 11.09, 21.0), 2)
    x.sum()
    // res17: Double = 101.39999999999999
  }

  /**
    * take
    */
  def demo_take(): Unit = {
    val b = sc.parallelize(List("dog", "cat", "ape", "salmon", "gnu"), 2)
    b.take(2)
    // res18: Array[String] = Array(dog, cat)

    val b2 = sc.parallelize(1 to 10000, 5000)
    b2.take(10)
    // res6: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
  }

  /**
    * takeOrdered
    */
  def demo_takeOrdered(): Unit = {
    val b = sc.parallelize(List("dog", "cat", "ape", "salmon", "gnu"), 2)
    b.takeOrdered(2)
    // res19: Array[String] = Array(ape, cat)
  }

  /**
    * takeSample
    */
  def demo_takeSample(): Unit = {
    val x = sc.parallelize(1 to 1000, 3)
    x.takeSample(withReplacement = true, num = 100, seed = 1)
  }

  /**
    * toJavaRDD
    */
  def demo_toJavaRDD(): Unit = {
    val c = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog"), 2)
    c.toJavaRDD()
    // res3: org.apache.spark.api.java.JavaRDD[String] =
    //   ParallelCollectionRDD[6] at parallelize at <console>:12
  }

  /**
    * toLocalIterator
    */
  def demo_toLocalIterator(): Unit = {
    val z = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)
    val iter = z.toLocalIterator
    iter.next()
    // res51: Int = 1
    iter.next()
    // res52: Int = 2
  }

  /**
    * top
    */
  def demo_top(): Unit = {
    val c = sc.parallelize(Array(6, 9, 4, 7, 5, 8), 2)
    c.top(2)
    // res28: Array[Int] = Array(9, 8)
  }

  /**
    * treeAggregate
    */
  def demo_treeAggregate(): Unit = {
    val z = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)

    z.mapPartitionsWithIndex(describePartition[Int]).collect()
    // res28: Array[String] = Array([partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3],
    //                              [partID:1, val: 4], [partID:1, val: 5], [partID:1, val: 6])

    z.treeAggregate(0)(math.max(_, _), _ + _)
    // res40: Int = 9

    z.treeAggregate(5)(math.max(_, _), _ + _)
    // res42: Int = 11
  }

  /**
    * treeReduce
    */
  def demo_treeReduce(): Unit = {
    val z = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)
    z.treeReduce(_ + _)
    // res49: Int = 21
  }

  /**
    * union, ++
    */
  def demo_union(): Unit = {
    val a = sc.parallelize(1 to 3, 1)
    val b = sc.parallelize(5 to 7, 1)
    (a ++ b).collect()
    // res0: Array[Int] = Array(1, 2, 3, 5, 6, 7)
  }

  /**
    * unpersist
    */
  def demo_unpersist(): Unit = {
    val y = sc.parallelize(1 to 10, 10)
    val z = y ++ y
    z.collect()
    z.unpersist(blocking = true)
    // 14/04/19 03:04:57 INFO UnionRDD: Removing RDD 22 from persistence list
    // 14/04/19 03:04:57 INFO BlockManager: Removing RDD 22
  }

  /**
    * values [Pair]
    */
  def demo_values(): Unit = {
    val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val b = a.map(x => (x.length, x))
    b.values.collect()
    // res3: Array[String] = Array(dog, tiger, lion, cat, panther, eagle)
  }

  /**
    * variance [Double], sampleVariance [Double]
    */
  def demo_variance(): Unit = {
    val a = sc.parallelize(
      List(9.1, 1.0, 1.2, 2.1, 1.3, 5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
    a.variance()
    // res70: Double = 10.605333333333332

    val x = sc.parallelize(List(1.0, 2.0, 3.0, 5.0, 20.0, 19.02, 19.29, 11.09, 21.0), 2)
    x.variance()
    // res14: Double = 66.04584444444443
    x.sampleVariance()
    // res13: Double = 74.30157499999999
  }

  /**
    * zip
    */
  def demo_zip(): Unit = {
    val a = sc.parallelize(1 to 100, 3)
    val b = sc.parallelize(101 to 200, 3)
    a.zip(b).collect()
    // res1: Array[(Int, Int)] = Array((1,101), (2,102), (3,103), (4,104), ...

    val aa = sc.parallelize(1 to 100, 3)
    val bb = sc.parallelize(101 to 200, 3)
    val cc = sc.parallelize(201 to 300, 3)
    // Zipping three RDDs nests the tuples as ((x, y), z); flatten them into triples.
    aa.zip(bb).zip(cc).map { case ((first, second), third) => (first, second, third) }.collect()
    // res12: Array[(Int, Int, Int)] = Array((1,101,201), (2,102,202), (3,103,203),...
  }

  /**
    * zipPartitions
    */
  def demo_zipParititions(): Unit = {
    val a = sc.parallelize(0 to 9, 3)
    val b = sc.parallelize(10 to 19, 3)
    val c = sc.parallelize(100 to 109, 3)

    // Joins the i-th elements of the three partition iterators into "x y z" and emits the
    // strings last-to-first (as in the recorded result below); stops at the shortest iterator.
    def myfunc(aiter: Iterator[Int], biter: Iterator[Int], citer: Iterator[Int]): Iterator[String] =
      aiter.zip(biter).zip(citer)
        .map { case ((x, y), z) => s"$x $y $z" }
        .toList.reverse.iterator

    a.zipPartitions(b, c)(myfunc).collect()
    // res50: Array[String] = Array(2 12 102, 1 11 101, 0 10 100, 5 15 105, 4 14 104, 3 13 103,
    //   9 19 109, 8 18 108, 7 17 107, 6 16 106)
  }

  /**
    * zipWithIndex
    */
  def demo_zipWithIndex(): Unit = {
    val z = sc.parallelize(Array("A", "B", "C", "D"))
    val r = z.zipWithIndex()
    r.collect()
    // res110: Array[(String, Long)] = Array((A,0), (B,1), (C,2), (D,3))

    val z2 = sc.parallelize(100 to 120, 5)
    val r2 = z2.zipWithIndex()
    r2.collect()
    // res11: Array[(Int, Long)] = Array((100,0), (101,1), (102,2), (103,3), (104,4), (105,5),
    //   (106,6), (107,7), (108,8), (109,9), (110,10), (111,11), (112,12), (113,13), (114,14),
    //   (115,15), (116,16), (117,17), (118,18), (119,19), (120,20))
  }

  /**
    * zipWithUniqueId
    */
  def demo_zipWithUniqueId(): Unit = {
    val z = sc.parallelize(100 to 120, 5)
    val r = z.zipWithUniqueId()
    r.collect()
    // res12: Array[(Int, Long)] = Array((100,0), (101,5), (102,10), (103,15), (104,1), (105,6),
    //   (106,11), (107,16), (108,2), (109,7), (110,12), (111,17), (112,3), (113,8), (114,13),
    //   (115,18), (116,4), (117,9), (118,14), (119,19), (120,24))
  }
}
0 0
原创粉丝点击