Spark 之RDD API大全
来源:互联网 发布:minitab比较多组数据 编辑:程序博客网 时间:2024/05/21 18:40
package scala

import org.apache.spark.{SparkConf, SparkContext}

/**
 * A catalogue of small demonstrations of the Spark RDD API, one method per operation.
 *
 * Every demo builds its own tiny RDD on the shared local [[SparkContext]] and invokes the
 * operation under discussion. The `// resN: ...` comments record the output of a sample
 * spark-shell session; element order (and, for sampling or approximate operations, the values
 * themselves) may differ between runs.
 *
 * The object deliberately does not extend `scala.App`: with `App`, `conf` and `sc` would only be
 * initialised when `main` runs, so calling a demo from other code would see a null context.
 */
object SparkAPI {

  /** Configuration shared by all demos: application name "SparkTransformationTest", master "local". */
  val conf: SparkConf = new SparkConf().setAppName("SparkTransformationTest").setMaster("local")

  /** The single context every demo runs on; created when the object is first initialised. */
  val sc: SparkContext = new SparkContext(conf)

  /**
   * Entry point. Launching the object only boots the context (via object initialisation) and then
   * shuts it down again; no demo is invoked automatically.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    sc.stop()
  }

  /**
   * Renders every element of a partition as `[partID:<index>, val: <element>]`.
   * Shared by the demos that show how elements are spread across partitions.
   *
   * @param index partition index supplied by `mapPartitionsWithIndex`
   * @param iter  the elements of that partition
   * @return one descriptive string per element, in partition order
   */
  private def describePartition[T](index: Int, iter: Iterator[T]): Iterator[String] =
    iter.map(x => s"[partID:$index, val: $x]")

  /**
   * aggregate
   *
   * Applies the first function inside each partition and the second across partitions; the zero
   * value takes part in both steps, which is why a non-neutral zero changes the result.
   */
  def demoAggregate(): Unit = {
    val z = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)
    z.mapPartitionsWithIndex(describePartition[Int]).collect()
    // res28: Array[String] = Array([partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3],
    //   [partID:1, val: 4], [partID:1, val: 5], [partID:1, val: 6])

    z.aggregate(0)(math.max(_, _), _ + _)
    // res40: Int = 9
    z.aggregate(5)(math.max(_, _), _ + _)
    // res29: Int = 16

    val z2 = sc.parallelize(List("a", "b", "c", "d", "e", "f"), 2)
    z2.mapPartitionsWithIndex(describePartition[String]).collect()
    // res31: Array[String] = Array([partID:0, val: a], [partID:0, val: b], [partID:0, val: c],
    //   [partID:1, val: d], [partID:1, val: e], [partID:1, val: f])

    z2.aggregate("")(_ + _, _ + _)
    // res115: String = abcdef
    z2.aggregate("x")(_ + _, _ + _)
    // res116: String = xxdefxabc

    val z3 = sc.parallelize(List("12", "23", "345", "4567"), 2)
    z3.aggregate("")((x, y) => math.max(x.length, y.length).toString, (x, y) => x + y)
    // res141: String = 42
    z3.aggregate("")((x, y) => math.min(x.length, y.length).toString, (x, y) => x + y)
    // res142: String = 11

    val z4 = sc.parallelize(List("12", "23", "345", ""), 2)
    z4.aggregate("")((x, y) => math.min(x.length, y.length).toString, (x, y) => x + y)
    // res143: String = 10
  }

  /**
   * aggregateByKey
   *
   * Like `aggregate`, but performed separately for the values of each key.
   */
  def demoAggregateByKey(): Unit = {
    val pairRDD = sc.parallelize(
      List(("cat", 2), ("cat", 5), ("mouse", 4), ("cat", 12), ("dog", 12), ("mouse", 2)), 2)
    pairRDD.mapPartitionsWithIndex(describePartition[(String, Int)]).collect()
    // res2: Array[String] = Array([partID:0, val: (cat,2)], [partID:0, val: (cat,5)],
    //   [partID:0, val: (mouse,4)], [partID:1, val: (cat,12)], [partID:1, val: (dog,12)],
    //   [partID:1, val: (mouse,2)])

    pairRDD.aggregateByKey(0)(math.max(_, _), _ + _).collect()
    // res3: Array[(String, Int)] = Array((dog,12), (cat,17), (mouse,6))
    pairRDD.aggregateByKey(100)(math.max(_, _), _ + _).collect()
    // res4: Array[(String, Int)] = Array((dog,100), (cat,200), (mouse,200))
  }

  /**
   * cartesian
   */
  def demo_cartesian(): Unit = {
    val x = sc.parallelize(List(1, 2, 3, 4, 5))
    val y = sc.parallelize(List(6, 7, 8, 9, 10))
    x.cartesian(y).collect()
    // res0: Array[(Int, Int)] = Array((1,6), (1,7), (1,8), (1,9), (1,10), (2,6), (2,7), (2,8),
    //   (2,9), (2,10), (3,6), (3,7), (3,8), (3,9), (3,10), (4,6), (5,6), (4,7), (5,7), (4,8),
    //   (5,8), (4,9), (4,10), (5,9), (5,10))
  }

  /**
   * checkpoint
   *
   * Marks the RDD for checkpointing under "/tmp", then runs an action on it.
   */
  def demo_checkpoint(): Unit = {
    sc.setCheckpointDir("/tmp")
    val a = sc.parallelize(1 to 4)
    // checkpoint() returns Unit, so it is called on its own instead of being printed.
    a.checkpoint()
    println(a.count())
  }

  /**
   * coalesce, repartition
   */
  def demo_coalesce(): Unit = {
    val y = sc.parallelize(1 to 10, 10)
    val z = y.coalesce(2, shuffle = false)
    z.partitions.length
    // res9: Int = 2
  }

  /**
   * cogroup [Pair], groupWith [Pair]
   */
  def demo_cogroup(): Unit = {
    val a = sc.parallelize(List(1, 2, 1, 3), 1)
    val b = a.map((_, "b"))
    val c = a.map((_, "c"))
    b.cogroup(c).collect()
    // res7: Array[(Int, (Iterable[String], Iterable[String]))] = Array(
    //   (2,(ArrayBuffer(b),ArrayBuffer(c))),
    //   (3,(ArrayBuffer(b),ArrayBuffer(c))),
    //   (1,(ArrayBuffer(b, b),ArrayBuffer(c, c)))
    // )

    val d = a.map((_, "d"))
    b.cogroup(c, d).collect()
    // res9: Array[(Int, (Iterable[String], Iterable[String], Iterable[String]))] = Array(
    //   (2,(ArrayBuffer(b),ArrayBuffer(c),ArrayBuffer(d))),
    //   (3,(ArrayBuffer(b),ArrayBuffer(c),ArrayBuffer(d))),
    //   (1,(ArrayBuffer(b, b),ArrayBuffer(c, c),ArrayBuffer(d, d)))
    // )

    val x = sc.parallelize(List((1, "apple"), (2, "banana"), (3, "orange"), (4, "kiwi")), 2)
    val y = sc.parallelize(List((5, "computer"), (1, "laptop"), (1, "desktop"), (4, "iPad")), 2)
    x.cogroup(y).collect()
    // res23: Array[(Int, (Iterable[String], Iterable[String]))] = Array(
    //   (4,(ArrayBuffer(kiwi),ArrayBuffer(iPad))),
    //   (2,(ArrayBuffer(banana),ArrayBuffer())),
    //   (3,(ArrayBuffer(orange),ArrayBuffer())),
    //   (1,(ArrayBuffer(apple),ArrayBuffer(laptop, desktop))),
    //   (5,(ArrayBuffer(),ArrayBuffer(computer))))
  }

  /**
   * collect, toArray
   */
  def demo_collect(): Unit = {
    val c = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"), 2)
    c.collect()
    // res29: Array[String] = Array(Gnu, Cat, Rat, Dog, Gnu, Rat)
  }

  /**
   * collectAsMap [Pair]
   */
  def demo_collectAsMap(): Unit = {
    val a = sc.parallelize(List(1, 2, 1, 3), 1)
    val b = a.zip(a)
    b.collectAsMap
    // res1: scala.collection.Map[Int,Int] = Map(2 -> 2, 1 -> 1, 3 -> 3)
  }

  /**
   * combineByKey [Pair]
   */
  def demo_combineByKey(): Unit = {
    val a = sc.parallelize(
      List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
    val b = sc.parallelize(List(1, 1, 2, 2, 2, 1, 2, 2, 2), 3)
    val c = b.zip(a)
    val d = c.combineByKey(
      List(_),
      (x: List[String], y: String) => y :: x,
      (x: List[String], y: List[String]) => x ::: y)
    d.collect()
    // res16: Array[(Int, List[String])] =
    //   Array((1,List(cat, dog, turkey)), (2,List(gnu, rabbit, salmon, bee, bear, wolf)))
  }

  /**
   * countApproxDistinct
   */
  def demo_countApproxDistinct(): Unit = {
    val a = sc.parallelize(1 to 10000, 20)
    val b = a ++ a ++ a ++ a ++ a
    b.countApproxDistinct(0.1)
    // res14: Long = 8224
    b.countApproxDistinct(0.05)
    // res15: Long = 9750
    b.countApproxDistinct(0.01)
    // res16: Long = 9947
    b.countApproxDistinct(0.001)
    // res0: Long = 10000
  }

  /**
   * countApproxDistinctByKey [Pair]
   */
  def demo_countApproxDistinctByKey(): Unit = {
    val a = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog"), 2)
    val b = sc.parallelize(a.takeSample(withReplacement = true, num = 10000, seed = 0), 20)
    val c = sc.parallelize(1 to b.count().toInt, 20)
    val d = b.zip(c)
    d.countApproxDistinctByKey(0.1).collect()
    // res15: Array[(String, Long)] = Array((Rat,2567), (Cat,3357), (Dog,2414), (Gnu,2494))
    d.countApproxDistinctByKey(0.01).collect()
    // res16: Array[(String, Long)] = Array((Rat,2555), (Cat,2455), (Dog,2425), (Gnu,2513))
    d.countApproxDistinctByKey(0.001).collect()
    // res0: Array[(String, Long)] = Array((Rat,2562), (Cat,2464), (Dog,2451), (Gnu,2521))
  }

  /**
   * countByKey [Pair]
   */
  def demo_countByKey(): Unit = {
    val c = sc.parallelize(List((3, "Gnu"), (3, "Yak"), (5, "Mouse"), (3, "Dog")), 2)
    c.countByKey
    // res3: scala.collection.Map[Int,Long] = Map(3 -> 3, 5 -> 1)
  }

  /**
   * countByValue
   */
  def demo_countByValue(): Unit = {
    val b = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 2, 4, 2, 1, 1, 1, 1, 1))
    b.countByValue
    // res27: scala.collection.Map[Int,Long] =
    //   Map(5 -> 1, 8 -> 1, 3 -> 1, 6 -> 1, 1 -> 6, 2 -> 3, 4 -> 2, 7 -> 1)
  }

  /**
   * distinct
   */
  def demo_distinct(): Unit = {
    val c = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"), 2)
    c.distinct.collect()
    // res6: Array[String] = Array(Dog, Gnu, Cat, Rat)

    val a = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    a.distinct(2).partitions.length
    // res16: Int = 2
    a.distinct(3).partitions.length
    // res17: Int = 3
  }

  /**
   * filter
   *
   * Also shows `collect` with a partial function, which keeps only the elements the function is
   * defined for and maps them in the same step.
   */
  def demo_filter(): Unit = {
    val numbers = sc.parallelize(1 to 10, 3)
    val evens = numbers.filter(_ % 2 == 0)
    evens.collect()
    // res3: Array[Int] = Array(2, 4, 6, 8, 10)

    //---------------------------------------------------------------------
    // The element type is deliberately Any so the partial function has something to select from.
    val mixed = sc.parallelize(List[Any]("cat", "horse", 4.0, 3.5, 2, "dog"))
    mixed.collect({
      case _: Int    => "is integer"
      case _: String => "is string"
    }).collect()
    // res17: Array[String] = Array(is string, is string, is integer, is string)

    val myfunc: PartialFunction[Any, Any] = {
      case _: Int    => "is integer"
      case _: String => "is string"
    }
    myfunc.isDefinedAt("")
    // res21: Boolean = true
    myfunc.isDefinedAt(1)
    // res22: Boolean = true
    myfunc.isDefinedAt(1.5)
    // res23: Boolean = false
  }

  /**
   * filterByRange
   */
  def demo_filterByRange(): Unit = {
    val randRDD = sc.parallelize(
      List((2, "cat"), (6, "mouse"), (7, "cup"), (3, "book"), (4, "tv"), (1, "screen"),
        (5, "heater")), 3)
    val sortedRDD = randRDD.sortByKey()
    sortedRDD.filterByRange(1, 3).collect().foreach { case (k, v) => println(s"$k:$v") }
    // res66: Array[(Int, String)] = Array((1,screen), (2,cat), (3,book))
  }

  /**
   * flatMap
   */
  def demo_flatMap(): Unit = {
    val a = sc.parallelize(1 to 10, 5)
    a.flatMap(1 to _).collect()
    // res47: Array[Int] = Array(1, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6,
    //   1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9,
    //   1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

    sc.parallelize(List(1, 2, 3), 2).flatMap(x => List(x, x, x)).collect()
    // res85: Array[Int] = Array(1, 1, 1, 2, 2, 2, 3, 3, 3)

    val x = sc.parallelize(1 to 10, 3)
    // Each element is repeated a random number of times (0 to 9), so the output varies per run.
    x.flatMap(List.fill(scala.util.Random.nextInt(10))(_)).collect()
    // res1: Array[Int] = Array(1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6,
    //   6, 6, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10)
  }

  /**
   * flatMapValues
   *
   * The mapped value is a String, which is treated as a sequence of Char, so every character
   * becomes its own (key, Char) pair.
   */
  def demo_flatMapValues(): Unit = {
    val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val b = a.map(x => (x.length, x))
    b.flatMapValues("x" + _ + "x").collect()
    // res6: Array[(Int, Char)] = Array((3,x), (3,d), (3,o), (3,g), (3,x), (5,x), (5,t), (5,i),
    //   (5,g), (5,e), (5,r), (5,x), (4,x), (4,l), (4,i), (4,o), (4,n), (4,x), (3,x), (3,c),
    //   (3,a), (3,t), (3,x), (7,x), (7,p), (7,a), (7,n), (7,t), (7,h), (7,e), (7,r), (7,x),
    //   (5,x), (5,e), (5,a), (5,g), (5,l), (5,e), (5,x))
  }

  /**
   * fold
   *
   * def fold(zeroValue: T)(op: (T, T) => T): T
   */
  def demo_fold(): Unit = {
    val a = sc.parallelize(List(1, 2, 3), 3)
    a.fold(0)(_ + _)
    // res59: Int = 6
  }

  /**
   * foldByKey [Pair]
   *
   * def foldByKey(zeroValue: V)(func: (V, V) => V): RDD[(K, V)]
   * def foldByKey(zeroValue: V, numPartitions: Int)(func: (V, V) => V): RDD[(K, V)]
   * def foldByKey(zeroValue: V, partitioner: Partitioner)(func: (V, V) => V): RDD[(K, V)]
   */
  def demo_foldByKey(): Unit = {
    val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
    val b = a.map(x => (x.length, x))
    b.foldByKey("")(_ + _).collect()
    // res84: Array[(Int, String)] = Array((3,dogcatowlgnuant))

    val a2 = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val b2 = a2.map(x => (x.length, x))
    b2.foldByKey("")(_ + _).collect()
    // res85: Array[(Int, String)] = Array((4,lion), (3,dogcat), (7,panther), (5,tigereagle))
  }

  /**
   * foreachPartition
   *
   * def foreachPartition(f: Iterator[T] => Unit)
   */
  def demo_foreachPartition(): Unit = {
    val b = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9), 3)
    // sum (unlike reduce) is defined for an empty partition as well.
    b.foreachPartition(part => println(part.sum))
    // 6
    // 15
    // 24
  }

  /**
   * fullOuterJoin
   *
   * def fullOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Option[V], Option[W]))]
   * def fullOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (Option[V], Option[W]))]
   * def fullOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (Option[V], Option[W]))]
   */
  def demo_fullOuterJoin(): Unit = {
    val pairRDD1 = sc.parallelize(List(("cat", 2), ("cat", 5), ("book", 4), ("cat", 12)))
    val pairRDD2 = sc.parallelize(List(("cat", 2), ("cup", 5), ("mouse", 4), ("cat", 12)))
    pairRDD1.fullOuterJoin(pairRDD2).collect()
    // res5: Array[(String, (Option[Int], Option[Int]))] = Array((book,(Some(4),None)),
    //   (mouse,(None,Some(4))), (cup,(None,Some(5))), (cat,(Some(2),Some(2))),
    //   (cat,(Some(2),Some(12))), (cat,(Some(5),Some(2))), (cat,(Some(5),Some(12))),
    //   (cat,(Some(12),Some(2))), (cat,(Some(12),Some(12))))
  }

  /**
   * groupBy
   *
   * def groupBy[K: ClassTag](f: T => K): RDD[(K, Iterable[T])]
   * def groupBy[K: ClassTag](f: T => K, numPartitions: Int): RDD[(K, Iterable[T])]
   * def groupBy[K: ClassTag](f: T => K, p: Partitioner): RDD[(K, Iterable[T])]
   */
  def demo_groupBy(): Unit = {
    val a = sc.parallelize(1 to 9, 3)
    a.groupBy(x => if (x % 2 == 0) "even" else "odd").collect()
    // res42: Array[(String, Seq[Int])] =
    //   Array((even,ArrayBuffer(2, 4, 6, 8)), (odd,ArrayBuffer(1, 3, 5, 7, 9)))

    def myfunc(aa: Int): Int = aa % 2

    a.groupBy(myfunc).collect()
    // res3: Array[(Int, Seq[Int])] =
    //   Array((0,ArrayBuffer(2, 4, 6, 8)), (1,ArrayBuffer(1, 3, 5, 7, 9)))

    // a.groupBy(x => myfunc(x), 3).collect
    a.groupBy(myfunc(_), 1).collect()
    // res7: Array[(Int, Seq[Int])] =
    //   Array((0,ArrayBuffer(2, 4, 6, 8)), (1,ArrayBuffer(1, 3, 5, 7, 9)))
  }

  /**
   * groupByKey [Pair]
   *
   * def groupByKey(): RDD[(K, Iterable[V])]
   * def groupByKey(numPartitions: Int): RDD[(K, Iterable[V])]
   * def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])]
   */
  def demo_groupByKey(): Unit = {
    val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "spider", "eagle"), 2)
    val b = a.keyBy(_.length)
    b.groupByKey().collect()
    // res11: Array[(Int, Seq[String])] = Array((4,ArrayBuffer(lion)), (6,ArrayBuffer(spider)),
    //   (3,ArrayBuffer(dog, cat)), (5,ArrayBuffer(tiger, eagle)))
  }

  /**
   * histogram [Double]
   *
   * def histogram(bucketCount: Int): (Array[Double], Array[Long])
   * def histogram(buckets: Array[Double], evenBuckets: Boolean = false): Array[Long]
   */
  def demo_histogram(): Unit = {
    val a1 = sc.parallelize(List(1.1, 1.2, 1.3, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 9.0), 3)
    a1.histogram(5)
    // res11: (Array[Double], Array[Long]) =
    //   (Array(1.1, 2.68, 4.26, 5.84, 7.42, 9.0),Array(5, 0, 0, 1, 4))

    val a2 = sc.parallelize(
      List(9.1, 1.0, 1.2, 2.1, 1.3, 5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
    a2.histogram(6)
    // res18: (Array[Double], Array[Long]) =
    //   (Array(1.0, 2.5, 4.0, 5.5, 7.0, 8.5, 10.0),Array(6, 0, 1, 1, 3, 4))

    val b1 = sc.parallelize(List(1.1, 1.2, 1.3, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 9.0), 3)
    b1.histogram(Array(0.0, 3.0, 8.0))
    // res14: Array[Long] = Array(5, 3)

    val b2 = sc.parallelize(
      List(9.1, 1.0, 1.2, 2.1, 1.3, 5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
    b2.histogram(Array(0.0, 5.0, 10.0))
    // res1: Array[Long] = Array(6, 9)
    b2.histogram(Array(0.0, 5.0, 10.0, 15.0))
    // res1: Array[Long] = Array(6, 8, 1)
  }

  /**
   * intersection
   */
  def demo_intersection(): Unit = {
    val x = sc.parallelize(1 to 20)
    val y = sc.parallelize(10 to 30)
    val z = x.intersection(y)
    z.collect()
    // res74: Array[Int] = Array(16, 12, 20, 13, 17, 14, 18, 10, 19, 15, 11)
  }

  /**
   * join
   */
  def demo_join(): Unit = {
    val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
    val b = a.keyBy(_.length)
    val c = sc.parallelize(
      List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
    val d = c.keyBy(_.length)
    b.join(d).collect()
    // res0: Array[(Int, (String, String))] = Array((6,(salmon,salmon)), (6,(salmon,rabbit)),
    //   (6,(salmon,turkey)), (6,(salmon,salmon)), (6,(salmon,rabbit)), (6,(salmon,turkey)),
    //   (3,(dog,dog)), (3,(dog,cat)), (3,(dog,gnu)), (3,(dog,bee)), (3,(rat,dog)),
    //   (3,(rat,cat)), (3,(rat,gnu)), (3,(rat,bee)))
  }

  /**
   * lookup
   */
  def demo_lookup(): Unit = {
    val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val b = a.map(x => (x.length, x))
    b.lookup(5)
    // res0: Seq[String] = WrappedArray(tiger, eagle)
  }

  /**
   * map
   */
  def demo_map(): Unit = {
    val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
    val b = a.map(_.length)
    val c = a.zip(b)
    c.collect()
    // res0: Array[(String, Int)] =
    //   Array((dog,3), (salmon,6), (salmon,6), (rat,3), (elephant,8))
  }

  /**
   * mapPartitions
   *
   * Pairs every element with its successor inside the same partition.
   */
  def demo_mapPartitions(): Unit = {
    val a = sc.parallelize(1 to 9, 3)

    // Builds (previous, current) pairs; the newest pair is prepended, so pairs come out in
    // reverse order. An empty partition yields no pairs instead of failing on next().
    def myfunc[T](iter: Iterator[T]): Iterator[(T, T)] =
      if (!iter.hasNext) Iterator.empty
      else {
        val first = iter.next()
        val (_, pairs) = iter.foldLeft((first, List.empty[(T, T)])) {
          case ((prev, acc), cur) => (cur, (prev, cur) :: acc)
        }
        pairs.iterator
      }

    a.mapPartitions(myfunc).collect()
    // res0: Array[(Int, Int)] = Array((2,3), (1,2), (5,6), (4,5), (8,9), (7,8))
  }

  /**
   * mapPartitionsWithIndex
   */
  def demo_mapPartitionsWithIndex(): Unit = {
    val x = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3)

    def myfunc(index: Int, iter: Iterator[Int]): Iterator[String] =
      iter.map(value => s"$index,$value")

    x.mapPartitionsWithIndex(myfunc).collect()
    // res10: Array[String] = Array(0,1, 0,2, 0,3, 1,4, 1,5, 1,6, 2,7, 2,8, 2,9, 2,10)
  }

  /**
   * mapValues
   */
  def demo_mapValues(): Unit = {
    val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val b = a.map(x => (x.length, x))
    b.mapValues(v => s"x${v}x").collect()
    // res5: Array[(Int, String)] = Array((3,xdogx), (5,xtigerx), (4,xlionx), (3,xcatx),
    //   (7,xpantherx), (5,xeaglex))
  }

  /**
   * max
   */
  def demo_max(): Unit = {
    val y = sc.parallelize(10 to 30)
    y.max
    // res75: Int = 30

    val a = sc.parallelize(List((10, "dog"), (3, "tiger"), (9, "lion"), (18, "cat")))
    a.max
    // res6: (Int, String) = (18,cat)
  }

  /**
   * min
   */
  def demo_min(): Unit = {
    val y = sc.parallelize(10 to 30)
    y.min
    // res75: Int = 10

    val a = sc.parallelize(List((10, "dog"), (3, "tiger"), (9, "lion"), (8, "cat")))
    a.min
    // res4: (Int, String) = (3,tiger)
  }

  /**
   * mean [Double], meanApprox [Double]
   */
  def demo_mean(): Unit = {
    val a = sc.parallelize(
      List(9.1, 1.0, 1.2, 2.1, 1.3, 5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
    a.mean
    // res0: Double = 5.3
  }

  /**
   * pipe
   *
   * Sends each partition through the external command `head -n 1`.
   */
  def demo_pipe(): Unit = {
    val a = sc.parallelize(1 to 9, 3)
    a.pipe("head -n 1").collect()
    // res2: Array[String] = Array(1, 4, 7)
  }

  /**
   * randomSplit
   */
  def demo_randomSplit(): Unit = {
    val y = sc.parallelize(1 to 10)
    val splits = y.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0)
    val test = splits(1)
    training.collect()
    // res85: Array[Int] = Array(1, 4, 5, 6, 8, 10)
    test.collect()
    // res86: Array[Int] = Array(2, 3, 7, 9)

    //--------------------------------------
    val y2 = sc.parallelize(1 to 10)
    val splits2 = y2.randomSplit(Array(0.1, 0.3, 0.6))
    val rdd1 = splits2(0)
    val rdd2 = splits2(1)
    val rdd3 = splits2(2)
    rdd1.collect()
    // res87: Array[Int] = Array(4, 10)
    rdd2.collect()
    // res88: Array[Int] = Array(1, 3, 5, 8)
    rdd3.collect()
    // res91: Array[Int] = Array(2, 6, 7, 9)
  }

  /**
   * reduce
   */
  def demo_reduce(): Unit = {
    val a = sc.parallelize(1 to 100, 3)
    a.reduce(_ + _)
    // res41: Int = 5050
  }

  /**
   * reduceByKey
   *
   * def reduceByKey(func: (V, V) => V): RDD[(K, V)]
   * def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)]
   * def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)]
   * def reduceByKeyLocally(func: (V, V) => V): Map[K, V]
   * def reduceByKeyToDriver(func: (V, V) => V): Map[K, V]
   */
  def demo_reduceByKey(): Unit = {
    val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
    val b = a.map(x => (x.length, x))
    b.reduceByKey(_ + _).collect()
    // res86: Array[(Int, String)] = Array((3,dogcatowlgnuant))

    val a2 = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val b2 = a2.map(x => (x.length, x))
    b2.reduceByKey(_ + _).collect()
    // res87: Array[(Int, String)] = Array((4,lion), (3,dogcat), (7,panther), (5,tigereagle))
  }

  /**
   * repartition
   */
  def demo_repartition(): Unit = {
    val rdd = sc.parallelize(List(1, 2, 10, 4, 5, 2, 1, 1, 1), 3)
    rdd.partitions.length
    // res2: Int = 3
    val rdd2 = rdd.repartition(5)
    rdd2.partitions.length
    // res6: Int = 5
  }

  /**
   * rightOuterJoin
   */
  def demo_rightOuterJoin(): Unit = {
    val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
    val b = a.keyBy(_.length)
    val c = sc.parallelize(
      List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
    val d = c.keyBy(_.length)
    b.rightOuterJoin(d).collect()
    // res2: Array[(Int, (Option[String], String))] = Array((6,(Some(salmon),salmon)),
    //   (6,(Some(salmon),rabbit)), (6,(Some(salmon),turkey)), (6,(Some(salmon),salmon)),
    //   (6,(Some(salmon),rabbit)), (6,(Some(salmon),turkey)), (3,(Some(dog),dog)),
    //   (3,(Some(dog),cat)), (3,(Some(dog),gnu)), (3,(Some(dog),bee)), (3,(Some(rat),dog)),
    //   (3,(Some(rat),cat)), (3,(Some(rat),gnu)), (3,(Some(rat),bee)), (4,(None,wolf)),
    //   (4,(None,bear)))
  }

  /**
   * sample
   */
  def demo_sample(): Unit = {
    val a = sc.parallelize(1 to 10000, 3)
    a.sample(withReplacement = false, fraction = 0.1, seed = 0).count()
    // res24: Long = 960
    a.sample(withReplacement = true, fraction = 0.3, seed = 0).count()
    // res25: Long = 2888
    a.sample(withReplacement = true, fraction = 0.3, seed = 13).count()
    // res26: Long = 2985
  }

  /**
   * sampleByKey
   */
  def demo_sampleByKey(): Unit = {
    val randRDD = sc.parallelize(
      List((7, "cat"), (6, "mouse"), (7, "cup"), (6, "book"), (7, "tv"), (6, "screen"),
        (7, "heater")))
    // Sampling fraction per key.
    val sampleMap = Map(7 -> 0.4, 6 -> 0.6)
    randRDD.sampleByKey(withReplacement = false, fractions = sampleMap, seed = 42).collect()
    // res6: Array[(Int, String)] = Array((7,cat), (6,mouse), (6,book), (6,screen), (7,heater))
  }

  /**
   * saveAsHadoopFile [Pair],
   * saveAsHadoopDataset [Pair],
   * saveAsNewAPIHadoopFile [Pair]
   *
   * Only the plain-text variant is exercised; it writes to "/tmp/listRDD".
   */
  def demo_saveAsHadoopFile(): Unit = {
    // The values mix String and Int, so the value type is spelled out as Any.
    val listRDD = sc.parallelize(
      List[(String, Any)](("name", "zhangsan"), ("age", 20), ("address", "wuhan")))
    listRDD.saveAsTextFile("/tmp/listRDD")
    // listRDD.saveAsHadoopFile("/tmp/listRDD.txt");
  }

  /**
   * saveAsObjectFile
   *
   * Writes the RDD to "objFile" and reads it back.
   */
  def demo_saveAsObjectFile(): Unit = {
    val x = sc.parallelize(1 to 10, 3)
    x.saveAsObjectFile("objFile")
    val y = sc.objectFile[Int]("objFile")
    y.collect()
    // res52: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
  }

  /**
   * saveAsSequenceFile
   */
  def demo_saveAsSequenceFile(): Unit = {
    val v = sc.parallelize(Array(("owl", 3), ("gnu", 4), ("dog", 1), ("cat", 2), ("ant", 5)), 2)
    v.saveAsSequenceFile("hd_seq_file")
    // 14/04/19 05:45:43 INFO FileOutputCommitter: Saved output of task
    //   'attempt_201404190545_0000_m_000001_191' to file:/home/cloudera/hd_seq_file
    //
    // [cloudera@localhost ~]$ ll ~/hd_seq_file
    // total 8
    // -rwxr-xr-x 1 cloudera cloudera 117 Apr 19 05:45 part-00000
    // -rwxr-xr-x 1 cloudera cloudera 133 Apr 19 05:45 part-00001
    // -rwxr-xr-x 1 cloudera cloudera   0 Apr 19 05:45 _SUCCESS
  }

  /**
   * sortBy
   */
  def demo_sortBy(): Unit = {
    val y = sc.parallelize(Array(5, 7, 1, 3, 2, 1))
    y.sortBy(n => n, ascending = true).collect()
    // res101: Array[Int] = Array(1, 1, 2, 3, 5, 7)
    y.sortBy(n => n, ascending = false).collect()
    // res102: Array[Int] = Array(7, 5, 3, 2, 1, 1)

    val z = sc.parallelize(Array(("H", 10), ("A", 26), ("Z", 1), ("L", 5)))
    z.sortBy(_._1, ascending = true).collect()
    // res109: Array[(String, Int)] = Array((A,26), (H,10), (L,5), (Z,1))
    z.sortBy(_._2, ascending = true).collect()
    // res108: Array[(String, Int)] = Array((Z,1), (L,5), (H,10), (A,26))
  }

  /**
   * sortByKey
   */
  def demo_sortByKey(): Unit = {
    val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
    val b = sc.parallelize(1 to a.count().toInt, 2)
    val c = a.zip(b)
    c.sortByKey(ascending = true).collect()
    // res74: Array[(String, Int)] = Array((ant,5), (cat,2), (dog,1), (gnu,4), (owl,3))
    c.sortByKey(ascending = false).collect()
    // res75: Array[(String, Int)] = Array((owl,3), (gnu,4), (dog,1), (cat,2), (ant,5))

    //=====================================================
    val aa = sc.parallelize(1 to 100, 5)
    // Pair the numeric RDD with itself so that the sampled pairs are (Int, Int).
    val bb = aa.cartesian(aa)
    val cc = sc.parallelize(bb.takeSample(withReplacement = true, num = 5, seed = 13), 2)
    val dd = cc.sortByKey(ascending = false)
    dd.collect()
    // res56: Array[(Int, Int)] = Array((96,9), (84,76), (59,59), (53,65), (52,4))
  }

  /**
   * subtract
   */
  def demo_subtract(): Unit = {
    val a = sc.parallelize(1 to 9, 3)
    val b = sc.parallelize(1 to 3, 3)
    val c = a.subtract(b)
    c.collect()
    // res3: Array[Int] = Array(6, 9, 4, 7, 5, 8)
  }

  /**
   * subtractByKey
   */
  def demo_subtractByKey(): Unit = {
    val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "spider", "eagle"), 2)
    val b = a.keyBy(_.length)
    val c = sc.parallelize(List("ant", "falcon", "squid"), 2)
    val d = c.keyBy(_.length)
    b.subtractByKey(d).collect()
    // res15: Array[(Int, String)] = Array((4,lion))
  }

  /**
   * sum
   */
  def demo_sum(): Unit = {
    val x = sc.parallelize(List(1.0, 2.0, 3.0, 5.0, 20.0, 19.02, 19.29, 11.09, 21.0), 2)
    x.sum
    // res17: Double = 101.39999999999999
  }

  /**
   * take
   */
  def demo_take(): Unit = {
    val b = sc.parallelize(List("dog", "cat", "ape", "salmon", "gnu"), 2)
    b.take(2)
    // res18: Array[String] = Array(dog, cat)

    val b2 = sc.parallelize(1 to 10000, 5000)
    b2.take(10)
    // res6: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
  }

  /**
   * takeOrdered
   */
  def demo_takeOrdered(): Unit = {
    val b = sc.parallelize(List("dog", "cat", "ape", "salmon", "gnu"), 2)
    b.takeOrdered(2)
    // res19: Array[String] = Array(ape, cat)
  }

  /**
   * takeSample
   */
  def demo_takeSample(): Unit = {
    val x = sc.parallelize(1 to 1000, 3)
    x.takeSample(withReplacement = true, num = 100, seed = 1)
  }

  /**
   * toJavaRDD
   */
  def demo_toJavaRDD(): Unit = {
    val c = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog"), 2)
    c.toJavaRDD
    // res3: org.apache.spark.api.java.JavaRDD[String] =
    //   ParallelCollectionRDD[6] at parallelize at <console>:12
  }

  /**
   * toLocalIterator
   */
  def demo_toLocalIterator(): Unit = {
    val z = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)
    val it = z.toLocalIterator
    it.next()
    // res51: Int = 1
    it.next()
    // res52: Int = 2
  }

  /**
   * top
   */
  def demo_top(): Unit = {
    val c = sc.parallelize(Array(6, 9, 4, 7, 5, 8), 2)
    c.top(2)
    // res28: Array[Int] = Array(9, 8)
  }

  /**
   * treeAggregate
   *
   * Same call shape as `aggregate`; compare the recorded result for the zero value 5 with the
   * one in `demoAggregate`.
   */
  def demo_treeAggregate(): Unit = {
    val z = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)
    z.mapPartitionsWithIndex(describePartition[Int]).collect()
    // res28: Array[String] = Array([partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3],
    //   [partID:1, val: 4], [partID:1, val: 5], [partID:1, val: 6])

    z.treeAggregate(0)(math.max(_, _), _ + _)
    // res40: Int = 9
    z.treeAggregate(5)(math.max(_, _), _ + _)
    // res42: Int = 11
  }

  /**
   * treeReduce
   */
  def demo_treeReduce(): Unit = {
    val z = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)
    z.treeReduce(_ + _)
    // res49: Int = 21
  }

  /**
   * union, ++
   */
  def demo_union(): Unit = {
    val a = sc.parallelize(1 to 3, 1)
    val b = sc.parallelize(5 to 7, 1)
    (a ++ b).collect()
    // res0: Array[Int] = Array(1, 2, 3, 5, 6, 7)
  }

  /**
   * unpersist
   */
  def demo_unpersist(): Unit = {
    val y = sc.parallelize(1 to 10, 10)
    val z = y ++ y
    z.collect()
    z.unpersist(blocking = true)
    // 14/04/19 03:04:57 INFO UnionRDD: Removing RDD 22 from persistence list
    // 14/04/19 03:04:57 INFO BlockManager: Removing RDD 22
  }

  /**
   * values
   */
  def demo_values(): Unit = {
    val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val b = a.map(x => (x.length, x))
    b.values.collect()
    // res3: Array[String] = Array(dog, tiger, lion, cat, panther, eagle)
  }

  /**
   * variance [Double], sampleVariance [Double]
   */
  def demo_variance(): Unit = {
    val a = sc.parallelize(
      List(9.1, 1.0, 1.2, 2.1, 1.3, 5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
    a.variance
    // res70: Double = 10.605333333333332

    val x = sc.parallelize(List(1.0, 2.0, 3.0, 5.0, 20.0, 19.02, 19.29, 11.09, 21.0), 2)
    x.variance
    // res14: Double = 66.04584444444443
    x.sampleVariance
    // res13: Double = 74.30157499999999
  }

  /**
   * zip
   */
  def demo_zip(): Unit = {
    val a = sc.parallelize(1 to 100, 3)
    val b = sc.parallelize(101 to 200, 3)
    a.zip(b).collect()
    // res1: Array[(Int, Int)] = Array((1,101), (2,102), (3,103), (4,104), ...

    val aa = sc.parallelize(1 to 100, 3)
    val bb = sc.parallelize(101 to 200, 3)
    val cc = sc.parallelize(201 to 300, 3)
    // Zipping twice nests the tuples as ((x, y), z); flatten them into triples.
    aa.zip(bb).zip(cc).map { case ((x, y), z) => (x, y, z) }.collect()
    // res12: Array[(Int, Int, Int)] = Array((1,101,201), (2,102,202), (3,103,203),...
  }

  /**
   * zipPartitions
   *
   * (The method name keeps its historical spelling so existing callers are unaffected.)
   */
  def demo_zipParititions(): Unit = {
    val a = sc.parallelize(0 to 9, 3)
    val b = sc.parallelize(10 to 19, 3)
    val c = sc.parallelize(100 to 109, 3)

    // Joins the i-th elements of the three partitions with spaces; the result is emitted in
    // reverse order within each partition, as in the recorded output below.
    def myfunc(aiter: Iterator[Int], biter: Iterator[Int], citer: Iterator[Int]): Iterator[String] =
      aiter.zip(biter).zip(citer)
        .map { case ((x, y), z) => s"$x $y $z" }
        .toList
        .reverse
        .iterator

    a.zipPartitions(b, c)(myfunc).collect()
    // res50: Array[String] = Array(2 12 102, 1 11 101, 0 10 100, 5 15 105, 4 14 104, 3 13 103,
    //   9 19 109, 8 18 108, 7 17 107, 6 16 106)
  }

  /**
   * zipWithIndex
   */
  def demo_zipWithIndex(): Unit = {
    val z = sc.parallelize(Array("A", "B", "C", "D"))
    val r = z.zipWithIndex
    // res110: Array[(String, Long)] = Array((A,0), (B,1), (C,2), (D,3))

    val z2 = sc.parallelize(100 to 120, 5)
    val r2 = z2.zipWithIndex
    r2.collect()
    // res11: Array[(Int, Long)] = Array((100,0), (101,1), (102,2), (103,3), (104,4), (105,5),
    //   (106,6), (107,7), (108,8), (109,9), (110,10), (111,11), (112,12), (113,13), (114,14),
    //   (115,15), (116,16), (117,17), (118,18), (119,19), (120,20))
  }

  /**
   * zipWithUniqueId
   */
  def demo_zipWithUniqueId(): Unit = {
    val z = sc.parallelize(100 to 120, 5)
    val r = z.zipWithUniqueId
    r.collect()
    // res12: Array[(Int, Long)] = Array((100,0), (101,5), (102,10), (103,15), (104,1), (105,6),
    //   (106,11), (107,16), (108,2), (109,7), (110,12), (111,17), (112,3), (113,8), (114,13),
    //   (115,18), (116,4), (117,9), (118,14), (119,19), (120,24))
  }
}
0 0
- Spark 之RDD API大全
- spark RDD API详解
- Spark-RDD API
- spark rdd api
- spark rdd api
- Spark RDD API详解
- spark-rdd-api
- Spark RDD API 详解
- Spark RDD API详解
- spark rdd操作API
- Spark RDD API
- Spark RDD API
- spark RDD api
- Spark RDD API详解之Map和Reduce
- Spark API 详解/大白话解释 之 RDD、partition、count、collect
- 【实践】Spark RDD API实战
- Spark RDD---api(map&reduce)
- Spark RDD API 基本操作
- Linux下进度条的简单实现
- 记录一下shell简单写法
- 使用opencv在debug正常release下报错
- Python里面截取指定的字符串
- Arduino学习日记(3)——使用webclientget方式发送数据后自动断开连接?
- Spark 之RDD API大全
- 基本数据结构——图
- 4.11
- 高位低位存储方式以及数组地址细节
- 题目1036:Old Bill
- 静态块 和构造器在继承情况下执行顺序
- URAL1993-This cheeseburger you don't need
- HDU 4920 Matrix multiplication 暴力(bitset)
- Maven聚合和继承