Spark API Examples

Creation operators

SparkContext.makeRDD - create an RDD

scala> val rdd=sc.makeRDD(1 to 6,2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[137] at makeRDD at <console>:25

scala> rdd.collect
res85: Array[Int] = Array(1, 2, 3, 4, 5, 6)

scala> rdd.partitions
res86: Array[org.apache.spark.Partition] = Array(org.apache.spark.rdd.ParallelCollectionPartition@1c82, org.apache.spark.rdd.ParallelCollectionPartition@1c83)

scala> val data=Seq((1 to 6,Seq("host1,host2")),(7 to 10,Seq("host3")))
data: Seq[(scala.collection.immutable.Range.Inclusive, Seq[String])] = List((Range(1, 2, 3, 4, 5, 6),List(host1,host2)), (Range(7, 8, 9, 10),List(host3)))

scala> val rdd2=sc.makeRDD(data)
rdd2: org.apache.spark.rdd.RDD[scala.collection.immutable.Range.Inclusive] = ParallelCollectionRDD[138] at makeRDD at <console>:27

scala> rdd2.collect
res87: Array[scala.collection.immutable.Range.Inclusive] = Array(Range(1, 2, 3, 4, 5, 6), Range(7, 8, 9, 10))

scala> rdd2.preferredLocations(rdd2.partitions(0))
res89: Seq[String] = List(host1,host2)

scala> rdd2.preferredLocations(rdd2.partitions(1))
res90: Seq[String] = List(host3)





SparkContext.parallelize - create an RDD by parallelizing a collection

scala> val rdd=sc.makeRDD(1 to 6,2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[137] at makeRDD at <console>:25

scala> rdd.collect
res91: Array[Int] = Array(1, 2, 3, 4, 5, 6)

scala> rdd.partitions
res92: Array[org.apache.spark.Partition] = Array(org.apache.spark.rdd.ParallelCollectionPartition@1c82, org.apache.spark.rdd.ParallelCollectionPartition@1c83)






SparkContext.textFile - create an RDD from a text file

scala> val textFile=sc.textFile("/Users/user/Desktop/block_1.csv")
textFile: org.apache.spark.rdd.RDD[String] = /Users/user/Desktop/block_1.csv MapPartitionsRDD[144] at textFile at <console>:25

scala> textFile.count
res95: Long = 574914

scala> textFile.first()
res96: String = "id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"





SparkContext.wholeTextFiles - create an RDD from all the text files in a directory

scala>  val rdd=sc.wholeTextFiles("/Users/user/Desktop")
rdd: org.apache.spark.rdd.RDD[(String, String)] = /Users/user/Desktop MapPartitionsRDD[146] at wholeTextFiles at <console>:25

scala> rdd.count
res97: Long = 3

scala> rdd.first()
res98: (String, String) =
(file:/Users/user/Desktop/block_1.csv,""id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"
37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE
39086,47614,1,?,1,?,1,1,1,1,1,TRUE
70031,70237,1,?,1,?,1,1,1,1,1,TRUE
84795,97439,1,?,1,?,1,1,1,1,1,TRUE
36950,42116,1,?,1,1,1,1,1,1,1,TRUE
42413,48491,1,?,1,?,1,1,1,1,1,TRUE
25965,64753,1,?,1,?,1,1,1,1,1,TRUE
49451,90407,1,?,1,?,1,1,1,1,0,TRUE
39932,40902,1,?,1,?,1,1,1,1,1,TRUE
46626,47940,1,?,1,?,1,1,1,1,1,TRUE
48948,98379,1,?,1,?,1,1,1,1,1,TRUE
4767,4826,1,?,1,?,1,1,1,1,1,TRUE
45463,69659,1,?,1,?,1,1,1,1,1,TRUE
11367,13169,1,?,1,?,1,1,1,1,1,TRUE
10782,89636,1,?,1,?,1,0,1,1,1,TRUE
26206,39147,1,?,1,?,1,1,1,1,1,TRUE
16662,27083,1,1,1,?,1,1,1,...


Transformation operators

map - transform each element
def map[U](f: (T) ⇒ U)(implicit arg0: ClassTag[U]): RDD[U]
Return a new RDD by applying a function to all elements of this RDD.

----------
scala> val rdd=sc.parallelize(List(1,2,3,4,5,6,7,8),4)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> rdd.map(x=>x+1).collect
res0: Array[Int] = Array(2, 3, 4, 5, 6, 7, 8, 9)

map applies the function to every element of the RDD and returns a new RDD. Each line below is one partition of the rdd above, transformed by map(x=>x+1).
1,2 => 2,3 
3,4 => 4,5
5,6 => 6,7
7,8 => 8,9



coalesce - repartition
def coalesce(numPartitions: Int, shuffle: Boolean = false, partitionCoalescer: Option[PartitionCoalescer] = Option.empty)(implicit ord: Ordering[T] = null): RDD[T]
Return a new RDD that is reduced into numPartitions partitions.

This results in a narrow dependency, e.g. if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of the 100 new partitions will claim 10 of the current partitions.

However, if you're doing a drastic coalesce, e.g. to numPartitions = 1, this may result in your computation taking place on fewer nodes than you like (e.g. one node in the case of numPartitions = 1). To avoid this, you can pass shuffle = true. This will add a shuffle step, but means the current upstream partitions will be executed in parallel (per whatever the current partitioning is).

Note: With shuffle = true, you can actually coalesce to a larger number of partitions. This is useful if you have a small number of partitions, say 100, potentially with a few partitions being abnormally large. Calling coalesce(1000, shuffle = true) will result in 1000 partitions with the data distributed using a hash partitioner.

----------
scala> val rdd=sc.parallelize(List(1,2,3,4,5,6,7,8),4)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> rdd.glom.collect
res2: Array[Array[Int]] = Array(Array(1, 2), Array(3, 4), Array(5, 6), Array(7, 8))

scala> val newrdd=rdd.coalesce(2,false)
newrdd: org.apache.spark.rdd.RDD[Int] = CoalescedRDD[3] at coalesce at <console>:26

scala> newrdd.glom.collect
res3: Array[Array[Int]] = Array(Array(1, 2, 3, 4), Array(5, 6, 7, 8))

coalesce repartitions the RDD and returns a new RDD. Note that with the default shuffle = false, requesting more partitions than the RDD already has leaves the partitioning unchanged. Each line below is one partition of the rdd above, regrouped by coalesce(2, false).
1,2 => \
          1,2,3,4
3,4 => /

5,6 => \
          5,6,7,8
7,8 => /
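The shuffle flag matters when you try to increase the partition count. A minimal sketch (plain Scala rather than a captured REPL session):

// With the default shuffle = false, asking for more partitions than the RDD
// already has changes nothing; shuffle = true redistributes the data.
val rdd8 = sc.parallelize(1 to 8, 4)
rdd8.coalesce(8, shuffle = false).partitions.length   // still 4
rdd8.coalesce(8, shuffle = true).partitions.length    // 8
rdd8.coalesce(1, shuffle = true).partitions.length    // 1, but upstream work still runs in parallel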






distinct - deduplicate
def distinct(): RDD[T]
def distinct(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T]
Return a new RDD containing the distinct elements in this RDD.

----------
scala> val rdd=sc.parallelize(List(1,1,1,1,2,2,2,3,3,4),2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[5] at parallelize at <console>:24

scala> val rdd1=rdd.distinct
rdd1: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[14] at distinct at <console>:26

scala> rdd1.glom.collect
res6: Array[Array[Int]] = Array(Array(4, 2), Array(1, 3))

scala> val rdd2=rdd.distinct(3)
rdd2: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[18] at distinct at <console>:26

scala> rdd2.glom.collect
res7: Array[Array[Int]] = Array(Array(3), Array(4, 1), Array(2))

distinct removes duplicates from the RDD, producing an RDD with no repeated elements. If numPartitions is set, the result is also repartitioned. Note that distinct first deduplicates within each partition and then across partitions, which can trigger a shuffle. Each line below is one partition of the rdd above, transformed by distinct.

1,1,1,1,2 => 1,2     => \
                                     1,2,3,4
2,2,3,3,4 => 2,3,4  => /


1,1,1,1,2 => 1,2     => 3
                              => 4,1
2,2,3,3,4 => 2,3,4  => 2
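Under the hood, distinct is essentially a reduceByKey, which is why a shuffle is involved. A rough hand-written equivalent (a sketch, not Spark's exact source):

// Key each element by itself and reduce per key; duplicates collapse to one entry.
val dup = sc.parallelize(List(1,1,1,1,2,2,2,3,3,4), 2)
val deduped = dup.map(x => (x, null)).reduceByKey((a, _) => a).map(_._1)
deduped.collect   // same elements as dup.distinct.collect, ordering not guaranteed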




filter - filter elements
def filter(f: (T) ⇒ Boolean): RDD[T]
Return a new RDD containing only the elements that satisfy a predicate.

----------
scala> val rdd=sc.parallelize(0 to 9,2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[20] at parallelize at <console>:24

scala> val filterrdd=rdd.filter(_<5)
filterrdd: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[21] at filter at <console>:26

scala> filterrdd.collect
res3: Array[Int] = Array(0, 1, 2, 3, 4)

scala> filterrdd.glom.collect
res4: Array[Array[Int]] = Array(Array(0, 1, 2, 3, 4), Array())

filter applies the predicate to every element and keeps those for which it returns true. Each line below is one partition of the rdd above, transformed by filter.

0,1,2,3,4 => 0,1,2,3,4
5,6,7,8,9 => 

filter is often used for data cleaning. A related technique is the collect transformation with a partial function (a function that need not be defined for every input), which extracts the elements of one type from a mixed-type RDD. For example:

scala> val rdd=sc.parallelize(Array(1,2,"a","b","c"),3) //rdd mixes Int and String elements, split into 3 partitions
rdd: org.apache.spark.rdd.RDD[Any] = ParallelCollectionRDD[17] at parallelize at <console>:24

scala>  rdd.glom.collect //contents of each partition
res8: Array[Array[Any]] = Array(Array(1), Array(2, a), Array(b, c))

scala> val newrdd=rdd.collect({case x:Int=>x}) //extract the Int elements
newrdd: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[20] at collect at <console>:26

scala> newrdd.collect
res9: Array[Int] = Array(1, 2)

scala> newrdd.glom.collect
res10: Array[Array[Int]] = Array(Array(1), Array(2), Array())

1        => 1
2,a     => 2
b,c     => ()
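The same cleaning step can also be written with filter; a sketch that keeps only the Int elements of the mixed RDD:

// isInstanceOf/asInstanceOf version of the collect(PartialFunction) example above.
val mixed = sc.parallelize(Array[Any](1, 2, "a", "b", "c"), 3)
val ints = mixed.filter(_.isInstanceOf[Int]).map(_.asInstanceOf[Int])
ints.collect   // Array(1, 2)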



flatMap - transform and flatten
def flatMap[U](f: (T) ⇒ TraversableOnce[U])(implicit arg0: ClassTag[U]): RDD[U]
Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.

----------
scala> val rdd=sc.parallelize(0 to 5,1)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[22] at parallelize at <console>:24

scala> val flatmaprdd=rdd.flatMap(x=>0 to x)
flatmaprdd: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[23] at flatMap at <console>:26

scala> flatmaprdd.collect
res9: Array[Int] = Array(0, 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5)

Each line below is one partition of the rdd above, transformed by flatMap.

0,1,2,3,4,5 => 0, 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5
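A typical practical use of flatMap is splitting lines into words; a sketch with made-up input:

val lines = sc.parallelize(List("hello spark", "hello scala"), 1)
lines.flatMap(_.split(" ")).collect   // Array(hello, spark, hello, scala)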




pipe - pipe through an external command
def pipe(command: String): RDD[String]
def pipe(command: String, env: Map[String, String]): RDD[String]
def pipe(command: Seq[String], env: Map[String, String] = Map(), printPipeContext: (String => Unit) => Unit = null, printRDDElement: (T, String => Unit) => Unit = null): RDD[String]

Return an RDD created by piping elements to a forked external process. The resulting RDD is computed by executing the given process once per partition. All elements of each input partition are written to a process's stdin as lines of input separated by a newline. The resulting partition consists of the process's stdout output, with each line of stdout resulting in one element of the output partition. A process is invoked even for empty partitions.

The print behavior can be customized by providing two functions.
command     command to run in forked process.
env     environment variables to set.
printPipeContext     Before piping elements, this function is called as an opportunity to pipe context data. Print line function (like out.println) will be passed as printPipeContext's parameter.
printRDDElement     Use this function to customize how to pipe elements. This function will be called with each RDD element as the 1st parameter, and the print line function (like out.println()) as the 2nd parameter. An example of pipe the RDD data of groupBy() in a streaming way, instead of constructing a huge String to concat all the elements: def printRDDElement(record:(String, Seq[String]), f:String=>Unit) = for (e <- record._2) {f(e)}
separateWorkingDir     Use separate working directories for each task.
bufferSize     Buffer size for the stdin writer for the piped process.
encoding     Char encoding used for interacting (via stdin, stdout and stderr) with the piped process
returns     the result RDD

----------
scala> val rdd=sc.parallelize(0 to 7,4)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[24] at parallelize at <console>:24

scala> rdd.glom.collect
res10: Array[Array[Int]] = Array(Array(0, 1), Array(2, 3), Array(4, 5), Array(6, 7))

scala> rdd.pipe("head -n 1").collect
res11: Array[String] = Array(0, 2, 4, 6)
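Another sketch, assuming the wc command is available on every worker: piping each partition through "wc -l" counts the elements per partition, since every element is written to the process's stdin as one line.

val nums8 = sc.parallelize(0 to 7, 4)
nums8.pipe("wc -l").collect   // one line count per partition (each partition holds 2 elements here)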





sample - sampling
def sample(withReplacement: Boolean, fraction: Double, seed: Long = Utils.random.nextLong): RDD[T]
Return a sampled subset of this RDD.

withReplacement     can elements be sampled multiple times (replaced when sampled out)
fraction     expected size of the sample as a fraction of this RDD's size without replacement: probability that each element is chosen; fraction must be [0, 1] with replacement: expected number of times each element is chosen; fraction must be >= 0
seed     seed for the random number generator

----------
sample randomly selects elements of the RDD to form a new RDD. withReplacement: whether to sample with replacement (true) or without (false); fraction: the sampling fraction; seed: the random number generator seed.

scala> val rdd=sc.parallelize(0 to 9,1)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[27] at parallelize at <console>:24

scala> rdd.sample(false,0.5).collect
res12: Array[Int] = Array(0, 1, 3, 5, 6, 7, 8, 9)

scala> rdd.sample(false,0.5).collect
res13: Array[Int] = Array(0, 2, 4, 6, 8)

scala> rdd.sample(false,0.5).collect
res14: Array[Int] = Array(2, 3, 6)

scala> rdd.sample(false,0.5).collect
res15: Array[Int] = Array(0, 2, 4, 6, 9)

scala> rdd.sample(false,0.8).collect
res16: Array[Int] = Array(0, 1, 2, 3, 6, 7, 8)

scala> rdd.sample(true,0.5).collect
res17: Array[Int] = Array(0, 3)

scala> rdd.sample(true,0.5).collect
res18: Array[Int] = Array(2, 3, 4, 8, 8)
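The results above differ from run to run because no seed was fixed. A sketch showing that a fixed seed makes the sample reproducible:

val digits = sc.parallelize(0 to 9, 1)
digits.sample(withReplacement = false, fraction = 0.5, seed = 42).collect
digits.sample(withReplacement = false, fraction = 0.5, seed = 42).collect  // identical to the previous call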




sortBy - sort
def sortBy[K](f: (T) ⇒ K, ascending: Boolean = true, numPartitions: Int = this.partitions.length)(implicit ord: Ordering[K], ctag: ClassTag[K]): RDD[T]
Return this RDD sorted by the given key function.

----------
sortBy sorts the RDD elements and returns the sorted RDD. ascending=true sorts in ascending order; ascending=false sorts in descending order.

scala> val rdd=sc.parallelize(List(2,1,4,3),2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[35] at parallelize at <console>:24

scala> rdd.sortBy(x=>x,true).collect
res19: Array[Int] = Array(1, 2, 3, 4)

scala> rdd.sortBy(x=>x,false).collect
res20: Array[Int] = Array(4, 3, 2, 1)

scala> val z=sc.parallelize(Array(("h",10),("a",2),("c",3)),2)
z: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[16] at parallelize at <console>:24

scala> z.glom.collect
res18: Array[Array[(String, Int)]] = Array(Array((h,10)), Array((a,2), (c,3)))

scala> z.sortBy(x=>x._1,true).glom.collect
res20: Array[Array[(String, Int)]] = Array(Array((a,2), (c,3)), Array((h,10)))

(h,10)          => (h,10)          => (a,2),(c,3)
(a,2),(c,3)    => (a,2),(c,3)     => (h,10)




cartesian - Cartesian product
def cartesian[U](other: RDD[U])(implicit arg0: ClassTag[U]): RDD[(T, U)]
Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of elements (a, b) where a is in this and b is in other.

----------
scala> val rdd1=sc.parallelize(List("a","b","c","d"),1)
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[46] at parallelize at <console>:24

scala> val rdd2=sc.parallelize(List(1,2,3),1)
rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[47] at parallelize at <console>:24

scala> rdd1.cartesian(rdd2).collect
res21: Array[(String, Int)] = Array((a,1), (a,2), (a,3), (b,1), (b,2), (b,3), (c,1), (c,2), (c,3), (d,1), (d,2), (d,3))

scala> val rdd3=rdd1.cartesian(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, Int)] = CartesianRDD[49] at cartesian at <console>:28

scala> rdd3.glom.collect
res22: Array[Array[(String, Int)]] = Array(Array((a,1), (a,2), (a,3), (b,1), (b,2), (b,3), (c,1), (c,2), (c,3), (d,1), (d,2), (d,3)))

scala> rdd3.partitions.length
res23: Int = 1

cartesian computes the Cartesian product of two RDDs: every element of one RDD is paired with every element of the other. This operator can easily blow up memory.

a,b,c,d  \         (a,1)
               =>    (a,2)  
1,2,3     /         (a,3)
                       (b,1)
                       …...



intersection - set intersection
def intersection(other: RDD[T]): RDD[T]
def intersection(other: RDD[T], numPartitions: Int): RDD[T]
def intersection(other: RDD[T], partitioner: Partitioner)(implicit ord: Ordering[T] = null): RDD[T]

Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did. Performs a hash partition across the cluster. Note that this method performs a shuffle internally. 

numPartitions     How many partitions to use in the resulting RDD
partitioner Partitioner to use for the resulting RDD

----------
scala> val rdd1=sc.parallelize(List(1,2,3,4),1)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[51] at parallelize at <console>:24

scala>  val rdd2=sc.parallelize(List(2,3,6,7),2)
rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[52] at parallelize at <console>:24

scala> rdd1.intersection(rdd2).collect
res24: Array[Int] = Array(2, 3)

Each line below is one partition of rdd1/rdd2 above; intersection computes the intersection A ∩ B.

1,2,3,4    => \
                         2
                         3
2,3          =>      
6,7          => /




subtract - set difference
def subtract(other: RDD[T]): RDD[T]
def subtract(other: RDD[T], numPartitions: Int): RDD[T]
def subtract(other: RDD[T], p: Partitioner)(implicit ord: Ordering[T] = null): RDD[T]

Return an RDD with the elements from this that are not in other.
Uses this partitioner/partition size, because even if other is huge, the resulting RDD will be <= us.

----------
scala> val rdd1=sc.parallelize(List(1,2,3,4),1)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[51] at parallelize at <console>:24

scala>  val rdd2=sc.parallelize(List(2,3,6,7),2)
rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[52] at parallelize at <console>:24

scala> rdd1.subtract(rdd2).collect
res25: Array[Int] = Array(1, 4)

Each line below is one partition of rdd1/rdd2 above; subtract computes the set difference A - B.

1,2,3,4     => \
                         1,4
2,3          =>
6,7          => /




union - set union
def ++
def union(other: RDD[T]): RDD[T]

Return the union of this RDD and another one. Any identical elements will appear multiple times (use .distinct() to eliminate them).

----------
scala> val rdd1=sc.parallelize(List(1,2,3,4),1)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[51] at parallelize at <console>:24

scala>  val rdd2=sc.parallelize(List(2,3,6,7),2)
rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[52] at parallelize at <console>:24

scala> rdd1.union(rdd2).collect
res26: Array[Int] = Array(1, 2, 3, 4, 2, 3, 6, 7)

scala> rdd1.union(rdd2).glom.collect
res27: Array[Array[Int]] = Array(Array(1, 2, 3, 4), Array(2, 3), Array(6, 7))

scala> (rdd1++rdd2).collect
res28: Array[Int] = Array(1, 2, 3, 4, 2, 3, 6, 7)

Each line below is one partition of rdd1/rdd2 above; union computes the union A ∪ B.

1,2,3,4     => \  1,2,3,4
                         
2,3          =>      2,3    
6,7          => /    6,7





zip - zip two RDDs
def zip[U](other: RDD[U])(implicit arg0: ClassTag[U]): RDD[(T, U)]

Zips this RDD with another one, returning key-value pairs with the first element in each RDD, second element in each RDD, etc. Assumes that the two RDDs have the *same number of partitions* and the *same number of elements in each partition* (e.g. one was made through a map on the other).

----------
scala> val rdd1=sc.parallelize(List(1,2,3,4),1)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[51] at parallelize at <console>:24

scala>  val rdd2=sc.parallelize(List(2,3,6,7),2)
rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[52] at parallelize at <console>:24

scala> rdd1.zip(rdd2).glom.collect //rdd1 and rdd2 have different numbers of partitions, so zip fails
java.lang.IllegalArgumentException: Can't zip RDDs with unequal numbers of partitions: List(1, 2)
  at org.apache.spark.rdd.ZippedPartitionsBaseRDD.getPartitions(ZippedPartitionsRDD.scala:57)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
  at scala.Option.getOrElse(Option.scala:121)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
  at scala.Option.getOrElse(Option.scala:121)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1911)
  at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:893)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
  at org.apache.spark.rdd.RDD.collect(RDD.scala:892)
  ... 48 elided

scala> val rdd3=sc.parallelize(List(2,3,6,7),1)
rdd3: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[70] at parallelize at <console>:24

scala> val rdd4=rdd1.zip(rdd3)
rdd4: org.apache.spark.rdd.RDD[(Int, Int)] = ZippedPartitionsRDD2[71] at zip at <console>:28

scala> rdd4.glom.collect
res35: Array[Array[(Int, Int)]] = Array(Array((1,2), (2,3), (3,6), (4,7)))

scala> rdd4.collect
res36: Array[(Int, Int)] = Array((1,2), (2,3), (3,6), (4,7))


Below, rdd1 and rdd3 are combined by zip into a key/value RDD.

                           (1,2)
1,2,3,4     => \    (2,3)
 2,3,6,7     => /   (3,6)
                           (4,7)
                         




map - create a key/value RDD
def map[U: ClassTag](f: T => U): RDD[U]
Applies a transformation function on each item of the RDD and returns the result as a new RDD.

----------
scala> val words=sc.parallelize(List("apple","banana","berry","cherry","cumquat","haw"),1)
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[78] at parallelize at <console>:24

scala> words.collect
res44: Array[String] = Array(apple, banana, berry, cherry, cumquat, haw)

scala> val pairs=words.map(x=>(x(0),x))
pairs: org.apache.spark.rdd.RDD[(Char, String)] = MapPartitionsRDD[79] at map at <console>:26


Below, words is transformed by map.

apple, banana, berry, cherry, cumquat, haw     =>     (a,apple), (b,banana), (b,berry), (c,cherry), (c,cumquat), (h,haw)




keyBy - create a key/value RDD
def keyBy[K](f: (T) ⇒ K): RDD[(K, T)]
Creates tuples of the elements in this RDD by applying f. The result of the function becomes the key and the original data item becomes the value of the newly created tuples.

----------
scala> val words=sc.parallelize(List("apple","banana","berry","cherry","cumquat","haw"),1)
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[80] at parallelize at <console>:24

scala> words.collect
res51: Array[String] = Array(apple, banana, berry, cherry, cumquat, haw)

scala> val pairs=words.keyBy(_.length)
pairs: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[81] at keyBy at <console>:26

scala> pairs.collect
res52: Array[(Int, String)] = Array((5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw))

Below, words is transformed by keyBy.

apple, banana, berry, cherry, cumquat, haw     =>     (5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw)




combineByKey - aggregate by key
def combineByKey[C](createCombiner: (V) ⇒ C, mergeValue: (C, V) ⇒ C, mergeCombiners: (C, C) ⇒ C): RDD[(K, C)]
def combineByKey[C](createCombiner: (V) ⇒ C, mergeValue: (C, V) ⇒ C, mergeCombiners: (C, C) ⇒ C, numPartitions: Int): RDD[(K, C)]
def combineByKey[C](createCombiner: (V) ⇒ C, mergeValue: (C, V) ⇒ C, mergeCombiners: (C, C) ⇒ C, partitioner: Partitioner, mapSideCombine: Boolean = true, serializer: Serializer = null): RDD[(K, C)]

Very efficient implementation that combines the values of a RDD consisting of two-component tuples by applying multiple aggregators one after another.

----------
scala> val pair=sc.parallelize(List(("fruit","apple"),("fruit","banana"),("vegetable","cucumber"),("fruit","cherry"),("vegetable","bean"),("vegetable","pepper")),2)
pair: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[82] at parallelize at <console>:24

scala> val combinedPair=pair.combineByKey(List(_),(x:List[String],y:String)=>y::x,(x:List[String],y:List[String])=>x:::y)
combinedPair: org.apache.spark.rdd.RDD[(String, List[String])] = ShuffledRDD[83] at combineByKey at <console>:26

scala> combinedPair.collect
res54: Array[(String, List[String])] = Array((fruit,List(banana, apple, cherry)), (vegetable,List(cucumber, pepper, bean)))

Below are the two partitions of pair, transformed by combineByKey.

(fruit,apple),(fruit,banana),(vegetable,cucumber)      =>  (fruit,List(banana, apple, cherry))
(fruit,cherry),(vegetable,bean),(vegetable,pepper)     =>  (vegetable,List(cucumber, pepper, bean))

Note: the four Scala list operators used above are:
1) :: is called cons. It prepends an element to the head of a list and returns a new list. Usage: x::list, where x becomes the first element of the new list whether or not x is itself a list, so the new list has length list.length + 1 (btw, x::list is equivalent to list.::(x)).
2) :+ and +: differ in that :+ appends an element at the tail, while +: prepends an element at the head, much like ::. For Lists, :: is the form conventionally used in pattern matching. To keep +: and :+ straight, remember that the colon always sits next to the collection.
3) ++ concatenates two collections, e.g. list1++list2.
4) ::: also concatenates two collections, but only works on List.

scala> "A"::"B"::Nil
res0: List[String] = List(A, B)

scala> "A"+:"B"+:Nil
res1: List[String] = List(A, B)

scala> Nil:+"A":+"B"
res2: List[String] = List(A, B)

scala> res0 ++ res1
res3: List[String] = List(A, B, A, B)

scala> res0 ::: res1
res4: List[String] = List(A, B, A, B)

scala> res0 :: res1
res5: List[java.io.Serializable] = List(List(A, B), A, B)
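A classic combineByKey pattern not shown above is a per-key average, where the combiner type differs from the value type. A sketch with made-up data:

// createCombiner builds a (sum, count) pair, mergeValue folds a value into it,
// and mergeCombiners merges pairs coming from different partitions.
val scores = sc.parallelize(List(("a", 1.0), ("a", 3.0), ("b", 4.0)), 2)
val avg = scores.combineByKey(
    (v: Double) => (v, 1),
    (acc: (Double, Int), v: Double) => (acc._1 + v, acc._2 + 1),
    (a: (Double, Int), b: (Double, Int)) => (a._1 + b._1, a._2 + b._2))
  .mapValues { case (sum, count) => sum / count }
avg.collect   // (a,2.0) and (b,4.0); ordering may vary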







flatMapValues - flatMap over the values
def flatMapValues[U](f: (V) ⇒ TraversableOnce[U]): RDD[(K, U)]
Pass each value in the key-value pair RDD through a flatMap function without changing the keys; this also retains the original RDD's partitioning.

----------
scala> val rdd=sc.parallelize(List("a","boy"),1).keyBy(_.length)
rdd: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[85] at keyBy at <console>:24

scala> rdd.glom.collect
res55: Array[Array[(Int, String)]] = Array(Array((1,a), (3,boy)))

scala> rdd.flatMapValues(x=>"*"+x+"*").collect
res57: Array[(Int, Char)] = Array((1,*), (1,a), (1,*), (3,*), (3,b), (3,o), (3,y), (3,*))

Below is the single partition of rdd, transformed by flatMapValues.

a,boy => (1,a), (3,boy) => (1,*), (1,a), (1,*), (3,*), (3,b), (3,o), (3,y), (3,*)





groupByKey - group values by key
def groupByKey(): RDD[(K, Iterable[V])]
def groupByKey(numPartitions: Int): RDD[(K, Iterable[V])]
def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])]

Group the values for each key in the RDD into a single sequence. Hash-partitions the resulting RDD with the existing partitioner/parallelism level, with numPartitions partitions, or with the given Partitioner, respectively. The ordering of elements within each group is not guaranteed, and may even differ each time the resulting RDD is evaluated.

Note: This operation may be very expensive. If you are grouping in order to perform an aggregation (such as a sum or average) over each key, using PairRDDFunctions.aggregateByKey or PairRDDFunctions.reduceByKey will provide much better performance.
As currently implemented, groupByKey must be able to hold all the key-value pairs for any key in memory. If a key has too many values, it can result in an OutOfMemoryError.

----------
scala> val pairs=sc.parallelize(List(("fruit","apple"),("vegetable","cucumber"),("fruit","cherry"),("vegetable","bean"),("fruit","banana"),("vegetable","pepper")),2)
pairs: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[88] at parallelize at <console>:24

scala> pairs.groupByKey.collect
res58: Array[(String, Iterable[String])] = Array((fruit,CompactBuffer(apple, cherry, banana)), (vegetable,CompactBuffer(cucumber, bean, pepper)))

Below are the two partitions of pairs, transformed by groupByKey.

(fruit,apple),(vegetable,cucumber),(fruit,cherry)      =>  (fruit,CompactBuffer(apple, cherry, banana))
(vegetable,bean),(fruit,banana),(vegetable,pepper)     =>  (vegetable,CompactBuffer(cucumber, bean, pepper))
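As the note in the API description above suggests, per-key aggregations are usually better expressed with reduceByKey, which combines values on the map side before shuffling. A sketch:

val tagged = sc.parallelize(List(("fruit", 1), ("vegetable", 1), ("fruit", 1)), 2)
tagged.reduceByKey(_ + _).collect   // counts per key without materializing whole groups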





keys - extract the keys
def keys: RDD[K]
Return an RDD with the keys of each tuple.

----------
scala> val pairs=sc.parallelize(List("apple","banana","berry","cherry","cumquat","haw"),1).keyBy(_.length)
pairs: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[91] at keyBy at <console>:24

scala> pairs.keys.collect
res59: Array[Int] = Array(5, 6, 5, 6, 7, 3)

scala> pairs.collect
res60: Array[(Int, String)] = Array((5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw))

Below is the partition of pairs, transformed by keys.

(5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw)     =>     5,6,5,6,7,3




mapValues - transform the values
def mapValues[U](f: (V) ⇒ U): RDD[(K, U)]
Pass each value in the key-value pair RDD through a map function without changing the keys; this also retains the original RDD's partitioning.

----------
scala> val pairs=sc.parallelize(List("apple","banana","berry","cherry","cumquat","haw"),1).keyBy(_.length)
pairs: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[91] at keyBy at <console>:24

scala> pairs.mapValues(v=>v+" "+v(0)).collect
res61: Array[(Int, String)] = Array((5,apple a), (6,banana b), (5,berry b), (6,cherry c), (7,cumquat c), (3,haw h))

Below is the partition of pairs, transformed by mapValues.

(5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw)     =>     (5,apple a), (6,banana b), (5,berry b), (6,cherry c), (7,cumquat c), (3,haw h)





partitionBy - repartition by key
def partitionBy(partitioner: Partitioner): RDD[(K, V)]
Return a copy of the RDD partitioned using the specified partitioner.

----------
scala>  val pairs=sc.parallelize(0 to 9,2).keyBy(x=>x)
pairs: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[106] at keyBy at <console>:24

scala> pairs.glom.collect
res69: Array[Array[(Int, Int)]] = Array(Array((0,0), (1,1), (2,2), (3,3), (4,4)), Array((5,5), (6,6), (7,7), (8,8), (9,9)))

scala> import org.apache.spark.HashPartitioner
import org.apache.spark.HashPartitioner

scala> val partitionedPairs=pairs.partitionBy(new HashPartitioner(2))
partitionedPairs: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[108] at partitionBy at <console>:27

scala> partitionedPairs.glom.collect
res70: Array[Array[(Int, Int)]] = Array(Array((0,0), (2,2), (4,4), (6,6), (8,8)), Array((1,1), (3,3), (5,5), (7,7), (9,9)))





reduceByKey - reduce values by key
def reduceByKey(func: (V, V) ⇒ V): RDD[(K, V)]
def reduceByKey(func: (V, V) ⇒ V, numPartitions: Int): RDD[(K, V)]
def reduceByKey(partitioner: Partitioner, func: (V, V) ⇒ V): RDD[(K, V)]

Merge the values for each key using an associative and commutative reduce function. This will also perform the merging locally on each mapper before sending results to a reducer, similarly to a "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/parallelism level, or hash-partitioned with numPartitions partitions. Please note that any function f you provide should be commutative in order to generate reproducible results.

----------
scala> val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
a: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[116] at parallelize at <console>:24

scala> val b = a.map(x => (x.length, x))
b: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[117] at map at <console>:26

scala> b.reduceByKey(_ + _).collect
res54: Array[(Int, String)] = Array((4,lion), (3,dogcat), (7,panther), (5,tigereagle))

Below are the two partitions of a/b, transformed by reduceByKey.

dog,tiger,lion          =>     (3,dog),(5,tiger),(4,lion)          =>\
                                                                                              (4,lion),(3,dogcat),(7,panther),(5,tigereagle)
cat,panther,eagle   =>     (3,cat),(7,panther),(5,eagle)   =>/  





sortByKey - sort by key
def sortByKey(ascending: Boolean = true, numPartitions: Int = self.partitions.size): RDD[P]

This function sorts the input RDD's data and stores it in a new RDD. The output RDD is a shuffled RDD because it stores data that is output by a reducer which has been shuffled. The implementation of this function is actually very clever. First, it uses a range partitioner to partition the data in ranges within the shuffled RDD. Then it sorts these ranges individually with mapPartitions using standard sort mechanisms.

----------
scala> val pairs=sc.parallelize(List("apple","banana","berry","cherry","cumquat","haw"),1).keyBy(_.length)
pairs: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[91] at keyBy at <console>:24

scala> pairs.sortByKey(true).collect
res64: Array[(Int, String)] = Array((3,haw), (5,apple), (5,berry), (6,banana), (6,cherry), (7,cumquat))

scala> pairs.sortByKey(false,2).glom.collect
res65: Array[Array[(Int, String)]] = Array(Array((7,cumquat), (6,banana), (6,cherry)), Array((5,apple), (5,berry), (3,haw)))


Below is the partition of pairs, transformed by sortByKey in descending order into 2 output partitions.

(5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw)   =>   (7,cumquat), (6,banana), (6,cherry)
                                                                          (5,apple), (5,berry), (3,haw)





values - extract the values
def values: RDD[V]
Return an RDD with the values of each tuple.

----------
scala>  val pairs=sc.parallelize(0 to 9,2).keyBy(x=>x)
pairs: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[106] at keyBy at <console>:24

scala> pairs.values.collect
res72: Array[Int] = Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)




cogroup - group multiple RDDs by key
def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))]
def cogroup[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Iterable[V], Iterable[W]))]
def cogroup[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (Iterable[V], Iterable[W]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)]): RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)], numPartitions: Int): RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)], partitioner: Partitioner): RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]

A very powerful set of functions that allow grouping up to 3 key-value RDDs together using their keys.

----------
scala> val a=sc.parallelize(List(1,2,1,3),1)
a: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[126] at parallelize at <console>:24

scala> val b=a.map((_,"b"))
b: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[127] at map at <console>:26

scala> val c=a.map((_,"c"))
c: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[128] at map at <console>:26

scala> b.collect
res58: Array[(Int, String)] = Array((1,b), (2,b), (1,b), (3,b))

scala> c.collect
res59: Array[(Int, String)] = Array((1,c), (2,c), (1,c), (3,c))

scala> b.cogroup(c).collect
res60: Array[(Int, (Iterable[String], Iterable[String]))] = Array((1,(CompactBuffer(b, b),CompactBuffer(c, c))), (3,(CompactBuffer(b),CompactBuffer(c))), (2,(CompactBuffer(b),CompactBuffer(c))))

scala> val d=a.map((_,"d"))
d: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[131] at map at <console>:26

scala> b.cogroup(c,d).collect
res61: Array[(Int, (Iterable[String], Iterable[String], Iterable[String]))] = Array((1,(CompactBuffer(b, b),CompactBuffer(c, c),CompactBuffer(d, d))), (3,(CompactBuffer(b),CompactBuffer(c),CompactBuffer(d))), (2,(CompactBuffer(b),CompactBuffer(c),CompactBuffer(d))))





join - join by key
def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
def join[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (V, W))]
def join[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, W))]

Return an RDD containing all pairs of elements with matching keys in this and other. Each pair of elements will be returned as a (k, (v1, v2)) tuple, where (k, v1) is in this and (k, v2) is in other. 

----------
scala>  val pair1=sc.parallelize(List((1,"a"),(2,"b"),(3,"c")),2)
pair1: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[111] at parallelize at <console>:25

scala> val pair2=sc.parallelize(List((1,"apple"),(2,"banana")),2)
pair2: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[112] at parallelize at <console>:25

scala> pair1.join(pair2,1).collect
res75: Array[(Int, (String, String))] = Array((1,(a,apple)), (2,(b,banana)))

Below are the partitions of pair1/pair2, transformed by join.

(1,a)
(2,b),(3,c)
                   =>   (1,(a,apple)), (2,(b,banana))
(1,apple)
(2,banana)





leftOuterJoin - left outer join by key
def leftOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))]
def leftOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (V, Option[W]))]
def leftOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, Option[W]))]

Performs a left outer join using two key-value RDDs. Please note that the keys must be generally comparable to make this work correctly.

----------
scala> val a=sc.parallelize(List("a","boy","cat"),1).keyBy(_.length)
a: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[120] at keyBy at <console>:25

scala> val b=sc.parallelize(List("boy","cat","dragon"),1).keyBy(_.length)
b: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[122] at keyBy at <console>:25

scala> a.collect
res76: Array[(Int, String)] = Array((1,a), (3,boy), (3,cat))

scala> b.collect
res77: Array[(Int, String)] = Array((3,boy), (3,cat), (6,dragon))

scala> a.leftOuterJoin(b).collect
res79: Array[(Int, (String, Option[String]))] = Array((1,(a,None)), (3,(boy,Some(boy))), (3,(boy,Some(cat))), (3,(cat,Some(boy))), (3,(cat,Some(cat))))

Below are the partitions of a/b, transformed by leftOuterJoin.

(1,a),(3,boy),(3,cat) 
                                        =>  (1,(a,None)), (3,(boy,Some(boy))), (3,(boy,Some(cat))), (3,(cat,Some(boy))), (3,(cat,Some(cat)))
(3,boy),(3,cat),(6,dragon)




rightOuterJoin - right outer join by key
def rightOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (Option[V], W))]
def rightOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Option[V], W))]
def rightOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (Option[V], W))]

Performs a right outer join using two key-value RDDs. Please note that the keys must be generally comparable to make this work correctly.

----------
scala> val a=sc.parallelize(List("a","boy","cat"),1).keyBy(_.length)
a: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[120] at keyBy at <console>:25

scala> val b=sc.parallelize(List("boy","cat","dragon"),1).keyBy(_.length)
b: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[122] at keyBy at <console>:25

scala> a.rightOuterJoin(b).collect
res81: Array[(Int, (Option[String], String))] = Array((6,(None,dragon)), (3,(Some(boy),boy)), (3,(Some(boy),cat)), (3,(Some(cat),boy)), (3,(Some(cat),cat)))

Below are the partitions of a/b, transformed by rightOuterJoin.

(1,a),(3,boy),(3,cat) 
                                        =>  (6,(None,dragon)), (3,(Some(boy),boy)), (3,(Some(boy),cat)), (3,(Some(cat),boy)), (3,(Some(cat),cat))
(3,boy),(3,cat),(6,dragon)





subtractByKey - set difference by key
def subtractByKey[W: ClassTag](other: RDD[(K, W)]): RDD[(K, V)]
def subtractByKey[W: ClassTag](other: RDD[(K, W)], numPartitions: Int): RDD[(K, V)]
def subtractByKey[W: ClassTag](other: RDD[(K, W)], p: Partitioner): RDD[(K, V)]

Very similar to subtract, but instead of supplying a function, the key-component of each pair will be automatically used as criterion for removing items from the first RDD.

----------
scala> val a=sc.parallelize(List("a","boy","cat"),1).keyBy(_.length)
a: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[120] at keyBy at <console>:25

scala> val b=sc.parallelize(List("boy","cat","dragon"),1).keyBy(_.length)
b: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[122] at keyBy at <console>:25

scala> a.subtractByKey(b).collect
res83: Array[(Int, String)] = Array((1,a))

Below are the partitions of a/b, transformed by subtractByKey.

(1,a),(3,boy),(3,cat) 
                                        =>  (1,a)
(3,boy),(3,cat),(6,dragon)



Action operators
aggregate - aggregate the RDD
def aggregate[U](zeroValue: U)(seqOp: (U, T) ⇒ U, combOp: (U, U) ⇒ U)(implicit arg0: ClassTag[U]): U

The aggregate function allows the user to apply two different reduce functions to the RDD. 
- The first reduce function is applied within each partition to reduce the data within each partition into a single result. 
- The second reduce function is used to combine the different reduced results of all partitions together to arrive at one final result.

The ability to have two separate reduce functions for intra partition versus across partition reducing adds a lot of flexibility. For example the first reduce function can be the max function and the second one can be the sum function. The user also specifies an initial value. Here are some important facts.
1)The initial value is applied at both levels of reduce. So both at the intra partition reduction and across partition reduction.
2)Both reduce functions have to be commutative and associative.
3)Do not assume any execution order for either partition computations or combining partitions.
4)Why would one want to use two input data types? Let us assume we do an archaeological site survey using a metal detector. While walking through the site we take GPS coordinates of important findings based on the output of the metal detector. Later, we intend to draw an image of a map that highlights these locations using the aggregate function. In this case the zeroValue could be an area map with no highlights. The possibly huge set of input data is stored as GPS coordinates across many partitions. seqOp (first reducer) could convert the GPS coordinates to map coordinates and put a marker on the map at the respective position. combOp (second reducer) will receive these highlights as partial maps and combine them into a single final output map.

seqOp
     an operator used to accumulate results within a partition
combOp
     an associative operator used to combine results from different partitions

----------
scala> val rdd=sc.parallelize(List(1,2,3,4,5,6),2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[1] at parallelize at <console>:24

scala> rdd.glom.collect
res1: Array[Array[Int]] = Array(Array(1, 2, 3), Array(4, 5, 6))

scala> rdd.aggregate(0)(_+_,Math.max(_,_))
res2: Int = 15

Below are the two partitions of rdd. First the elements within each partition are summed (_+_); then the maximum over the per-partition sums is taken with Math.max(_,_).
1,2,3  =>  6
                    =>  15
4,5,6  =>  15
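aggregate can also return a type different from the element type. A common sketch is computing (sum, count) in a single pass and deriving the mean on the driver:

val nums6 = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)
val (sum, count) = nums6.aggregate((0, 0))(
  (acc, v) => (acc._1 + v, acc._2 + 1),   // seqOp: fold each element into the partition accumulator
  (a, b) => (a._1 + b._1, a._2 + b._2))   // combOp: merge partition accumulators
val mean = sum.toDouble / count           // 3.5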




collect - collect the elements
def collect(): Array[T]
def collect[U: ClassTag](f: PartialFunction[T, U]): RDD[U]
def toArray(): Array[T]

Converts the RDD into a Scala array and returns it. If you provide a standard map-function (i.e. f = T -> U) it will be applied before inserting the values into the result array. Note this method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.

----------
scala> val data=sc.parallelize(0 to 9,2)
data: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[3] at parallelize at <console>:24

scala> data.collect
res4: Array[Int] = Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)






collectAsMap - collect a key/value RDD as a Map
def collectAsMap(): Map[K, V]
Return the key-value pairs in this RDD to the master as a Map.

Warning: this doesn't return a multimap (so if you have multiple values to the same key, only one value per key is preserved in the map returned)
Note this method should only be used if the resulting data is expected to be small, as all the data is loaded into the driver's memory.

----------
scala> val pairRDD=sc.parallelize(List((1,"a"),(2,"b"),(3,"c"),(4,"d")),2)
pairRDD: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> pairRDD.collectAsMap
res0: scala.collection.Map[Int,String] = Map(2 -> b, 4 -> d, 1 -> a, 3 -> c)   





count - count the elements
def count(): Long
Return the number of elements in the RDD.

----------
scala> val pairRDD=sc.parallelize(List((1,"a"),(2,"b"),(3,"c"),(4,"d")),2)
pairRDD: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> pairRDD.count
res8: Long = 4




countByKey - count elements of a key/value RDD per key
def countByKey(): Map[K, Long]
Count the number of elements for each key, collecting the results to a local Map.

Note that this method should only be used if the resulting map is expected to be small, as the whole thing is loaded into the driver's memory. To handle very large results, consider using rdd.mapValues(_ => 1L).reduceByKey(_ + _), which returns an RDD[T, Long] instead of a map.

----------
scala> val pairRDD=sc.parallelize(List(("fruit","apple"),("fruit","banana"),("fruit","cherry"),("vegetable","bean"),("vegetable","cucumber"),("vegetable","pepper")),2)
pairRDD: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[6] at parallelize at <console>:24

scala> pairRDD.countByKey
res9: scala.collection.Map[String,Long] = Map(fruit -> 3, vegetable -> 3)





countByValue - count occurrences of each element
def countByValue()(implicit ord: Ordering[(K, V)] = null): Map[(K, V), Long]
Return the count of each unique value in this RDD as a local map of (value, count) pairs.

Returns a map that contains all unique values of the RDD and their respective occurrence counts.(Warning: This operation will finally aggregate the information in a single reducer.)

----------
This approach can be used to compute a word count, but note that it aggregates everything into a driver-side map and can run out of memory; a scalable alternative is sketched after the diagram below.

scala> val pairRDD=sc.parallelize(List(("fruit","apple"),("fruit","banana"),("fruit","cherry"),("vegetable","bean"),("vegetable","cucumber"),("vegetable","pepper")),2)
pairRDD: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[6] at parallelize at <console>:24

scala> pairRDD.countByValue
res10: scala.collection.Map[(String, String),Long] = Map((vegetable,bean) -> 1, (fruit,cherry) -> 1, (vegetable,cucumber) -> 1, (fruit,banana) -> 1, (fruit,apple) -> 1, (vegetable,pepper) -> 1)

(fruit,apple),(fruit,banana),(fruit,cherry)                =>  (fruit,apple)->1, (fruit,banana)->1, (fruit,cherry)->1
(vegetable,bean),(vegetable,cucumber),(vegetable,pepper)   =>  (vegetable,bean)->1, (vegetable,cucumber)->1, (vegetable,pepper)->1
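The scalable alternative mentioned above keeps the counts distributed instead of collecting them into a driver-side Map. A sketch:

val bigPairs = sc.parallelize(List(("fruit","apple"), ("fruit","apple"), ("fruit","banana")), 2)
bigPairs.map((_, 1L)).reduceByKey(_ + _).collect   // ((fruit,apple),2), ((fruit,banana),1); ordering may vary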





first - get the first element
def first(): T
Return the first element in this RDD.

----------
scala> val words=sc.parallelize(List("first","second","third"),1)
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[16] at parallelize at <console>:24

scala> words.first
res12: String = first



glom - return the contents of each partition
def glom(): RDD[Array[T]]
Return an RDD created by coalescing all elements within each partition into an array. Assembles an array that contains all elements of the partition and embeds it in an RDD. Each returned array contains the contents of one partition.

----------
Each second-level Array holds the contents of one partition. Note that glom produces a new RDD of arrays; collecting it returns all the data to the driver, which can cause an out-of-memory problem if the dataset is large.

scala> val num=sc.parallelize(0 to 10,4)
num: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[17] at parallelize at <console>:24

scala> num.glom.collect
res14: Array[Array[Int]] = Array(Array(0, 1), Array(2, 3, 4), Array(5, 6, 7), Array(8, 9, 10))



fold - fold the elements
def fold(zeroValue: T)(op: (T, T) ⇒ T): T

Aggregate the elements of each partition, and then the results for all the partitions, using a given associative function and a neutral "zero value". The function op(t1, t2) is allowed to modify t1 and return it as its result value to avoid object allocation; however, it should not modify t2.

This behaves somewhat differently from fold operations implemented for non-distributed collections in functional languages like Scala. This fold operation may be applied to partitions individually, and then fold those results into the final result, rather than apply the fold to each element sequentially in some defined ordering. For functions that are not commutative, the result may differ from that of a fold applied to a non-distributed collection.

zeroValue
     the initial value for the accumulated result of each partition for the op operator, and also the initial value for the combine results from different partitions for the op operator - this will typically be the neutral element (e.g. Nil for list concatenation or 0 for summation)

op
     an operator used to both accumulate results within a partition and combine results from different partitions

----------
scala> val words=sc.parallelize(List("A","B","C","D"),2)
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[19] at parallelize at <console>:24

scala> words.glom.collect
res15: Array[Array[String]] = Array(Array(A, B), Array(C, D))

scala> words.fold(" |")(_+"."+_)
res17: String = " |. |.A.B. |.C.D"

scala> words.fold(" ")(_+"."+_)
res18: String = " . .A.B. .C.D"
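With a numeric zero value fold behaves like a sum, and it also shows that the zero value is applied once per partition and once more when combining, which is why it should be a neutral element. A small numeric sketch:

val four = sc.parallelize(1 to 4, 2)
four.fold(0)(_ + _)   // 10
four.fold(1)(_ + _)   // 13: the extra 1 enters each of the 2 partitions and the final combine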




foreach - process each element
def foreach(f: (T) ⇒ Unit): Unit
Applies a function f to all elements of this RDD.

----------
Unlike map, foreach only applies the function to each element for its side effects and does not return a new RDD, whereas map returns a new RDD.

scala> val words=sc.parallelize(List("A","B","C","D"),2)
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[21] at parallelize at <console>:24

scala> words.foreach(x=>println(x+" is a letter."))
A is a letter.
C is a letter.
D is a letter.
B is a letter.




lookup - look up the values for a key
def lookup(key: K): Seq[V]

Return the list of values in the RDD for key key. This operation is done efficiently if the RDD has a known partitioner by only searching the partition that the key maps to.

----------
scala> val pairs=sc.parallelize(List("apple","banana","berry","cherry","cumquat","haw"),2).keyBy(_.length)
pairs: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[23] at keyBy at <console>:24

scala> pairs.collect
res21: Array[(Int, String)] = Array((5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw))

scala> pairs.lookup(5)
res22: Seq[String] = WrappedArray(apple, berry)




max - maximum
def max()(implicit ord: Ordering[T]): T

Returns the max of this RDD as defined by the implicit Ordering[T]. returns the maximum element of the RDD

----------
scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.max
res23: Int = 9




min - minimum
def min()(implicit ord: Ordering[T]): T

Returns the min of this RDD as defined by the implicit Ordering[T]. returns the minimum element of the RDD

----------
scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.min
res24: Int = 0



partitions - the partitions of the RDD
final def partitions: Array[Partition]

Get the array of partitions of this RDD, taking into account whether the RDD is checkpointed or not.

----------
To query the number of partitions of an RDD, rdd.partitions.length is frequently used and very handy.
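A minimal check, using a throwaway RDD:

sc.parallelize(0 to 9, 4).partitions.length   // Int = 4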


take - get the first n elements
def take(num: Int): Array[T]

Take the first num elements of the RDD. It works by first scanning one partition, and use the results from that partition to estimate the number of additional partitions needed to satisfy the limit.
Note due to complications in the internal implementation, this method will raise an exception if called on an RDD of Nothing or Null. This method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.

----------
scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.take(4)
res25: Array[Int] = Array(0, 1, 2, 3)



takeOrdered - get the smallest n elements in order
def takeOrdered(num: Int)(implicit ord: Ordering[T]): Array[T]

Returns the first k (smallest) elements from this RDD as defined by the specified implicit Ordering[T] and maintains the ordering. This does the opposite of top. For example:
num     k, the number of elements to return
ord     the implicit ordering for T
returns     an array of top elements

Note
this method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.

----------
sc.parallelize(Seq(10, 4, 2, 12, 3)).takeOrdered(1)
// returns Array(2)

sc.parallelize(Seq(2, 3, 4, 5, 6)).takeOrdered(2)
// returns Array(2, 3)


scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.takeOrdered(3)
res26: Array[Int] = Array(0, 1, 2)



takeSample - take a random sample of n elements
def takeSample(withReplacement: Boolean, num: Int, seed: Long = Utils.random.nextLong): Array[T]

Return a fixed-size sampled subset of this RDD in an array.

withReplacement     whether sampling is done with replacement
num     size of the returned sample
seed     seed for the random number generator
returns     sample of specified size in an array

Note
this method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.

----------

scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.takeSample(false,3,5)
res28: Array[Int] = Array(4, 3, 5)



top - get the largest n elements
def top(num: Int)(implicit ord: Ordering[T]): Array[T]

Returns the top k (largest) elements from this RDD as defined by the specified implicit Ordering[T] and maintains the ordering. This does the opposite of takeOrdered. For example:
num     k, the number of top elements to return
ord     the implicit ordering for T
returns     an array of top elements

Note
this method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.

----------
sc.parallelize(Seq(10, 4, 2, 12, 3)).top(1)
// returns Array(12)

sc.parallelize(Seq(2, 3, 4, 5, 6)).top(2)
// returns Array(6, 5)

scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.top(2)
res29: Array[Int] = Array(9, 8)



saveAsObjectFile - save as a file of serialized objects
def saveAsObjectFile(path: String): Unit
Save this RDD as a SequenceFile of serialized objects.

----------
scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.saveAsObjectFile("~/obj")



saveAsTextFile - save as a text file
def saveAsTextFile(path: String, codec: Class[_ <: CompressionCodec]): Unit
Save this RDD as a compressed text file, using string representations of elements.

def saveAsTextFile(path: String): Unit
Save this RDD as a text file, using string representations of elements.

----------
scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.saveAsTextFile("~/text")



saveAsNewAPIHadoopFile - save as a Hadoop file (new API)
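The original post gives no example here. A minimal sketch, assuming Hadoop's TextOutputFormat and an illustrative output path:

// Save a key/value RDD with the new Hadoop OutputFormat API.
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat

val kv = sc.parallelize(List((1, "a"), (2, "b")), 1)
  .map { case (k, v) => (new IntWritable(k), new Text(v)) }
kv.saveAsNewAPIHadoopFile[TextOutputFormat[IntWritable, Text]]("/tmp/newapi-out")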



saveAsNewAPIHadoopDataset - save to a Hadoop dataset (new API)
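Again no example in the original. A minimal sketch that drives the output through a Hadoop Job configuration; the formats and path are illustrative choices:

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}

val job = Job.getInstance(sc.hadoopConfiguration)
job.setOutputKeyClass(classOf[IntWritable])
job.setOutputValueClass(classOf[Text])
job.setOutputFormatClass(classOf[TextOutputFormat[IntWritable, Text]])
FileOutputFormat.setOutputPath(job, new Path("/tmp/newapi-dataset-out"))

val kv = sc.parallelize(List((1, "a"), (2, "b")), 1)
  .map { case (k, v) => (new IntWritable(k), new Text(v)) }
kv.saveAsNewAPIHadoopDataset(job.getConfiguration)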





Caching operators

cache - cache the RDD
def cache(): RDD.this.type

Persist this RDD with the default storage level (MEMORY_ONLY).

----------
scala> val num=sc.parallelize(0 to 9,1)
num: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[5] at parallelize at <console>:24

scala> nums.collect
res33: Array[Int] = Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)

scala> val result=num.map(x=>x*x)
result: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[36] at map at <console>:26

scala> result.count
res35: Long = 11

scala> result.cache
res36: result.type = MapPartitionsRDD[36] at map at <console>:26

scala> result.count
res37: Long = 11
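When the cached data is no longer needed it can be released explicitly; a one-line sketch:

result.unpersist()   // drop the cached blocks; the RDD can still be recomputed from its lineage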



checkpoint - checkpoint the RDD
def checkpoint(): Unit

Mark this RDD for checkpointing. It will be saved to a file inside the checkpoint directory set with SparkContext#setCheckpointDir and all references to its parent RDDs will be removed. This function must be called before any job has been executed on this RDD. It is strongly recommended that this RDD is persisted in memory, otherwise saving it on a file will require recomputation.

----------
scala> val rdd=sc.makeRDD(1 to 9,2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[37] at makeRDD at <console>:24

scala> val flatMapRDD=rdd.flatMap(x=>Seq(x,x))
flatMapRDD: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[38] at flatMap at <console>:26

scala> flatMapRDD.collect
res39: Array[Int] = Array(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9)

scala> sc.setCheckpointDir("my_checkpoint")

scala> flatMapRDD.checkpoint

scala> flatMapRDD.dependencies.head.rdd
res42: org.apache.spark.rdd.RDD[_] = ParallelCollectionRDD[37] at makeRDD at <console>:24

scala> flatMapRDD.collect
res43: Array[Int] = Array(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9)

scala> flatMapRDD.dependencies.head.rdd
res44: org.apache.spark.rdd.RDD[_] = ParallelCollectionRDD[37] at makeRDD at <console>:24
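A sketch of two RDD methods that confirm whether checkpointing has taken effect after an action has run:

flatMapRDD.isCheckpointed      // Boolean
flatMapRDD.getCheckpointFile   // Option[String] pointing into the checkpoint directory, if any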



persist - persist the RDD
def persist(newLevel: StorageLevel): RDD.this.type
Set this RDD's storage level to persist its values across operations after the first time it is computed. This can only be used to assign a new storage level if the RDD does not have a storage level set yet. Local checkpointing is an exception.

----------
scala> num.collect
res45: Array[Int] = Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

scala> num.getStorageLevel
res48: org.apache.spark.storage.StorageLevel = StorageLevel(1 replicas)

scala> num.persist()
res50: num.type = ParallelCollectionRDD[17] at parallelize at <console>:24

scala> num.getStorageLevel
res51: org.apache.spark.storage.StorageLevel = StorageLevel(memory, deserialized, 1 replicas)

scala> num.coalesce(2,false)
res52: org.apache.spark.rdd.RDD[Int] = CoalescedRDD[39] at coalesce at <console>:27

scala> num.glom.collect
res53: Array[Array[Int]] = Array(Array(0, 1), Array(2, 3, 4), Array(5, 6, 7), Array(8, 9, 10))

scala> num.persist()
res54: num.type = ParallelCollectionRDD[17] at parallelize at <console>:24

scala> num.getStorageLevel
res55: org.apache.spark.storage.StorageLevel = StorageLevel(memory, deserialized, 1 replicas)
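persist also accepts an explicit storage level; a sketch on a fresh RDD, since the level can only be set once per RDD:

import org.apache.spark.storage.StorageLevel

val big = sc.parallelize(0 to 9, 2)
big.persist(StorageLevel.MEMORY_AND_DISK_SER)   // spill serialized blocks to disk if memory is short
big.getStorageLevel                             // reflects the chosen level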





