Spark API Examples

Creation operators

SparkContext.makeRDD - create an RDD

scala> val rdd=sc.makeRDD(1 to 6,2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[137] at makeRDD at <console>:25

scala> rdd.collect
res85: Array[Int] = Array(1, 2, 3, 4, 5, 6)

scala> rdd.partitions
res86: Array[org.apache.spark.Partition] = Array(org.apache.spark.rdd.ParallelCollectionPartition@1c82, org.apache.spark.rdd.ParallelCollectionPartition@1c83)

scala> val data=Seq((1 to 6,Seq("host1,host2")),(7 to 10,Seq("host3")))
data: Seq[(scala.collection.immutable.Range.Inclusive, Seq[String])] = List((Range(1, 2, 3, 4, 5, 6),List(host1,host2)), (Range(7, 8, 9, 10),List(host3)))

scala> val rdd2=sc.makeRDD(data)
rdd2: org.apache.spark.rdd.RDD[scala.collection.immutable.Range.Inclusive] = ParallelCollectionRDD[138] at makeRDD at <console>:27

scala> rdd2.collect
res87: Array[scala.collection.immutable.Range.Inclusive] = Array(Range(1, 2, 3, 4, 5, 6), Range(7, 8, 9, 10))

scala> rdd2.preferredLocations(rdd2.partitions(0))
res89: Seq[String] = List(host1,host2)

scala> rdd2.preferredLocations(rdd2.partitions(1))
res90: Seq[String] = List(host3)





SparkContext.parallelize - create an RDD by parallelizing a collection

scala> val rdd=sc.makeRDD(1 to 6,2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[137] at makeRDD at <console>:25

scala> rdd.collect
res91: Array[Int] = Array(1, 2, 3, 4, 5, 6)

scala> rdd.partitions
res92: Array[org.apache.spark.Partition] = Array(org.apache.spark.rdd.ParallelCollectionPartition@1c82, org.apache.spark.rdd.ParallelCollectionPartition@1c83)






SparkContext.textFile - create an RDD from a text file

scala> val textFile=sc.textFile("/Users/user/Desktop/block_1.csv")
textFile: org.apache.spark.rdd.RDD[String] = /Users/user/Desktop/block_1.csv MapPartitionsRDD[144] at textFile at <console>:25

scala> textFile.count
res95: Long = 574914

scala> textFile.first()
res96: String = "id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"





SparkContext.wholeTextFiles - create an RDD from all the text files in a directory

scala>  val rdd=sc.wholeTextFiles("/Users/user/Desktop")
rdd: org.apache.spark.rdd.RDD[(String, String)] = /Users/user/Desktop MapPartitionsRDD[146] at wholeTextFiles at <console>:25

scala> rdd.count
res97: Long = 3

scala> rdd.first()
res98: (String, String) =
(file:/Users/user/Desktop/block_1.csv,""id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"
37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE
39086,47614,1,?,1,?,1,1,1,1,1,TRUE
70031,70237,1,?,1,?,1,1,1,1,1,TRUE
84795,97439,1,?,1,?,1,1,1,1,1,TRUE
36950,42116,1,?,1,1,1,1,1,1,1,TRUE
42413,48491,1,?,1,?,1,1,1,1,1,TRUE
25965,64753,1,?,1,?,1,1,1,1,1,TRUE
49451,90407,1,?,1,?,1,1,1,1,0,TRUE
39932,40902,1,?,1,?,1,1,1,1,1,TRUE
46626,47940,1,?,1,?,1,1,1,1,1,TRUE
48948,98379,1,?,1,?,1,1,1,1,1,TRUE
4767,4826,1,?,1,?,1,1,1,1,1,TRUE
45463,69659,1,?,1,?,1,1,1,1,1,TRUE
11367,13169,1,?,1,?,1,1,1,1,1,TRUE
10782,89636,1,?,1,?,1,0,1,1,1,TRUE
26206,39147,1,?,1,?,1,1,1,1,1,TRUE
16662,27083,1,1,1,?,1,1,1,...


Transformation operators

map - transform each element
def map[U](f: (T) ⇒ U)(implicit arg0: ClassTag[U]): RDD[U]
Return a new RDD by applying a function to all elements of this RDD.

----------
scala> val rdd=sc.parallelize(List(1,2,3,4,5,6,7,8),4)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> rdd.map(x=>x+1).collect
res0: Array[Int] = Array(2, 3, 4, 5, 6, 7, 8, 9)

map applies the function to every element of the RDD and returns a new RDD. Each line below is one partition of the rdd above, transformed by map(x=>x+1).
1,2 => 2,3 
3,4 => 4,5
5,6 => 6,7
7,8 => 8,9



coalesce - repartition
def coalesce(numPartitions: Int, shuffle: Boolean = false, partitionCoalescer: Option[PartitionCoalescer] = Option.empty)(implicit ord: Ordering[T] = null): RDD[T]
Return a new RDD that is reduced into numPartitions partitions.

This results in a narrow dependency, e.g. if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of the 100 new partitions will claim 10 of the current partitions.

However, if you're doing a drastic coalesce, e.g. to numPartitions = 1, this may result in your computation taking place on fewer nodes than you like (e.g. one node in the case of numPartitions = 1). To avoid this, you can pass shuffle = true. This will add a shuffle step, but means the current upstream partitions will be executed in parallel (per whatever the current partitioning is).

Note: With shuffle = true, you can actually coalesce to a larger number of partitions. This is useful if you have a small number of partitions, say 100, potentially with a few partitions being abnormally large. Calling coalesce(1000, shuffle = true) will result in 1000 partitions with the data distributed using a hash partitioner.

----------
scala> val rdd=sc.parallelize(List(1,2,3,4,5,6,7,8),4)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> rdd.glom.collect
res2: Array[Array[Int]] = Array(Array(1, 2), Array(3, 4), Array(5, 6), Array(7, 8))

scala> val newrdd=rdd.coalesce(2,false)
newrdd: org.apache.spark.rdd.RDD[Int] = CoalescedRDD[3] at coalesce at <console>:26

scala> newrdd.glom.collect
res3: Array[Array[Int]] = Array(Array(1, 2, 3, 4), Array(5, 6, 7, 8))

coalesce repartitions the RDD and returns a new RDD. Note that with the default shuffle = false, requesting more partitions than the RDD already has leaves the partitioning unchanged. Each line below is one partition of the rdd above, regrouped by coalesce(2, false).
1,2 => \
          1,2,3,4
3,4 => /

5,6 => \
          5,6,7,8
7,8 => /
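The shuffle flag matters when you try to increase the partition count. A minimal sketch (plain Scala rather than a captured REPL session):

// With the default shuffle = false, asking for more partitions than the RDD
// already has changes nothing; shuffle = true redistributes the data.
val rdd8 = sc.parallelize(1 to 8, 4)
rdd8.coalesce(8, shuffle = false).partitions.length   // still 4
rdd8.coalesce(8, shuffle = true).partitions.length    // 8
rdd8.coalesce(1, shuffle = true).partitions.length    // 1, but upstream work still runs in parallel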






distinct - deduplicate
def distinct(): RDD[T]
def distinct(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T]
Return a new RDD containing the distinct elements in this RDD.

----------
scala> val rdd=sc.parallelize(List(1,1,1,1,2,2,2,3,3,4),2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[5] at parallelize at <console>:24

scala> val rdd1=rdd.distinct
rdd1: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[14] at distinct at <console>:26

scala> rdd1.glom.collect
res6: Array[Array[Int]] = Array(Array(4, 2), Array(1, 3))

scala> val rdd2=rdd.distinct(3)
rdd2: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[18] at distinct at <console>:26

scala> rdd2.glom.collect
res7: Array[Array[Int]] = Array(Array(3), Array(4, 1), Array(2))

distinct removes duplicates from the RDD, producing an RDD with no repeated elements. If numPartitions is set, the result is also repartitioned. Note that distinct first deduplicates within each partition and then across partitions, which can trigger a shuffle. Each line below is one partition of the rdd above, transformed by distinct.

1,1,1,1,2 => 1,2     => \
                                     1,2,3,4
2,2,3,3,4 => 2,3,4  => /


1,1,1,1,2 => 1,2     => 3
                              => 4,1
2,2,3,3,4 => 2,3,4  => 2
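Under the hood, distinct is essentially a reduceByKey, which is why a shuffle is involved. A rough hand-written equivalent (a sketch, not Spark's exact source):

// Key each element by itself and reduce per key; duplicates collapse to one entry.
val dup = sc.parallelize(List(1,1,1,1,2,2,2,3,3,4), 2)
val deduped = dup.map(x => (x, null)).reduceByKey((a, _) => a).map(_._1)
deduped.collect   // same elements as dup.distinct.collect, ordering not guaranteed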




filter - filter elements
def filter(f: (T) ⇒ Boolean): RDD[T]
Return a new RDD containing only the elements that satisfy a predicate.

----------
scala> val rdd=sc.parallelize(0 to 9,2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[20] at parallelize at <console>:24

scala> val filterrdd=rdd.filter(_<5)
filterrdd: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[21] at filter at <console>:26

scala> filterrdd.collect
res3: Array[Int] = Array(0, 1, 2, 3, 4)

scala> filterrdd.glom.collect
res4: Array[Array[Int]] = Array(Array(0, 1, 2, 3, 4), Array())

filter applies the predicate to every element and keeps those for which it returns true. Each line below is one partition of the rdd above, transformed by filter.

0,1,2,3,4 => 0,1,2,3,4
5,6,7,8,9 => 

filter is often used for data cleaning. A related technique is the collect transformation with a partial function (a function that need not be defined for every input), which extracts the elements of one type from a mixed-type RDD. For example:

scala> val rdd=sc.parallelize(Array(1,2,"a","b","c"),3) //rdd mixes Int and String elements, split into 3 partitions
rdd: org.apache.spark.rdd.RDD[Any] = ParallelCollectionRDD[17] at parallelize at <console>:24

scala>  rdd.glom.collect //contents of each partition
res8: Array[Array[Any]] = Array(Array(1), Array(2, a), Array(b, c))

scala> val newrdd=rdd.collect({case x:Int=>x}) //extract the Int elements
newrdd: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[20] at collect at <console>:26

scala> newrdd.collect
res9: Array[Int] = Array(1, 2)

scala> newrdd.glom.collect
res10: Array[Array[Int]] = Array(Array(1), Array(2), Array())

1        => 1
2,a     => 2
b,c     => ()
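The same cleaning step can also be written with filter; a sketch that keeps only the Int elements of the mixed RDD:

// isInstanceOf/asInstanceOf version of the collect(PartialFunction) example above.
val mixed = sc.parallelize(Array[Any](1, 2, "a", "b", "c"), 3)
val ints = mixed.filter(_.isInstanceOf[Int]).map(_.asInstanceOf[Int])
ints.collect   // Array(1, 2)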



flatMap - transform and flatten
def flatMap[U](f: (T) ⇒ TraversableOnce[U])(implicit arg0: ClassTag[U]): RDD[U]
Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.

----------
scala> val rdd=sc.parallelize(0 to 5,1)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[22] at parallelize at <console>:24

scala> val flatmaprdd=rdd.flatMap(x=>0 to x)
flatmaprdd: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[23] at flatMap at <console>:26

scala> flatmaprdd.collect
res9: Array[Int] = Array(0, 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5)

Each line below is one partition of the rdd above, transformed by flatMap.

0,1,2,3,4,5 => 0, 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5
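A typical practical use of flatMap is splitting lines into words; a sketch with made-up input:

val lines = sc.parallelize(List("hello spark", "hello scala"), 1)
lines.flatMap(_.split(" ")).collect   // Array(hello, spark, hello, scala)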




pipe - pipe through an external command
def pipe(command: String): RDD[String]
def pipe(command: String, env: Map[String, String]): RDD[String]
def pipe(command: Seq[String], env: Map[String, String] = Map(), printPipeContext: (String => Unit) => Unit = null, printRDDElement: (T, String => Unit) => Unit = null): RDD[String]

Return an RDD created by piping elements to a forked external process. The resulting RDD is computed by executing the given process once per partition. All elements of each input partition are written to a process's stdin as lines of input separated by a newline. The resulting partition consists of the process's stdout output, with each line of stdout resulting in one element of the output partition. A process is invoked even for empty partitions.

The print behavior can be customized by providing two functions.
command     command to run in forked process.
env     environment variables to set.
printPipeContext     Before piping elements, this function is called as an opportunity to pipe context data. Print line function (like out.println) will be passed as printPipeContext's parameter.
printRDDElement     Use this function to customize how to pipe elements. This function will be called with each RDD element as the 1st parameter, and the print line function (like out.println()) as the 2nd parameter. An example of pipe the RDD data of groupBy() in a streaming way, instead of constructing a huge String to concat all the elements: def printRDDElement(record:(String, Seq[String]), f:String=>Unit) = for (e <- record._2) {f(e)}
separateWorkingDir     Use separate working directories for each task.
bufferSize     Buffer size for the stdin writer for the piped process.
encoding     Char encoding used for interacting (via stdin, stdout and stderr) with the piped process
returns     the result RDD

----------
scala> val rdd=sc.parallelize(0 to 7,4)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[24] at parallelize at <console>:24

scala> rdd.glom.collect
res10: Array[Array[Int]] = Array(Array(0, 1), Array(2, 3), Array(4, 5), Array(6, 7))

scala> rdd.pipe("head -n 1").collect
res11: Array[String] = Array(0, 2, 4, 6)
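Another sketch, assuming the wc command is available on every worker: piping each partition through "wc -l" counts the elements per partition, since every element is written to the process's stdin as one line.

val nums8 = sc.parallelize(0 to 7, 4)
nums8.pipe("wc -l").collect   // one line count per partition (each partition holds 2 elements here)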





sample - sampling
def sample(withReplacement: Boolean, fraction: Double, seed: Long = Utils.random.nextLong): RDD[T]
Return a sampled subset of this RDD.

withReplacement     can elements be sampled multiple times (replaced when sampled out)
fraction     expected size of the sample as a fraction of this RDD's size without replacement: probability that each element is chosen; fraction must be [0, 1] with replacement: expected number of times each element is chosen; fraction must be >= 0
seed     seed for the random number generator

----------
sample randomly selects elements of the RDD to form a new RDD. withReplacement: whether to sample with replacement (true) or without (false); fraction: the sampling fraction; seed: the random number generator seed.

scala> val rdd=sc.parallelize(0 to 9,1)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[27] at parallelize at <console>:24

scala> rdd.sample(false,0.5).collect
res12: Array[Int] = Array(0, 1, 3, 5, 6, 7, 8, 9)

scala> rdd.sample(false,0.5).collect
res13: Array[Int] = Array(0, 2, 4, 6, 8)

scala> rdd.sample(false,0.5).collect
res14: Array[Int] = Array(2, 3, 6)

scala> rdd.sample(false,0.5).collect
res15: Array[Int] = Array(0, 2, 4, 6, 9)

scala> rdd.sample(false,0.8).collect
res16: Array[Int] = Array(0, 1, 2, 3, 6, 7, 8)

scala> rdd.sample(true,0.5).collect
res17: Array[Int] = Array(0, 3)

scala> rdd.sample(true,0.5).collect
res18: Array[Int] = Array(2, 3, 4, 8, 8)
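The results above differ from run to run because no seed was fixed. A sketch showing that a fixed seed makes the sample reproducible:

val digits = sc.parallelize(0 to 9, 1)
digits.sample(withReplacement = false, fraction = 0.5, seed = 42).collect
digits.sample(withReplacement = false, fraction = 0.5, seed = 42).collect  // identical to the previous call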




sortBy - sort
def sortBy[K](f: (T) ⇒ K, ascending: Boolean = true, numPartitions: Int = this.partitions.length)(implicit ord: Ordering[K], ctag: ClassTag[K]): RDD[T]
Return this RDD sorted by the given key function.

----------
sortBy sorts the RDD elements and returns the sorted RDD. ascending=true sorts in ascending order; ascending=false sorts in descending order.

scala> val rdd=sc.parallelize(List(2,1,4,3),2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[35] at parallelize at <console>:24

scala> rdd.sortBy(x=>x,true).collect
res19: Array[Int] = Array(1, 2, 3, 4)

scala> rdd.sortBy(x=>x,false).collect
res20: Array[Int] = Array(4, 3, 2, 1)

scala> val z=sc.parallelize(Array(("h",10),("a",2),("c",3)),2)
z: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[16] at parallelize at <console>:24

scala> z.glom.collect
res18: Array[Array[(String, Int)]] = Array(Array((h,10)), Array((a,2), (c,3)))

scala> z.sortBy(x=>x._1,true).glom.collect
res20: Array[Array[(String, Int)]] = Array(Array((a,2), (c,3)), Array((h,10)))

(h,10)          => (h,10)          => (a,2),(c,3)
(a,2),(c,3)    => (a,2),(c,3)     => (h,10)




cartesian - Cartesian product
def cartesian[U](other: RDD[U])(implicit arg0: ClassTag[U]): RDD[(T, U)]
Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of elements (a, b) where a is in this and b is in other.

----------
scala> val rdd1=sc.parallelize(List("a","b","c","d"),1)
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[46] at parallelize at <console>:24

scala> val rdd2=sc.parallelize(List(1,2,3),1)
rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[47] at parallelize at <console>:24

scala> rdd1.cartesian(rdd2).collect
res21: Array[(String, Int)] = Array((a,1), (a,2), (a,3), (b,1), (b,2), (b,3), (c,1), (c,2), (c,3), (d,1), (d,2), (d,3))

scala> val rdd3=rdd1.cartesian(rdd2)
rdd3: org.apache.spark.rdd.RDD[(String, Int)] = CartesianRDD[49] at cartesian at <console>:28

scala> rdd3.glom.collect
res22: Array[Array[(String, Int)]] = Array(Array((a,1), (a,2), (a,3), (b,1), (b,2), (b,3), (c,1), (c,2), (c,3), (d,1), (d,2), (d,3)))

scala> rdd3.partitions.length
res23: Int = 1

cartesian computes the Cartesian product of two RDDs: every element of one RDD is paired with every element of the other. This operator can easily blow up memory.

a,b,c,d  \         (a,1)
               =>    (a,2)  
1,2,3     /         (a,3)
                       (b,1)
                       …...



intersection - set intersection
def intersection(other: RDD[T]): RDD[T]
def intersection(other: RDD[T], numPartitions: Int): RDD[T]
def intersection(other: RDD[T], partitioner: Partitioner)(implicit ord: Ordering[T] = null): RDD[T]

Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did. Performs a hash partition across the cluster. Note that this method performs a shuffle internally. 

numPartitions     How many partitions to use in the resulting RDD
partitioner Partitioner to use for the resulting RDD

----------
scala> val rdd1=sc.parallelize(List(1,2,3,4),1)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[51] at parallelize at <console>:24

scala>  val rdd2=sc.parallelize(List(2,3,6,7),2)
rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[52] at parallelize at <console>:24

scala> rdd1.intersection(rdd2).collect
res24: Array[Int] = Array(2, 3)

Each line below is one partition of rdd1/rdd2 above; intersection computes the intersection A ∩ B.

1,2,3,4    => \
                         2
                         3
2,3          =>      
6,7          => /




subtract - set difference
def subtract(other: RDD[T]): RDD[T]
def subtract(other: RDD[T], numPartitions: Int): RDD[T]
def subtract(other: RDD[T], p: Partitioner)(implicit ord: Ordering[T] = null): RDD[T]

Return an RDD with the elements from this that are not in other.
Uses this partitioner/partition size, because even if other is huge, the resulting RDD will be <= us.

----------
scala> val rdd1=sc.parallelize(List(1,2,3,4),1)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[51] at parallelize at <console>:24

scala>  val rdd2=sc.parallelize(List(2,3,6,7),2)
rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[52] at parallelize at <console>:24

scala> rdd1.subtract(rdd2).collect
res25: Array[Int] = Array(1, 4)

Each line below is one partition of rdd1/rdd2 above; subtract computes the set difference A - B.

1,2,3,4     => \
                         1,4
2,3          =>
6,7          => /




union - set union
def ++
def union(other: RDD[T]): RDD[T]

Return the union of this RDD and another one. Any identical elements will appear multiple times (use .distinct() to eliminate them).

----------
scala> val rdd1=sc.parallelize(List(1,2,3,4),1)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[51] at parallelize at <console>:24

scala>  val rdd2=sc.parallelize(List(2,3,6,7),2)
rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[52] at parallelize at <console>:24

scala> rdd1.union(rdd2).collect
res26: Array[Int] = Array(1, 2, 3, 4, 2, 3, 6, 7)

scala> rdd1.union(rdd2).glom.collect
res27: Array[Array[Int]] = Array(Array(1, 2, 3, 4), Array(2, 3), Array(6, 7))

scala> (rdd1++rdd2).collect
res28: Array[Int] = Array(1, 2, 3, 4, 2, 3, 6, 7)

Each line below is one partition of rdd1/rdd2 above; union computes the union A ∪ B.

1,2,3,4     => \  1,2,3,4
                         
2,3          =>      2,3    
6,7          => /    6,7





zip - zip two RDDs
def zip[U](other: RDD[U])(implicit arg0: ClassTag[U]): RDD[(T, U)]

Zips this RDD with another one, returning key-value pairs with the first element in each RDD, second element in each RDD, etc. Assumes that the two RDDs have the *same number of partitions* and the *same number of elements in each partition* (e.g. one was made through a map on the other).

----------
scala> val rdd1=sc.parallelize(List(1,2,3,4),1)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[51] at parallelize at <console>:24

scala>  val rdd2=sc.parallelize(List(2,3,6,7),2)
rdd2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[52] at parallelize at <console>:24

scala> rdd1.zip(rdd2).glom.collect //rdd1 and rdd2 have different numbers of partitions, so zip fails
java.lang.IllegalArgumentException: Can't zip RDDs with unequal numbers of partitions: List(1, 2)
  at org.apache.spark.rdd.ZippedPartitionsBaseRDD.getPartitions(ZippedPartitionsRDD.scala:57)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
  at scala.Option.getOrElse(Option.scala:121)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
  at scala.Option.getOrElse(Option.scala:121)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1911)
  at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:893)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
  at org.apache.spark.rdd.RDD.collect(RDD.scala:892)
  ... 48 elided

scala> val rdd3=sc.parallelize(List(2,3,6,7),1)
rdd3: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[70] at parallelize at <console>:24

scala> val rdd4=rdd1.zip(rdd3)
rdd4: org.apache.spark.rdd.RDD[(Int, Int)] = ZippedPartitionsRDD2[71] at zip at <console>:28

scala> rdd4.glom.collect
res35: Array[Array[(Int, Int)]] = Array(Array((1,2), (2,3), (3,6), (4,7)))

scala> rdd4.collect
res36: Array[(Int, Int)] = Array((1,2), (2,3), (3,6), (4,7))


Below, rdd1 and rdd3 are combined by zip into a key/value RDD.

                           (1,2)
1,2,3,4     => \    (2,3)
 2,3,6,7     => /   (3,6)
                           (4,7)
                         




map - create a key/value RDD
def map[U: ClassTag](f: T => U): RDD[U]
Applies a transformation function on each item of the RDD and returns the result as a new RDD.

----------
scala> val words=sc.parallelize(List("apple","banana","berry","cherry","cumquat","haw"),1)
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[78] at parallelize at <console>:24

scala> words.collect
res44: Array[String] = Array(apple, banana, berry, cherry, cumquat, haw)

scala> val pairs=words.map(x=>(x(0),x))
pairs: org.apache.spark.rdd.RDD[(Char, String)] = MapPartitionsRDD[79] at map at <console>:26


Below, words is transformed by map.

apple, banana, berry, cherry, cumquat, haw     =>     (a,apple), (b,banana), (b,berry), (c,cherry), (c,cumquat), (h,haw)




keyBy - create a key/value RDD
def keyBy[K](f: (T) ⇒ K): RDD[(K, T)]
Creates tuples of the elements in this RDD by applying f. The result of the function becomes the key and the original data item becomes the value of the newly created tuples.

----------
scala> val words=sc.parallelize(List("apple","banana","berry","cherry","cumquat","haw"),1)
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[80] at parallelize at <console>:24

scala> words.collect
res51: Array[String] = Array(apple, banana, berry, cherry, cumquat, haw)

scala> val pairs=words.keyBy(_.length)
pairs: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[81] at keyBy at <console>:26

scala> pairs.collect
res52: Array[(Int, String)] = Array((5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw))

Below, words is transformed by keyBy.

apple, banana, berry, cherry, cumquat, haw     =>     (5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw)




combineByKey - aggregate by key
def combineByKey[C](createCombiner: (V) ⇒ C, mergeValue: (C, V) ⇒ C, mergeCombiners: (C, C) ⇒ C): RDD[(K, C)]
def combineByKey[C](createCombiner: (V) ⇒ C, mergeValue: (C, V) ⇒ C, mergeCombiners: (C, C) ⇒ C, numPartitions: Int): RDD[(K, C)]
def combineByKey[C](createCombiner: (V) ⇒ C, mergeValue: (C, V) ⇒ C, mergeCombiners: (C, C) ⇒ C, partitioner: Partitioner, mapSideCombine: Boolean = true, serializer: Serializer = null): RDD[(K, C)]

Very efficient implementation that combines the values of a RDD consisting of two-component tuples by applying multiple aggregators one after another.

----------
scala> val pair=sc.parallelize(List(("fruit","apple"),("fruit","banana"),("vegetable","cucumber"),("fruit","cherry"),("vegetable","bean"),("vegetable","pepper")),2)
pair: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[82] at parallelize at <console>:24

scala> val combinedPair=pair.combineByKey(List(_),(x:List[String],y:String)=>y::x,(x:List[String],y:List[String])=>x:::y)
combinedPair: org.apache.spark.rdd.RDD[(String, List[String])] = ShuffledRDD[83] at combineByKey at <console>:26

scala> combinedPair.collect
res54: Array[(String, List[String])] = Array((fruit,List(banana, apple, cherry)), (vegetable,List(cucumber, pepper, bean)))

Below are the two partitions of pair, transformed by combineByKey.

(fruit,apple),(fruit,banana),(vegetable,cucumber)      =>  (fruit,List(banana, apple, cherry))
(fruit,cherry),(vegetable,bean),(vegetable,pepper)     =>  (vegetable,List(cucumber, pepper, bean))

Note: the four Scala list operators used above are:
1) :: is called cons. It prepends an element to the head of a list and returns a new list. Usage: x::list, where x becomes the first element of the new list whether or not x is itself a list, so the new list has length list.length + 1 (btw, x::list is equivalent to list.::(x)).
2) :+ and +: differ in that :+ appends an element at the tail, while +: prepends an element at the head, much like ::. For Lists, :: is the form conventionally used in pattern matching. To keep +: and :+ straight, remember that the colon always sits next to the collection.
3) ++ concatenates two collections, e.g. list1++list2.
4) ::: also concatenates two collections, but only works on List.

scala> "A"::"B"::Nil
res0: List[String] = List(A, B)

scala> "A"+:"B"+:Nil
res1: List[String] = List(A, B)

scala> Nil:+"A":+"B"
res2: List[String] = List(A, B)

scala> res0 ++ res1
res3: List[String] = List(A, B, A, B)

scala> res0 ::: res1
res4: List[String] = List(A, B, A, B)

scala> res0 :: res1
res5: List[java.io.Serializable] = List(List(A, B), A, B)
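A classic combineByKey pattern not shown above is a per-key average, where the combiner type differs from the value type. A sketch with made-up data:

// createCombiner builds a (sum, count) pair, mergeValue folds a value into it,
// and mergeCombiners merges pairs coming from different partitions.
val scores = sc.parallelize(List(("a", 1.0), ("a", 3.0), ("b", 4.0)), 2)
val avg = scores.combineByKey(
    (v: Double) => (v, 1),
    (acc: (Double, Int), v: Double) => (acc._1 + v, acc._2 + 1),
    (a: (Double, Int), b: (Double, Int)) => (a._1 + b._1, a._2 + b._2))
  .mapValues { case (sum, count) => sum / count }
avg.collect   // (a,2.0) and (b,4.0); ordering may vary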







flatMapValues - flatMap over the values
def flatMapValues[U](f: (V) ⇒ TraversableOnce[U]): RDD[(K, U)]
Pass each value in the key-value pair RDD through a flatMap function without changing the keys; this also retains the original RDD's partitioning.

----------
scala> val rdd=sc.parallelize(List("a","boy"),1).keyBy(_.length)
rdd: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[85] at keyBy at <console>:24

scala> rdd.glom.collect
res55: Array[Array[(Int, String)]] = Array(Array((1,a), (3,boy)))

scala> rdd.flatMapValues(x=>"*"+x+"*").collect
res57: Array[(Int, Char)] = Array((1,*), (1,a), (1,*), (3,*), (3,b), (3,o), (3,y), (3,*))

Below is the single partition of rdd, transformed by flatMapValues.

a,boy => (1,a), (3,boy) => (1,*), (1,a), (1,*), (3,*), (3,b), (3,o), (3,y), (3,*)





groupByKey - group values by key
def groupByKey(): RDD[(K, Iterable[V])]
def groupByKey(numPartitions: Int): RDD[(K, Iterable[V])]
def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])]

Group the values for each key in the RDD into a single sequence. Hash-partitions the resulting RDD with the existing partitioner/parallelism level, with numPartitions partitions, or with the given Partitioner, respectively. The ordering of elements within each group is not guaranteed, and may even differ each time the resulting RDD is evaluated.

Note: This operation may be very expensive. If you are grouping in order to perform an aggregation (such as a sum or average) over each key, using PairRDDFunctions.aggregateByKey or PairRDDFunctions.reduceByKey will provide much better performance.
As currently implemented, groupByKey must be able to hold all the key-value pairs for any key in memory. If a key has too many values, it can result in an OutOfMemoryError.

----------
scala> val pairs=sc.parallelize(List(("fruit","apple"),("vegetable","cucumber"),("fruit","cherry"),("vegetable","bean"),("fruit","banana"),("vegetable","pepper")),2)
pairs: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[88] at parallelize at <console>:24

scala> pairs.groupByKey.collect
res58: Array[(String, Iterable[String])] = Array((fruit,CompactBuffer(apple, cherry, banana)), (vegetable,CompactBuffer(cucumber, bean, pepper)))

Below are the two partitions of pairs, transformed by groupByKey.

(fruit,apple),(vegetable,cucumber),(fruit,cherry)      =>  (fruit,CompactBuffer(apple, cherry, banana))
(vegetable,bean),(fruit,banana),(vegetable,pepper)     =>  (vegetable,CompactBuffer(cucumber, bean, pepper))
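As the note in the API description above suggests, per-key aggregations are usually better expressed with reduceByKey, which combines values on the map side before shuffling. A sketch:

val tagged = sc.parallelize(List(("fruit", 1), ("vegetable", 1), ("fruit", 1)), 2)
tagged.reduceByKey(_ + _).collect   // counts per key without materializing whole groups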





keys - extract the keys
def keys: RDD[K]
Return an RDD with the keys of each tuple.

----------
scala> val pairs=sc.parallelize(List("apple","banana","berry","cherry","cumquat","haw"),1).keyBy(_.length)
pairs: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[91] at keyBy at <console>:24

scala> pairs.keys.collect
res59: Array[Int] = Array(5, 6, 5, 6, 7, 3)

scala> pairs.collect
res60: Array[(Int, String)] = Array((5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw))

Below is the partition of pairs, transformed by keys.

(5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw)     =>     5,6,5,6,7,3




mapValues - transform the values
def mapValues[U](f: (V) ⇒ U): RDD[(K, U)]
Pass each value in the key-value pair RDD through a map function without changing the keys; this also retains the original RDD's partitioning.

----------
scala> val pairs=sc.parallelize(List("apple","banana","berry","cherry","cumquat","haw"),1).keyBy(_.length)
pairs: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[91] at keyBy at <console>:24

scala> pairs.mapValues(v=>v+" "+v(0)).collect
res61: Array[(Int, String)] = Array((5,apple a), (6,banana b), (5,berry b), (6,cherry c), (7,cumquat c), (3,haw h))

Below is the partition of pairs, transformed by mapValues.

(5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw)     =>     (5,apple a), (6,banana b), (5,berry b), (6,cherry c), (7,cumquat c), (3,haw h)





partitionBy - repartition by key
def partitionBy(partitioner: Partitioner): RDD[(K, V)]
Return a copy of the RDD partitioned using the specified partitioner.

----------
scala>  val pairs=sc.parallelize(0 to 9,2).keyBy(x=>x)
pairs: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[106] at keyBy at <console>:24

scala> pairs.glom.collect
res69: Array[Array[(Int, Int)]] = Array(Array((0,0), (1,1), (2,2), (3,3), (4,4)), Array((5,5), (6,6), (7,7), (8,8), (9,9)))

scala> import org.apache.spark.HashPartitioner
import org.apache.spark.HashPartitioner

scala> val partitionedPairs=pairs.partitionBy(new HashPartitioner(2))
partitionedPairs: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[108] at partitionBy at <console>:27

scala> partitionedPairs.glom.collect
res70: Array[Array[(Int, Int)]] = Array(Array((0,0), (2,2), (4,4), (6,6), (8,8)), Array((1,1), (3,3), (5,5), (7,7), (9,9)))





reduceByKey - reduce values by key
def reduceByKey(func: (V, V) ⇒ V): RDD[(K, V)]
def reduceByKey(func: (V, V) ⇒ V, numPartitions: Int): RDD[(K, V)]
def reduceByKey(partitioner: Partitioner, func: (V, V) ⇒ V): RDD[(K, V)]

Merge the values for each key using an associative and commutative reduce function. This will also perform the merging locally on each mapper before sending results to a reducer, similarly to a "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/parallelism level, or hash-partitioned with numPartitions partitions. Please note that any function f you provide should be commutative in order to generate reproducible results.

----------
scala> val a = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
a: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[116] at parallelize at <console>:24

scala> val b = a.map(x => (x.length, x))
b: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[117] at map at <console>:26

scala> b.reduceByKey(_ + _).collect
res54: Array[(Int, String)] = Array((4,lion), (3,dogcat), (7,panther), (5,tigereagle))

Below are the two partitions of a/b, transformed by reduceByKey.

dog,tiger,lion          =>     (3,dog),(5,tiger),(4,lion)          =>\
                                                                                              (4,lion),(3,dogcat),(7,panther),(5,tigereagle)
cat,panther,eagle   =>     (3,cat),(7,panther),(5,eagle)   =>/  





sortByKey - sort by key
def sortByKey(ascending: Boolean = true, numPartitions: Int = self.partitions.size): RDD[P]

This function sorts the input RDD's data and stores it in a new RDD. The output RDD is a shuffled RDD because it stores data that is output by a reducer which has been shuffled. The implementation of this function is actually very clever. First, it uses a range partitioner to partition the data in ranges within the shuffled RDD. Then it sorts these ranges individually with mapPartitions using standard sort mechanisms.

----------
scala> val pairs=sc.parallelize(List("apple","banana","berry","cherry","cumquat","haw"),1).keyBy(_.length)
pairs: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[91] at keyBy at <console>:24

scala> pairs.sortByKey(true).collect
res64: Array[(Int, String)] = Array((3,haw), (5,apple), (5,berry), (6,banana), (6,cherry), (7,cumquat))

scala> pairs.sortByKey(false,2).glom.collect
res65: Array[Array[(Int, String)]] = Array(Array((7,cumquat), (6,banana), (6,cherry)), Array((5,apple), (5,berry), (3,haw)))


Below is the partition of pairs, transformed by sortByKey in descending order into 2 output partitions.

(5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw)   =>   (7,cumquat), (6,banana), (6,cherry)
                                                                          (5,apple), (5,berry), (3,haw)





values - extract the values
def values: RDD[V]
Return an RDD with the values of each tuple.

----------
scala>  val pairs=sc.parallelize(0 to 9,2).keyBy(x=>x)
pairs: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[106] at keyBy at <console>:24

scala> pairs.values.collect
res72: Array[Int] = Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)




cogroup - group multiple RDDs by key
def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))]
def cogroup[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Iterable[V], Iterable[W]))]
def cogroup[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (Iterable[V], Iterable[W]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)]): RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)], numPartitions: Int): RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)], partitioner: Partitioner): RDD[(K, (Iterable[V], Iterable[W1], Iterable[W2]))]

A very powerful set of functions that allow grouping up to 3 key-value RDDs together using their keys.

----------
scala> val a=sc.parallelize(List(1,2,1,3),1)
a: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[126] at parallelize at <console>:24

scala> val b=a.map((_,"b"))
b: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[127] at map at <console>:26

scala> val c=a.map((_,"c"))
c: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[128] at map at <console>:26

scala> b.collect
res58: Array[(Int, String)] = Array((1,b), (2,b), (1,b), (3,b))

scala> c.collect
res59: Array[(Int, String)] = Array((1,c), (2,c), (1,c), (3,c))

scala> b.cogroup(c).collect
res60: Array[(Int, (Iterable[String], Iterable[String]))] = Array((1,(CompactBuffer(b, b),CompactBuffer(c, c))), (3,(CompactBuffer(b),CompactBuffer(c))), (2,(CompactBuffer(b),CompactBuffer(c))))

scala> val d=a.map((_,"d"))
d: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[131] at map at <console>:26

scala> b.cogroup(c,d).collect
res61: Array[(Int, (Iterable[String], Iterable[String], Iterable[String]))] = Array((1,(CompactBuffer(b, b),CompactBuffer(c, c),CompactBuffer(d, d))), (3,(CompactBuffer(b),CompactBuffer(c),CompactBuffer(d))), (2,(CompactBuffer(b),CompactBuffer(c),CompactBuffer(d))))





join - join by key
def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
def join[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (V, W))]
def join[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, W))]

Return an RDD containing all pairs of elements with matching keys in this and other. Each pair of elements will be returned as a (k, (v1, v2)) tuple, where (k, v1) is in this and (k, v2) is in other. 

----------
scala>  val pair1=sc.parallelize(List((1,"a"),(2,"b"),(3,"c")),2)
pair1: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[111] at parallelize at <console>:25

scala> val pair2=sc.parallelize(List((1,"apple"),(2,"banana")),2)
pair2: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[112] at parallelize at <console>:25

scala> pair1.join(pair2,1).collect
res75: Array[(Int, (String, String))] = Array((1,(a,apple)), (2,(b,banana)))

Below are the partitions of pair1/pair2, transformed by join.

(1,a)
(2,b),(3,c)
                   =>   (1,(a,apple)), (2,(b,banana))
(1,apple)
(2,banana)





leftOuterJoin - left outer join by key
def leftOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))]
def leftOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (V, Option[W]))]
def leftOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, Option[W]))]

Performs a left outer join using two key-value RDDs. Please note that the keys must be generally comparable to make this work correctly.

----------
scala> val a=sc.parallelize(List("a","boy","cat"),1).keyBy(_.length)
a: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[120] at keyBy at <console>:25

scala> val b=sc.parallelize(List("boy","cat","dragon"),1).keyBy(_.length)
b: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[122] at keyBy at <console>:25

scala> a.collect
res76: Array[(Int, String)] = Array((1,a), (3,boy), (3,cat))

scala> b.collect
res77: Array[(Int, String)] = Array((3,boy), (3,cat), (6,dragon))

scala> a.leftOuterJoin(b).collect
res79: Array[(Int, (String, Option[String]))] = Array((1,(a,None)), (3,(boy,Some(boy))), (3,(boy,Some(cat))), (3,(cat,Some(boy))), (3,(cat,Some(cat))))

Below are the partitions of a/b, transformed by leftOuterJoin.

(1,a),(3,boy),(3,cat) 
                                        =>  (1,(a,None)), (3,(boy,Some(boy))), (3,(boy,Some(cat))), (3,(cat,Some(boy))), (3,(cat,Some(cat)))
(3,boy),(3,cat),(6,dragon)




rightOuterJoin - right outer join by key
def rightOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (Option[V], W))]
def rightOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Option[V], W))]
def rightOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (Option[V], W))]

Performs a right outer join using two key-value RDDs. Please note that the keys must be generally comparable to make this work correctly.

----------
scala> val a=sc.parallelize(List("a","boy","cat"),1).keyBy(_.length)
a: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[120] at keyBy at <console>:25

scala> val b=sc.parallelize(List("boy","cat","dragon"),1).keyBy(_.length)
b: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[122] at keyBy at <console>:25

scala> a.rightOuterJoin(b).collect
res81: Array[(Int, (Option[String], String))] = Array((6,(None,dragon)), (3,(Some(boy),boy)), (3,(Some(boy),cat)), (3,(Some(cat),boy)), (3,(Some(cat),cat)))

Below are the partitions of a/b, transformed by rightOuterJoin.

(1,a),(3,boy),(3,cat) 
                                        =>  (6,(None,dragon)), (3,(Some(boy),boy)), (3,(Some(boy),cat)), (3,(Some(cat),boy)), (3,(Some(cat),cat))
(3,boy),(3,cat),(6,dragon)





subtractByKey - set difference by key
def subtractByKey[W: ClassTag](other: RDD[(K, W)]): RDD[(K, V)]
def subtractByKey[W: ClassTag](other: RDD[(K, W)], numPartitions: Int): RDD[(K, V)]
def subtractByKey[W: ClassTag](other: RDD[(K, W)], p: Partitioner): RDD[(K, V)]

Very similar to subtract, but instead of supplying a function, the key-component of each pair will be automatically used as criterion for removing items from the first RDD.

----------
scala> val a=sc.parallelize(List("a","boy","cat"),1).keyBy(_.length)
a: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[120] at keyBy at <console>:25

scala> val b=sc.parallelize(List("boy","cat","dragon"),1).keyBy(_.length)
b: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[122] at keyBy at <console>:25

scala> a.subtractByKey(b).collect
res83: Array[(Int, String)] = Array((1,a))

Below are the partitions of a/b, transformed by subtractByKey.

(1,a),(3,boy),(3,cat) 
                                        =>  (1,a)
(3,boy),(3,cat),(6,dragon)



Action operators
aggregate - aggregate the RDD
def aggregate[U](zeroValue: U)(seqOp: (U, T) ⇒ U, combOp: (U, U) ⇒ U)(implicit arg0: ClassTag[U]): U

The aggregate function allows the user to apply two different reduce functions to the RDD. 
- The first reduce function is applied within each partition to reduce the data within each partition into a single result. 
- The second reduce function is used to combine the different reduced results of all partitions together to arrive at one final result.

The ability to have two separate reduce functions for intra partition versus across partition reducing adds a lot of flexibility. For example the first reduce function can be the max function and the second one can be the sum function. The user also specifies an initial value. Here are some important facts.
1)The initial value is applied at both levels of reduce. So both at the intra partition reduction and across partition reduction.
2)Both reduce functions have to be commutative and associative.
3)Do not assume any execution order for either partition computations or combining partitions.
4)Why would one want to use two input data types? Let us assume we do an archaeological site survey using a metal detector. While walking through the site we take GPS coordinates of important findings based on the output of the metal detector. Later, we intend to draw an image of a map that highlights these locations using the aggregate function. In this case the zeroValue could be an area map with no highlights. The possibly huge set of input data is stored as GPS coordinates across many partitions. seqOp (first reducer) could convert the GPS coordinates to map coordinates and put a marker on the map at the respective position. combOp (second reducer) will receive these highlights as partial maps and combine them into a single final output map.

seqOp
     an operator used to accumulate results within a partition
combOp
     an associative operator used to combine results from different partitions

----------
scala> val rdd=sc.parallelize(List(1,2,3,4,5,6),2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[1] at parallelize at <console>:24

scala> rdd.glom.collect
res1: Array[Array[Int]] = Array(Array(1, 2, 3), Array(4, 5, 6))

scala> rdd.aggregate(0)(_+_,Math.max(_,_))
res2: Int = 15

Below are the two partitions of rdd. First the elements within each partition are summed (_+_); then the maximum over the per-partition sums is taken with Math.max(_,_).
1,2,3  =>  6
                    =>  15
4,5,6  =>  15
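aggregate can also return a type different from the element type. A common sketch is computing (sum, count) in a single pass and deriving the mean on the driver:

val nums6 = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)
val (sum, count) = nums6.aggregate((0, 0))(
  (acc, v) => (acc._1 + v, acc._2 + 1),   // seqOp: fold each element into the partition accumulator
  (a, b) => (a._1 + b._1, a._2 + b._2))   // combOp: merge partition accumulators
val mean = sum.toDouble / count           // 3.5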




collect - collect the elements
def collect(): Array[T]
def collect[U: ClassTag](f: PartialFunction[T, U]): RDD[U]
def toArray(): Array[T]

Converts the RDD into a Scala array and returns it. If you provide a standard map-function (i.e. f = T -> U) it will be applied before inserting the values into the result array. Note this method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.

----------
scala> val data=sc.parallelize(0 to 9,2)
data: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[3] at parallelize at <console>:24

scala> data.collect
res4: Array[Int] = Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)






collectAsMap - collect a key/value RDD as a Map
def collectAsMap(): Map[K, V]
Return the key-value pairs in this RDD to the master as a Map.

Warning: this doesn't return a multimap (so if you have multiple values to the same key, only one value per key is preserved in the map returned)
Note this method should only be used if the resulting data is expected to be small, as all the data is loaded into the driver's memory.

----------
scala> val pairRDD=sc.parallelize(List((1,"a"),(2,"b"),(3,"c"),(4,"d")),2)
pairRDD: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> pairRDD.collectAsMap
res0: scala.collection.Map[Int,String] = Map(2 -> b, 4 -> d, 1 -> a, 3 -> c)   





count - count the elements
def count(): Long
Return the number of elements in the RDD.

----------
scala> val pairRDD=sc.parallelize(List((1,"a"),(2,"b"),(3,"c"),(4,"d")),2)
pairRDD: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> pairRDD.count
res8: Long = 4




countByKey - count elements of a key/value RDD per key
def countByKey(): Map[K, Long]
Count the number of elements for each key, collecting the results to a local Map.

Note that this method should only be used if the resulting map is expected to be small, as the whole thing is loaded into the driver's memory. To handle very large results, consider using rdd.mapValues(_ => 1L).reduceByKey(_ + _), which returns an RDD[T, Long] instead of a map.

----------
scala> val pairRDD=sc.parallelize(List(("fruit","apple"),("fruit","banana"),("fruit","cherry"),("vegetable","bean"),("vegetable","cucumber"),("vegetable","pepper")),2)
pairRDD: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[6] at parallelize at <console>:24

scala> pairRDD.countByKey
res9: scala.collection.Map[String,Long] = Map(fruit -> 3, vegetable -> 3)





countByValue - count occurrences of each element
def countByValue()(implicit ord: Ordering[(K, V)] = null): Map[(K, V), Long]
Return the count of each unique value in this RDD as a local map of (value, count) pairs.

Returns a map that contains all unique values of the RDD and their respective occurrence counts.(Warning: This operation will finally aggregate the information in a single reducer.)

----------
This approach can be used to compute a word count, but note that it aggregates everything into a driver-side map and can run out of memory; a scalable alternative is sketched after the diagram below.

scala> val pairRDD=sc.parallelize(List(("fruit","apple"),("fruit","banana"),("fruit","cherry"),("vegetable","bean"),("vegetable","cucumber"),("vegetable","pepper")),2)
pairRDD: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[6] at parallelize at <console>:24

scala> pairRDD.countByValue
res10: scala.collection.Map[(String, String),Long] = Map((vegetable,bean) -> 1, (fruit,cherry) -> 1, (vegetable,cucumber) -> 1, (fruit,banana) -> 1, (fruit,apple) -> 1, (vegetable,pepper) -> 1)

(fruit,apple),(fruit,banana),(fruit,cherry)                =>  (fruit,apple)->1, (fruit,banana)->1, (fruit,cherry)->1
(vegetable,bean),(vegetable,cucumber),(vegetable,pepper)   =>  (vegetable,bean)->1, (vegetable,cucumber)->1, (vegetable,pepper)->1
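The scalable alternative mentioned above keeps the counts distributed instead of collecting them into a driver-side Map. A sketch:

val bigPairs = sc.parallelize(List(("fruit","apple"), ("fruit","apple"), ("fruit","banana")), 2)
bigPairs.map((_, 1L)).reduceByKey(_ + _).collect   // ((fruit,apple),2), ((fruit,banana),1); ordering may vary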





first - get the first element
def first(): T
Return the first element in this RDD.

----------
scala> val words=sc.parallelize(List("first","second","third"),1)
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[16] at parallelize at <console>:24

scala> words.first
res12: String = first



glom - return the contents of each partition
def glom(): RDD[Array[T]]
Return an RDD created by coalescing all elements within each partition into an array. Assembles an array that contains all elements of the partition and embeds it in an RDD. Each returned array contains the contents of one partition.

----------
Each second-level Array holds the contents of one partition. Note that glom produces a new RDD of arrays; collecting it returns all the data to the driver, which can cause an out-of-memory problem if the dataset is large.

scala> val num=sc.parallelize(0 to 10,4)
num: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[17] at parallelize at <console>:24

scala> num.glom.collect
res14: Array[Array[Int]] = Array(Array(0, 1), Array(2, 3, 4), Array(5, 6, 7), Array(8, 9, 10))



fold - fold the elements
def fold(zeroValue: T)(op: (T, T) ⇒ T): T

Aggregate the elements of each partition, and then the results for all the partitions, using a given associative function and a neutral "zero value". The function op(t1, t2) is allowed to modify t1 and return it as its result value to avoid object allocation; however, it should not modify t2.

This behaves somewhat differently from fold operations implemented for non-distributed collections in functional languages like Scala. This fold operation may be applied to partitions individually, and then fold those results into the final result, rather than apply the fold to each element sequentially in some defined ordering. For functions that are not commutative, the result may differ from that of a fold applied to a non-distributed collection.

zeroValue
     the initial value for the accumulated result of each partition for the op operator, and also the initial value for the combine results from different partitions for the op operator - this will typically be the neutral element (e.g. Nil for list concatenation or 0 for summation)

op
     an operator used to both accumulate results within a partition and combine results from different partitions

----------
scala> val words=sc.parallelize(List("A","B","C","D"),2)
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[19] at parallelize at <console>:24

scala> words.glom.collect
res15: Array[Array[String]] = Array(Array(A, B), Array(C, D))

scala> words.fold(" |")(_+"."+_)
res17: String = " |. |.A.B. |.C.D"

scala> words.fold(" ")(_+"."+_)
res18: String = " . .A.B. .C.D"
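With a numeric zero value fold behaves like a sum, and it also shows that the zero value is applied once per partition and once more when combining, which is why it should be a neutral element. A small numeric sketch:

val four = sc.parallelize(1 to 4, 2)
four.fold(0)(_ + _)   // 10
four.fold(1)(_ + _)   // 13: the extra 1 enters each of the 2 partitions and the final combine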




foreach - process each element
def foreach(f: (T) ⇒ Unit): Unit
Applies a function f to all elements of this RDD.

----------
Unlike map, foreach only applies the function to each element for its side effects and does not return a new RDD, whereas map returns a new RDD.

scala> val words=sc.parallelize(List("A","B","C","D"),2)
words: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[21] at parallelize at <console>:24

scala> words.foreach(x=>println(x+" is a letter."))
A is a letter.
C is a letter.
D is a letter.
B is a letter.




lookup - look up the values for a key
def lookup(key: K): Seq[V]

Return the list of values in the RDD for key key. This operation is done efficiently if the RDD has a known partitioner by only searching the partition that the key maps to.

----------
scala> val pairs=sc.parallelize(List("apple","banana","berry","cherry","cumquat","haw"),2).keyBy(_.length)
pairs: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[23] at keyBy at <console>:24

scala> pairs.collect
res21: Array[(Int, String)] = Array((5,apple), (6,banana), (5,berry), (6,cherry), (7,cumquat), (3,haw))

scala> pairs.lookup(5)
res22: Seq[String] = WrappedArray(apple, berry)




max - maximum
def max()(implicit ord: Ordering[T]): T

Returns the max of this RDD as defined by the implicit Ordering[T]. returns the maximum element of the RDD

----------
scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.max
res23: Int = 9




min - minimum
def min()(implicit ord: Ordering[T]): T

Returns the min of this RDD as defined by the implicit Ordering[T]. returns the minimum element of the RDD

----------
scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.min
res24: Int = 0



partitions - the partitions of the RDD
final def partitions: Array[Partition]

Get the array of partitions of this RDD, taking into account whether the RDD is checkpointed or not.

----------
To query the number of partitions of an RDD, rdd.partitions.length is frequently used and very handy.
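A minimal check, using a throwaway RDD:

sc.parallelize(0 to 9, 4).partitions.length   // Int = 4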


take - get the first n elements
def take(num: Int): Array[T]

Take the first num elements of the RDD. It works by first scanning one partition, and use the results from that partition to estimate the number of additional partitions needed to satisfy the limit.
Note due to complications in the internal implementation, this method will raise an exception if called on an RDD of Nothing or Null. This method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.

----------
scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.take(4)
res25: Array[Int] = Array(0, 1, 2, 3)



takeOrdered - get the smallest n elements in order
def takeOrdered(num: Int)(implicit ord: Ordering[T]): Array[T]

Returns the first k (smallest) elements from this RDD as defined by the specified implicit Ordering[T] and maintains the ordering. This does the opposite of top. For example:
num     k, the number of elements to return
ord     the implicit ordering for T
returns     an array of top elements

Note
this method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.

----------
sc.parallelize(Seq(10, 4, 2, 12, 3)).takeOrdered(1)
// returns Array(2)

sc.parallelize(Seq(2, 3, 4, 5, 6)).takeOrdered(2)
// returns Array(2, 3)


scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.takeOrdered(3)
res26: Array[Int] = Array(0, 1, 2)



takeSample - take a random sample of n elements
def takeSample(withReplacement: Boolean, num: Int, seed: Long = Utils.random.nextLong): Array[T]

Return a fixed-size sampled subset of this RDD in an array.

withReplacement     whether sampling is done with replacement
num     size of the returned sample
seed     seed for the random number generator
returns     sample of specified size in an array

Note
this method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.

----------

scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.takeSample(false,3,5)
res28: Array[Int] = Array(4, 3, 5)



top - get the largest n elements
def top(num: Int)(implicit ord: Ordering[T]): Array[T]

Returns the top k (largest) elements from this RDD as defined by the specified implicit Ordering[T] and maintains the ordering. This does the opposite of takeOrdered. For example:
num     k, the number of top elements to return
ord     the implicit ordering for T
returns     an array of top elements

Note
this method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.

----------
sc.parallelize(Seq(10, 4, 2, 12, 3)).top(1)
// returns Array(12)

sc.parallelize(Seq(2, 3, 4, 5, 6)).top(2)
// returns Array(6, 5)

scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.top(2)
res29: Array[Int] = Array(9, 8)



saveAsObjectFile - save as a file of serialized objects
def saveAsObjectFile(path: String): Unit
Save this RDD as a SequenceFile of serialized objects.

----------
scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.saveAsObjectFile("~/obj")



saveAsTextFile - save as a text file
def saveAsTextFile(path: String, codec: Class[_ <: CompressionCodec]): Unit
Save this RDD as a compressed text file, using string representations of elements.

def saveAsTextFile(path: String): Unit
Save this RDD as a text file, using string representations of elements.

----------
scala> val nums=sc.parallelize(0 to 9,1)
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26] at parallelize at <console>:24

scala> nums.saveAsTextFile("~/text")



saveAsNewAPIHadoopFile - save as a Hadoop file (new API)
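The original post gives no example here. A minimal sketch, assuming Hadoop's TextOutputFormat and an illustrative output path:

// Save a key/value RDD with the new Hadoop OutputFormat API.
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat

val kv = sc.parallelize(List((1, "a"), (2, "b")), 1)
  .map { case (k, v) => (new IntWritable(k), new Text(v)) }
kv.saveAsNewAPIHadoopFile[TextOutputFormat[IntWritable, Text]]("/tmp/newapi-out")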



saveAsNewAPIHadoopDataset - save to a Hadoop dataset (new API)
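Again no example in the original. A minimal sketch that drives the output through a Hadoop Job configuration; the formats and path are illustrative choices:

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}

val job = Job.getInstance(sc.hadoopConfiguration)
job.setOutputKeyClass(classOf[IntWritable])
job.setOutputValueClass(classOf[Text])
job.setOutputFormatClass(classOf[TextOutputFormat[IntWritable, Text]])
FileOutputFormat.setOutputPath(job, new Path("/tmp/newapi-dataset-out"))

val kv = sc.parallelize(List((1, "a"), (2, "b")), 1)
  .map { case (k, v) => (new IntWritable(k), new Text(v)) }
kv.saveAsNewAPIHadoopDataset(job.getConfiguration)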





Caching operators

cache - cache the RDD
def cache(): RDD.this.type

Persist this RDD with the default storage level (MEMORY_ONLY).

----------
scala> val num=sc.parallelize(0 to 9,1)
num: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[5] at parallelize at <console>:24

scala> nums.collect
res33: Array[Int] = Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)

scala> val result=num.map(x=>x*x)
result: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[36] at map at <console>:26

scala> result.count
res35: Long = 11

scala> result.cache
res36: result.type = MapPartitionsRDD[36] at map at <console>:26

scala> result.count
res37: Long = 11
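When the cached data is no longer needed it can be released explicitly; a one-line sketch:

result.unpersist()   // drop the cached blocks; the RDD can still be recomputed from its lineage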



checkpoint - checkpoint the RDD
def checkpoint(): Unit

Mark this RDD for checkpointing. It will be saved to a file inside the checkpoint directory set with SparkContext#setCheckpointDir and all references to its parent RDDs will be removed. This function must be called before any job has been executed on this RDD. It is strongly recommended that this RDD is persisted in memory, otherwise saving it on a file will require recomputation.

----------
scala> val rdd=sc.makeRDD(1 to 9,2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[37] at makeRDD at <console>:24

scala> val flatMapRDD=rdd.flatMap(x=>Seq(x,x))
flatMapRDD: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[38] at flatMap at <console>:26

scala> flatMapRDD.collect
res39: Array[Int] = Array(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9)

scala> sc.setCheckpointDir("my_checkpoint")

scala> flatMapRDD.checkpoint

scala> flatMapRDD.dependencies.head.rdd
res42: org.apache.spark.rdd.RDD[_] = ParallelCollectionRDD[37] at makeRDD at <console>:24

scala> flatMapRDD.collect
res43: Array[Int] = Array(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9)

scala> flatMapRDD.dependencies.head.rdd
res44: org.apache.spark.rdd.RDD[_] = ParallelCollectionRDD[37] at makeRDD at <console>:24
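A sketch of two RDD methods that confirm whether checkpointing has taken effect after an action has run:

flatMapRDD.isCheckpointed      // Boolean
flatMapRDD.getCheckpointFile   // Option[String] pointing into the checkpoint directory, if any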



persist - persist the RDD
def persist(newLevel: StorageLevel): RDD.this.type
Set this RDD's storage level to persist its values across operations after the first time it is computed. This can only be used to assign a new storage level if the RDD does not have a storage level set yet. Local checkpointing is an exception.

----------
scala> num.collect
res45: Array[Int] = Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

scala> num.getStorageLevel
res48: org.apache.spark.storage.StorageLevel = StorageLevel(1 replicas)

scala> num.persist()
res50: num.type = ParallelCollectionRDD[17] at parallelize at <console>:24

scala> num.getStorageLevel
res51: org.apache.spark.storage.StorageLevel = StorageLevel(memory, deserialized, 1 replicas)

scala> num.coalesce(2,false)
res52: org.apache.spark.rdd.RDD[Int] = CoalescedRDD[39] at coalesce at <console>:27

scala> num.glom.collect
res53: Array[Array[Int]] = Array(Array(0, 1), Array(2, 3, 4), Array(5, 6, 7), Array(8, 9, 10))

scala> num.persist()
res54: num.type = ParallelCollectionRDD[17] at parallelize at <console>:24

scala> num.getStorageLevel
res55: org.apache.spark.storage.StorageLevel = StorageLevel(memory, deserialized, 1 replicas)
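persist also accepts an explicit storage level; a sketch on a fresh RDD, since the level can only be set once per RDD:

import org.apache.spark.storage.StorageLevel

val big = sc.parallelize(0 to 9, 2)
big.persist(StorageLevel.MEMORY_AND_DISK_SER)   // spill serialized blocks to disk if memory is short
big.getStorageLevel                             // reflects the chosen level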





