RDD中的转换和action(二)PairRDD操作
来源:互联网 发布:java免费书籍下载txt 编辑:程序博客网 时间:2024/05/17 20:35
package RDD

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Demonstrates the common pair-RDD (key/value RDD) transformations and actions:
 * reduceByKey, groupByKey, mapValues, flatMapValues, keys/values, sortByKey,
 * subtract/join/leftOuterJoin/rightOuterJoin/cogroup, and the actions
 * countByValue, countByKey, collectAsMap and lookup.
 *
 * Created by legotime on 2016/5/5.
 */
object pairRDD {

  /**
   * Tags every String element with the index of the partition it lives in.
   * Intended for use with mapPartitionsWithIndex to visualise partitioning.
   */
  def myfunc1(index: Int, iter: Iterator[(String)]): Iterator[String] = {
    iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
  }

  /**
   * Same as myfunc1 but for (Int, String) pairs.
   */
  def myfunc2(index: Int, iter: Iterator[(Int, String)]): Iterator[String] = {
    iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("pair RDD").setMaster("local")
    val sc = new SparkContext(conf)

    // A plain (non-pair) RDD spread over 2 partitions.
    val SingleRDD = sc.parallelize(List("scala", "python", "java", "Spark", "hadoop"), 2)
    SingleRDD.mapPartitionsWithIndex(myfunc1).collect.foreach(println)
    //[partID:0, val: scala]
    //[partID:0, val: python]
    //[partID:1, val: java]
    //[partID:1, val: Spark]
    //[partID:1, val: hadoop]

    //----------------------------- operations on a single pair RDD -------------------------------
    // Key every word by its length, producing a pair RDD of (length, word).
    val pairRDD = SingleRDD.map(x => (x.length, x))
    pairRDD.mapPartitionsWithIndex(myfunc2).collect.foreach(println)
    //[partID:0, val: (5,scala)]
    //[partID:0, val: (6,python)]
    //[partID:1, val: (4,java)]
    //[partID:1, val: (5,Spark)]
    //[partID:1, val: (6,hadoop)]

    // Merge the values of entries that share the same key.
    // Values are combined within each partition first, then across partitions.
    pairRDD.reduceByKey(_ + _).collect().foreach(println)
    //(4,java)
    //(6,pythonhadoop)
    //(5,scalaSpark)

    // Group the values that share the same key.
    pairRDD.groupByKey().collect.foreach(println)
    //(4,CompactBuffer(java))
    //(6,CompactBuffer(python, hadoop))
    //(5,CompactBuffer(scala, Spark))

    // Apply a function to each value, leaving the key untouched.
    pairRDD.mapValues(x => "I am " + x).collect.foreach(println)
    //(5,I am scala)
    //(6,I am python)
    //(4,I am java)
    //(5,I am Spark)
    //(6,I am hadoop)

    // flatMapValues flattens each produced value; a String flattens to its
    // characters, so every character becomes its own (key, char) pair.
    pairRDD.flatMapValues(x => "I am " + x).collect.foreach(print)
    //(5,I)(5, )(5,a)(5,m)(5, )(5,s)(5,c)(5,a)(5,l)(5,a)(6,I)(6, )(6,a)(6,m)(6, )(6,p)(6,y)(6,t)(6,h)(6,o)(6,n)(4,I)(4, )(4,a)(4,m)(4, )(4,j)(4,a)(4,v)(4,a)(5,I)(5, )(5,a)(5,m)(5, )(5,S)(5,p)(5,a)(5,r)(5,k)(6,I)(6, )(6,a)(6,m)(6, )(6,h)(6,a)(6,d)(6,o)(6,o)(6,p)

    pairRDD.keys.collect.foreach(println)
    pairRDD.values.collect.foreach(println)

    pairRDD.sortByKey().collect.foreach(println)
    //(4,java)
    //(5,scala)
    //(5,Spark)
    //(6,python)
    //(6,hadoop)

    //----------------------------- operations between two pair RDDs ------------------------------
    val tempPairRDD = sc.parallelize(List((5, "flink")))
    tempPairRDD.collect.foreach(println)

    pairRDD.subtract(tempPairRDD).mapPartitionsWithIndex(myfunc2).collect.foreach(println)

    pairRDD.join(tempPairRDD).collect.foreach(println)

    // Left outer join: every key of pairRDD is kept; the right-hand value is an Option.
    pairRDD.leftOuterJoin(tempPairRDD).collect.foreach(println)
    //(4,(java,None))
    //(6,(python,None))
    //(6,(hadoop,None))
    //(5,(scala,Some(flink)))
    //(5,(Spark,Some(flink)))

    // Right outer join: every key of tempPairRDD is kept; the left-hand value is an Option.
    pairRDD.rightOuterJoin(tempPairRDD).collect.foreach(println)
    //(5,(Some(scala),flink))
    //(5,(Some(Spark),flink))

    // cogroup pairs up, per key, the full collection of values from each RDD.
    pairRDD.cogroup(tempPairRDD).collect.foreach(println)
    //(4,(CompactBuffer(java),CompactBuffer()))
    //(6,(CompactBuffer(python, hadoop),CompactBuffer()))
    //(5,(CompactBuffer(scala, Spark),CompactBuffer(flink)))

    //============================== pair-RDD actions ===================================
    val ActionRDD = sc.parallelize(List((1, 2), (3, 4), (5, 6), (1, 6)), 2)

    // Count how many times each (key, value) pair occurs.
    ActionRDD.countByValue.foreach(println)
    //((1,2),1)
    //((5,6),1)
    //((1,6),1)
    //((3,4),1)

    // Count the number of elements per key.
    ActionRDD.countByKey.foreach(println)
    //(1,2)
    //(3,1)
    //(5,1)

    // Collect as a Map: duplicate keys keep only one value (here (1,2) is lost to (1,6)).
    ActionRDD.collectAsMap().foreach(println)
    //(5,6)
    //(1,6)
    //(3,4)

    // Return all values associated with key 1.
    ActionRDD.lookup(1).foreach(println)
    //2
    //6

    // Release the SparkContext's resources before exiting.
    sc.stop()
  }
}
//更多操作参考:http://homepage.cs.latrobe.edu.au/zhe/ZhenHeSparkRDDAPIExamples.html
0 0
- RDD中的转换和action(二)PairRDD操作
- spark RDD算子(十)之PairRDD的Action操作countByKey, collectAsMap
- RDD中的转换和action(一)基本函数
- spark的RDD中的action(执行)和transformation(转换)两种操作中常见函数介绍
- spark rdd详解二(transformation与action操作)
- spark——pairRDD的简单操作(二)
- spark RDD transformation和action操作
- Spark核心编程:操作RDD(transformation和action案例实战)
- rdd常见转换操作
- [2.2]Spark DataFrame操作(二)之通过反射实现RDD与DataFrame的转换
- [2.3]Spark DataFrame操作(二)之通过编程动态完成RDD与DataFrame的转换
- “戏”说Spark-Spark核心-RDD转换操作算子详解(二)
- Spark PairRDD 转化二
- Spark总结(三)——RDD的Action操作
- pairRDD的join操作
- spark RDD算子(十一)之RDD Action 保存操作saveAsTextFile,saveAsSequenceFile,saveAsObjectFile,saveAsHadoopFile 等
- RDD和DataFrame转换(Java+Scala)
- RDD和DataFrame转换
- hdu 2058 The sum problem
- PID,UID,sharedUserId以及Android开启多进程模式
- Cordova 6.1 + ionic 安装和配置
- 前端资料小备份
- CI框架切换语言包
- RDD中的转换和action(二)PairRDD操作
- 网上的常见的画虚线方法~
- 分类器性能评估之Lift和Gain(3)
- 第一次用Editplus写程序
- SQLiteDatabase类:数据库对象
- volatile 变量作用
- 小菜鸟开发工作滴滴总结
- POJ 2752 Seek the Name, Seek the Fame
- x86中断编程