键值对RDD
来源:互联网 发布:linux系统关闭selinux 编辑:程序博客网 时间:2024/06/06 12:35
键值对RDD也就是JavaPairRDD,通常用来进行聚合计算。
键值对RDD的创建,及常用操作
/** * Created by hbin on 2016/12/9. */import java.io.Serializable;import java.util.Arrays;import java.util.List;import breeze.optimize.linear.LinearProgram;import io.netty.util.internal.StringUtil;import org.apache.commons.lang.StringUtils;import org.apache.spark.SparkConf;import org.apache.spark.api.java.JavaDoubleRDD;import org.apache.spark.api.java.JavaPairRDD;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.api.java.function.*;import scala.Tuple2;/** * spark对数据的核心抽象 RDD(弹性分布式数据集) * RDD就是分布式的元素集合,在spark中对数据的所有操作不外乎创建RDD * 转化已有RDD以及调用RDD操作进行求值,spark会自动将RDD中的数据分发到集群上, * 并将操作并行化 */public class BasicMap { public static void main(String[] args) throws Exception { SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi"); JavaSparkContext jsc = new JavaSparkContext(sparkConf); JavaRDD<String> RDD1=jsc.parallelize(Arrays.asList("1","2","3","4","5","6","7","8","9"),6);// 并行化 PairFunction<String,String,String> keyData=new PairFunction<String, String, String>() { @Override public Tuple2<String, String> call(String s) throws Exception { return new Tuple2<>(s.split(" ")[0],s); } }; JavaPairRDD<String,String> pairs=RDD1.mapToPair(keyData); JavaPairRDD<String,String> newPairs=pairs.mapValues(new Function<String, String>() { @Override public String call(String s) throws Exception { return s+"*"; } }); System.out.println("newPairs="+newPairs.collect()); System.out.println(" pairs.collect()="+pairs.collect()+" RDD1="+RDD1.collect()); System.out.println(" groupByKey()="+pairs.groupByKey().collect()); System.out.println("keys()="+pairs.keys().collect()); System.out.println("values()="+pairs.values().collect()); System.out.println("sortByKey()="+pairs.sortByKey().collect()); }}执行结果:
newPairs=[(1,1*), (2,2*), (3,3*), (4,4*), (5,5*), (6,6*), (7,7*), (8,8*), (9,9*)]
pairs.collect()=[(1,1), (2,2), (3,3), (4,4), (5,5), (6,6), (7,7), (8,8), (9,9)] RDD1=[1, 2, 3, 4, 5, 6, 7, 8, 9]
groupByKey()=[(6,[6]), (7,[7]), (1,[1]), (8,[8]), (2,[2]), (9,[9]), (3,[3]), (4,[4]), (5,[5])]
keys()=[1, 2, 3, 4, 5, 6, 7, 8, 9]
values()=[1, 2, 3, 4, 5, 6, 7, 8, 9]
sortByKey()=[(1,1), (2,2), (3,3), (4,4), (5,5), (6,6), (7,7), (8,8), (9,9)]
针对两个pair RDD的转化操作
代码示例
/**
 * Created by hbin on 2016/12/9.
 */
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import breeze.optimize.linear.LinearProgram;
import io.netty.util.internal.StringUtil;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;

/**
 * spark对数据的核心抽象 RDD (Resilient Distributed Dataset).
 *
 * Demonstrates the two-pair-RDD transformations:
 * subtractByKey, join, rightOuterJoin, leftOuterJoin, cogroup.
 */
public class BasicMap {
    public static void main(String[] args) throws Exception {
        SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        try {
            // Two source RDDs with partially overlapping keys (1..7 overlap).
            JavaRDD<String> RDD1 = jsc.parallelize(
                    Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9"), 6);
            JavaRDD<String> RDD2 = jsc.parallelize(
                    Arrays.asList("A", "B", "C", "D", "E", "F", "G", "H", "J",
                            "1", "2", "3", "4", "5", "6", "7"), 6);

            // Key by the first space-delimited token (the element itself here).
            PairFunction<String, String, String> keyData1 =
                    new PairFunction<String, String, String>() {
                        @Override
                        public Tuple2<String, String> call(String s) throws Exception {
                            return new Tuple2<>(s.split(" ")[0], s);
                        }
                    };
            // NOTE(review): "x".split("")[0] yields the first character on
            // Java 8+, but an empty leading string on Java 7 — confirm the
            // target JRE before relying on this.
            PairFunction<String, String, String> keyData2 =
                    new PairFunction<String, String, String>() {
                        @Override
                        public Tuple2<String, String> call(String s) throws Exception {
                            return new Tuple2<>(s.split("")[0], s);
                        }
                    };

            JavaPairRDD<String, String> pairsRDD1 = RDD1.mapToPair(keyData1);
            // FIX: local variable was misspelled "parirsRDD2"; renamed for
            // readability. Print labels are left exactly as the original so
            // the recorded execution output below still matches.
            JavaPairRDD<String, String> pairsRDD2 = RDD2.mapToPair(keyData2);

            System.out.println("pairsRDD1="+pairsRDD1.collect());
            System.out.println("pairsRDD2="+pairsRDD2.collect());
            // subtractByKey: pairs of RDD1 whose key is absent from RDD2.
            System.out.println("pairsRDD1.subtractByKey(parirsRDD2).collect()="+pairsRDD1.subtractByKey(pairsRDD2).collect());
            // join: inner join on key — only keys present in both RDDs.
            System.out.println("pairsRDD1.join(parirsRDD2)="+pairsRDD1.join(pairsRDD2).collect());
            // Outer joins wrap the possibly-missing side in Optional.
            System.out.println("pairsRDD1.rightOuterJoin(parirsRDD2)="+pairsRDD1.rightOuterJoin(pairsRDD2).collect());
            System.out.println("pairsRDD1.leftOuterJoin(parirsRDD2)="+pairsRDD1.leftOuterJoin(pairsRDD2).collect());
            // cogroup: for every key in either RDD, the pair of value-lists.
            System.out.println("pairsRDD1.cogroup(parirsRDD2)="+pairsRDD1.cogroup(pairsRDD2).collect());
        } finally {
            // FIX: the original never stopped the context; always release the
            // SparkContext so the application terminates cleanly.
            jsc.stop();
        }
    }
}
执行结果:
pairsRDD1=[(1,1), (2,2), (3,3), (4,4), (5,5), (6,6), (7,7), (8,8), (9,9)]
pairsRDD2=[(A,A), (B,B), (C,C), (D,D), (E,E), (F,F), (G,G), (H,H), (J,J), (1,1), (2,2), (3,3), (4,4), (5,5), (6,6), (7,7)]
pairsRDD1.subtractByKey(parirsRDD2).collect()=[(8,8), (9,9)]
pairsRDD1.join(parirsRDD2)=[(6,(6,6)), (7,(7,7)), (1,(1,1)), (2,(2,2)), (3,(3,3)), (4,(4,4)), (5,(5,5))]
pairsRDD1.rightOuterJoin(parirsRDD2)=[(B,(Optional.absent(),B)), (6,(Optional.of(6),6)), (H,(Optional.absent(),H)), (7,(Optional.of(7),7)), (C,(Optional.absent(),C)), (1,(Optional.of(1),1)), (2,(Optional.of(2),2)), (J,(Optional.absent(),J)), (D,(Optional.absent(),D)), (3,(Optional.of(3),3)), (E,(Optional.absent(),E)), (4,(Optional.of(4),4)), (F,(Optional.absent(),F)), (G,(Optional.absent(),G)), (5,(Optional.of(5),5)), (A,(Optional.absent(),A))]
pairsRDD1.leftOuterJoin(parirsRDD2)=[(6,(6,Optional.of(6))), (7,(7,Optional.of(7))), (1,(1,Optional.of(1))), (8,(8,Optional.absent())), (2,(2,Optional.of(2))), (9,(9,Optional.absent())), (3,(3,Optional.of(3))), (4,(4,Optional.of(4))), (5,(5,Optional.of(5)))]
pairsRDD1.cogroup(parirsRDD2)=[(B,([],[B])), (6,([6],[6])), (H,([],[H])), (7,([7],[7])), (C,([],[C])), (1,([1],[1])), (8,([8],[])), (2,([2],[2])), (J,([],[J])), (D,([],[D])), (9,([9],[])), (3,([3],[3])), (E,([],[E])), (4,([4],[4])), (F,([],[F])), (G,([],[G])), (5,([5],[5])), (A,([],[A]))]
- Spark-键值对RDD
- 键值对RDD
- Spark 键值对RDD操作
- Spark入门(五):键值对RDD
- 键值对RDD的创建方式
- Spark 的键值对(pair RDD)操作,Scala实现
- Spark学习之键值对(pair RDD)操作
- spark RDD算子(四)之创建键值对RDD mapToPair flatMapToPair
- Spark学习之键值对(pair RDD)操作(3)
- spark RDD算子(五)之键值对聚合操作 combineByKey
- spark RDD算子(六)之键值对聚合操作reduceByKey,foldByKey,排序操作sortByKey
- spark RDD算子(七)之键值对分组操作 groupByKey,cogroup
- spark RDD算子(八)之键值对关联操作 subtractByKey, join, rightOuterJoin, leftOuterJoin
- Spark之键值RDD转换
- 键值对
- Spark常用函数讲解--键值RDD转换
- Spark常用函数之键值RDD转换
- 关于键值对
- JDBC的PreparedStatement使用like
- macOS 10.12.2终于修复鼠标驱动了
- php中文件的处理
- xcode6.1新建一个ios程序,为什么自动运行viewController这个类
- Oracle用户、权限、角色管理
- 键值对RDD
- 当spring 容器初始化完成后执行某个方法
- 解析 GetRerootedSIL
- cookie和session的区别
- 传智播客成功挂牌新三板,即将开启颠覆式民办高等教育
- Java - 搭建 Spring MVC 框架
- python os.path模块函数中文说明
- springmvc,mybatis,freemarker,maven-基于注解的整合
- 驱动是否编译进内核