Spark PairRDD API
import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;

/**
 * Created by Frank on 2017/8/3.
 */
// NOTE: JavaPairRDD exposes everything JavaRDD does, plus the pair-specific operations shown below.
public class PairRddDemo {

    static SparkConf conf = new SparkConf().setAppName("app").setMaster("local");
    static JavaSparkContext sc = new JavaSparkContext(conf);

    public static void main(String[] a) {
        List<Integer> list = Arrays.asList(1, 2, 3, 3, 2, 1, 6, 6, 3);
        JavaRDD<Integer> rdd = sc.parallelize(list);
        JavaPairRDD pairRDD = rdd.mapToPair(new PairFunction() {
            public Tuple2 call(Object o) throws Exception {
                return new Tuple2(o, 1);
            }
        });
        JavaPairRDD pairRDD2 = rdd.mapToPair(new PairFunction() {
            public Tuple2 call(Object o) throws Exception {
                return new Tuple2(o, 2);
            }
        });

        // Transformations on a single pair RDD
        eachPrint(groupBykey(pairRDD));
        eachPrint(reduceByKey(pairRDD));
        eachPrint(combineByKey(pairRDD));
        eachPrint(mapValues(pairRDD));
        eachPrint(flatMapValues(pairRDD));
        eachPrint(pairRDD.keys());
        eachPrint(pairRDD.values());
        eachPrint(pairRDD.sortByKey());

        // Transformations on two pair RDDs
        // subtractByKey drops every pair whose key also appears in the other RDD
        eachPrint(pairRDD.subtractByKey(pairRDD2));
        // join builds the Cartesian product of the values sharing a key;
        // rightOuterJoin and leftOuterJoin work the same way
        eachPrint(pairRDD.join(pairRDD2));
        // lookup returns all values of one specific key as a List
        eachPrint(sc.parallelize(pairRDD.lookup(1)));

        // Partitioning: a good partitioning pays off for join, the *ByKey operations, and the like,
        // because the work for each key can then be finished on a single machine, which cuts
        // network traffic. The line below hash-partitions into at most 100 partitions. Persist
        // after partitioning, otherwise the RDD is recomputed with its original distribution next time.
        pairRDD.partitionBy(new HashPartitioner(100)).persist(StorageLevel.MEMORY_AND_DISK());

        // Sometimes we also want the result of a transformation to stay partitioned, e.g. we would
        // like map to return its output in the same partitions as the parent RDD. Most Spark methods
        // do not preserve the parent's partitioner; only these pair RDD methods keep it:
        // cogroup, groupWith, join (all variants), groupByKey, reduceByKey, combineByKey, sort,
        // mapValues/flatMapValues (when the parent is partitioned), filter (when the parent is partitioned)
    }

    // groupByKey collects the values of each key into a list, returning a smaller pair RDD
    public static JavaPairRDD groupBykey(JavaPairRDD pairRDD) {
        return pairRDD.groupByKey();
    }

    // reduceByKey folds the values of each key into a single value, returning a smaller pair RDD
    public static JavaPairRDD reduceByKey(JavaPairRDD pairRDD) {
        return pairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer o, Integer o2) throws Exception {
                return o + o2;
            }
        });
    }

    // combineByKey processes the values of each key; here it prepares a per-key mean
    // by building (sum, count) tuples
    public static JavaPairRDD combineByKey(JavaPairRDD pairRDD) {
        return pairRDD.combineByKey(
                // All three functions operate on the values of one key.
                // 1st function: when a key is seen for the first time in a partition,
                // create the initial tuple (value, 1)
                new Function<Integer, Tuple2<Integer, Integer>>() {
                    public Tuple2<Integer, Integer> call(Integer o) throws Exception {
                        return new Tuple2<Integer, Integer>(o, 1);
                    }
                },
                // 2nd function: for every further occurrence of the key in the same partition,
                // add the value to the running sum and increment the count
                new Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>>() {
                    public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> o, Integer o2) throws Exception {
                        return new Tuple2<Integer, Integer>(o._1() + o2, o._2() + 1);
                    }
                },
                // 3rd function: the 2nd function runs independently on each partition, so the
                // partial results still have to be combined; this merges the sums and the counts.
                // The final result per key is (sum of values, number of values), enough for a mean.
                new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
                    public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> o, Tuple2<Integer, Integer> o2) throws Exception {
                        return new Tuple2<Integer, Integer>(o._1() + o2._1(), o._2() + o2._2());
                    }
                });
    }
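    // A minimal sketch (hypothetical helper, not part of the original demo): the (sum, count)
    // tuples produced by combineByKey above can be turned into per-key means with mapValues.
    public static JavaPairRDD averageByKey(JavaPairRDD pairRDD) {
        return combineByKey(pairRDD).mapValues(new Function<Tuple2<Integer, Integer>, Double>() {
            public Double call(Tuple2<Integer, Integer> sumCount) throws Exception {
                // mean = sum of the values / number of the values
                return sumCount._1() / (double) sumCount._2();
            }
        });
    }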
    // mapValues transforms only the values, keys are untouched; returns a pair RDD of the same size
    public static JavaPairRDD mapValues(JavaPairRDD pairRDD) {
        return pairRDD.mapValues(new Function<Integer, String>() {
            public String call(Integer o) throws Exception {
                return o.toString() + "st";
            }
        });
    }

    // flatMapValues flatMaps only the values, returning an enlarged pair RDD
    public static JavaPairRDD flatMapValues(JavaPairRDD pairRDD) {
        return pairRDD.flatMapValues(new Function<Integer, Iterable>() {
            public List call(Integer o) throws Exception {
                return Arrays.asList(o.toString() + "st", "extra");
            }
        });
    }

    public static void eachPrint(JavaPairRDD rdd) {
        System.out.println("-------------------------------------------");
        rdd.foreach(new VoidFunction() {
            public void call(Object o) throws Exception {
                Tuple2 tuple2 = (Tuple2) o;
                System.out.println(tuple2._1 + ":" + tuple2._2);
            }
        });
    }

    public static void eachPrint(JavaRDD rdd) {
        System.out.println("-------------------------------------------");
        rdd.foreach(new VoidFunction() {
            public void call(Object s) throws Exception {
                System.out.println(s);
            }
        });
    }
}
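To make the partitioning comment in main concrete: a pair RDD that takes part in many joins can be partitioned once and persisted, and later joins then reuse that layout instead of reshuffling it. A minimal sketch for the end of main above; the variable name partitioned and the partition count 4 are arbitrary choices, not from the original demo:

        // Hash-partition once and persist, so this side is not reshuffled for every join.
        JavaPairRDD partitioned = pairRDD
                .partitionBy(new HashPartitioner(4))
                .persist(StorageLevel.MEMORY_AND_DISK());
        // These joins can reuse the partitioning of the persisted side;
        // only pairRDD2 still has to be shuffled to the matching partitions.
        eachPrint(partitioned.join(pairRDD2));
        eachPrint(partitioned.leftOuterJoin(pairRDD2));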
-------------------------------------------
hello frank
hello david
-------------------------------------------
[[hello frank]]
[[hello david]]
[[hi lidazhao]]
[[hi lidazhao]]
-------------------------------------------
hello frank
hello david
hi lidazhao
-------------------------------------------
hello
frank
hello
david
hi
lidazhao
hi
lidazhao
-------------------------------------------
-------------------------------------------
hello frank
hello david
[[hello frank]]
[[hello david]]
[[hi lidazhao]]
[[hi lidazhao]]
-------------------------------------------
hello frank
hello david
-------------------------------------------
hi lidazhao
hi lidazhao
-------------------------------------------
hello frank:hello frank
hello frank:hello david
hello david:hello frank
hello david:hello david
hi lidazhao:hello frank
hi lidazhao:hello david
hi lidazhao:hello frank
hi lidazhao:hello david
-------------------------------------------
[hello frank, hello david, hi lidazhao, hi lidazhao]
-------------------------------------------
4
-------------------------------------------
{hello frank=1, hello david=1, hi lidazhao=2}
-------------------------------------------
[hi lidazhao, hi lidazhao, hello frank]
-------------------------------------------
[hello frank, hello david, hi lidazhao]
-------------------------------------------
[hello david, hello frank, hi lidazhao]
-------------------------------------------
[hi lidazhao]
-------------------------------------------
hello frank+hello david+hi lidazhao+hi lidazhao
-------------------------------------------
chushi+chushi+hello frank+hello david+hi lidazhao+hi lidazhao
-------------------------------------------
44
-------------------------------------------
(15,5)
-------------------------------------------
(15,5)