Spark RDD API
来源:互联网 发布:excel重复数据合计 编辑:程序博客网 时间:2024/05/21 11:03
import org.apache.spark.SparkConf;import org.apache.spark.api.java.JavaDoubleRDD;import org.apache.spark.api.java.JavaPairRDD;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.api.java.function.*;import scala.Tuple2;import java.io.Serializable;import java.security.SecureRandom;import java.util.Arrays;import java.util.Iterator;import java.util.List;/** * Created by Frank on 2017/8/2. */public class RddDemo{ static SparkConf conf = new SparkConf().setAppName("app").setMaster("local"); static JavaSparkContext sc = new JavaSparkContext(conf); public static void main(String[] a){ List<String> list= Arrays.asList("hello frank","hello david","hi lidazhao","hi lidazhao"); JavaRDD<String> rdd = sc.parallelize(list); //单个rdd处理 eachPrint(filter(rdd)); eachPrint(map(rdd)); eachPrint(distinct(rdd)); eachPrint(flatmap(rdd)); eachPrint(sample(rdd)); //处理两个rdd eachPrint(union(filter(rdd),map(rdd))); eachPrint(intersection(filter(rdd),rdd)); eachPrint(subtract(rdd,filter(rdd))); eachPrint(cartesian(rdd,filter(rdd))); //行动操作 返回实际类型而不是rdd print(rdd.collect());//list print(rdd.count());//long print(rdd.countByValue());//map print(rdd.top(3));//list print(rdd.take(3)); print(rdd.takeOrdered(3)); print(rdd.takeSample(false,1)); print(reduce(rdd)); print(fold(rdd)); print(aggregate2(rdd)); //JavaDoubleRDD相当于JavaRDD<Double>,但是又专门封装了数学方法如max sum mean期望 stdev标准差 variance方差 print(aggregate3(sc.parallelize(Arrays.asList(1,2,3,4,5)).mapToPair(new PairFunction<Integer,Integer,Integer>() { public Tuple2<Integer, Integer> call(Integer integer) throws Exception { return new Tuple2<Integer, Integer>(integer,1); } }))); print(aggregate4(sc.parallelize(Arrays.asList(1,2,3,4,5)))); } //filter 过滤含有hello的 public static JavaRDD filter(JavaRDD rdd){ return rdd.filter(new Function<String, Boolean>() { public Boolean call(String s) throws Exception { return s.contains("hello"); } }); } //map 对每一个String加[[]]的处理 public static JavaRDD map(JavaRDD rdd){ return rdd.map(new Function<String,String>() { public String call(String s) throws Exception { return "[["+s+"]]"; } }); } //distinct 去重复 public static JavaRDD distinct(JavaRDD rdd){ return rdd.distinct(); } //flatmap 一对多返回 public static JavaRDD flatmap(JavaRDD rdd){ return rdd.flatMap(new FlatMapFunction<String,String>() { public Iterator<String> call(String s) throws Exception { return Arrays.asList(s.split(" ")).iterator(); } }); } //sample 随机抽样设定概率 public static JavaRDD sample(JavaRDD rdd){ return rdd.sample(false,0.3); } //union 合并(不去重复) public static JavaRDD union(JavaRDD rdd1,JavaRDD rdd2){ return rdd1.union(rdd2); } //intersection 交集 public static JavaRDD intersection(JavaRDD rdd1,JavaRDD rdd2){ return rdd1.intersection(rdd2); } //subtract 从rdd1中移除rdd2的 public static JavaRDD subtract(JavaRDD rdd1,JavaRDD rdd2){ return rdd1.subtract(rdd2); } //cartesian 笛卡尔积 public static JavaPairRDD cartesian(JavaRDD rdd1, JavaRDD rdd2){ return rdd1.cartesian(rdd2); } //reduce a+b+c+e+d... public static String reduce(JavaRDD rdd){ return rdd.reduce(new Function2<String,String,String>() { public String call(String o, String o2) throws Exception { return o+"+"+o2; } }).toString(); } //fold 类似于reduce不过要提供初值先进行一次运算 public static String fold(JavaRDD rdd){ return rdd.fold("chushi", new Function2<String,String,String>() { public String call(String o, String o2) throws Exception { return o+"+"+o2; } }).toString(); } //aggregate 也类似于reduce但是运算返回值可以不同于输入值 public static Object aggregate(JavaRDD rdd){ return rdd.aggregate(new kv("",0), new Function2<kv,String,kv>() { public kv call(kv o, String o2) throws Exception { o.count++; o.word+=o2; return o; } }, new Function2<kv,kv,kv>() { public kv call(kv o, kv o2) throws Exception { o.word+=o2.word; o.count+=o2.count; return o; } }); } public static Object aggregate2(JavaRDD rdd){ return rdd.aggregate(0, new Function2<Integer, String, Integer>() { public Integer call(Integer o, String o2) throws Exception { return o + o2.length(); } }, new Function2<Integer, Integer, Integer>() { public Integer call(Integer o, Integer o2) throws Exception { return o+o2; } }); } public static Object aggregate3(JavaPairRDD rdd){ return rdd.aggregate(new Tuple2<Integer,Integer>(0,0), new Function2<Tuple2<Integer,Integer>, Tuple2<Integer,Integer>, Tuple2<Integer,Integer>>() { public Tuple2<Integer,Integer> call(Tuple2<Integer,Integer> o, Tuple2<Integer,Integer> o2) throws Exception { return new Tuple2<Integer, Integer>(o._1+o2._1,o._2+o2._2); } }, new Function2<Tuple2<Integer,Integer>, Tuple2<Integer,Integer>, Tuple2<Integer,Integer>>() { public Tuple2<Integer,Integer> call( Tuple2<Integer,Integer> o, Tuple2<Integer,Integer> o2) throws Exception { return new Tuple2<Integer, Integer>(o._1+o2._1,o._2+o2._2); } }); } public static Object aggregate4(JavaRDD rdd){ return rdd.aggregate(new Tuple2<Integer, Integer>(0, 0), new Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>>() { public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> o, Integer o2) throws Exception { return new Tuple2<Integer, Integer>(o._1() + o2, o._2() + 1); } }, new Function2<Tuple2<Integer,Integer>,Tuple2<Integer,Integer>,Tuple2<Integer,Integer>>() { public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> o, Tuple2<Integer, Integer> o2) throws Exception { return new Tuple2<Integer, Integer>(o._1+o2._1,o._2+o2._2); } } ); } public static class kv implements Serializable { String word; Integer count; kv(String word,Integer count){ this.word=word; this.count=count; } public String toString(){ return word+"\r\n"+count; } } public static void eachPrint(JavaRDD rdd){ System.out.println("-------------------------------------------"); rdd.foreach(new VoidFunction<String>() { public void call(String s) throws Exception { System.out.println(s); } }); } public static void eachPrint(JavaPairRDD<String,String> rdd){ System.out.println("-------------------------------------------"); rdd.foreach(new VoidFunction<Tuple2<String, String>>() { public void call(Tuple2<String, String> stringStringTuple2) throws Exception { System.out.println(stringStringTuple2._1+":"+stringStringTuple2._2); } }); } public static void print(Object object){ System.out.println("-------------------------------------------"); System.out.println(object); }}
-------------------------------------------hello frankhello david-------------------------------------------[[hello frank]][[hello david]][[hi lidazhao]][[hi lidazhao]]-------------------------------------------hello frankhello davidhi lidazhao-------------------------------------------hellofrankhellodavidhilidazhaohilidazhao-------------------------------------------hi lidazhaohi lidazhao-------------------------------------------hello frankhello david[[hello frank]][[hello david]][[hi lidazhao]][[hi lidazhao]]-------------------------------------------hello frankhello david-------------------------------------------hi lidazhaohi lidazhao-------------------------------------------hello frank:hello frankhello frank:hello davidhello david:hello frankhello david:hello davidhi lidazhao:hello frankhi lidazhao:hello davidhi lidazhao:hello frankhi lidazhao:hello david-------------------------------------------[hello frank, hello david, hi lidazhao, hi lidazhao]-------------------------------------------4-------------------------------------------{hello frank=1, hello david=1, hi lidazhao=2}-------------------------------------------[hi lidazhao, hi lidazhao, hello frank]-------------------------------------------[hello frank, hello david, hi lidazhao]-------------------------------------------[hello david, hello frank, hi lidazhao]-------------------------------------------[hi lidazhao]-------------------------------------------hello frank+hello david+hi lidazhao+hi lidazhao-------------------------------------------chushi+chushi+hello frank+hello david+hi lidazhao+hi lidazhao-------------------------------------------44-------------------------------------------(15,5)-------------------------------------------(15,5)
阅读全文
0 0
- spark RDD API详解
- Spark-RDD API
- spark rdd api
- spark rdd api
- Spark RDD API详解
- spark-rdd-api
- Spark RDD API 详解
- Spark RDD API详解
- spark rdd操作API
- Spark RDD API
- Spark RDD API
- spark RDD api
- Spark 之RDD API大全
- 【实践】Spark RDD API实战
- Spark RDD---api(map&reduce)
- Spark RDD API 基本操作
- Spark RDD API扩展开发2 : 自定义RDD
- Spark RDD API扩展开发(1)
- Java中Synchronized的用法
- 浅谈最短路中的Dijskra算法
- 链表创建队列
- openSession和getCurrentSession区别
- KMP 算法(1):如何理解 KMP
- Spark RDD API
- 批量提取一个文件夹中的文件名
- BZOJ 4806 炮(DP)
- cvc-complex-type.2.4.a: Invalid content was found starting with element 'async-supported'. 错误
- Excel在统计分析中的应用—第三章—数据库统计函数与数据透视表-Part1-(数据查询与筛选、分类汇总)
- 欢迎使用CSDN-markdown编辑器
- UVALive 6575 Odd and Even Zeroes
- Disruptor并发框架入门
- Spark PairRDD API