Spark基础实例
来源:互联网 发布:swmm模型软件 编辑:程序博客网 时间:2024/04/28 14:50
spark 操作的几个步骤
1 数据关联 textFile 和 parallelize
2 转换操作(JavaRDD和JavaPairRDD他们可以通过mapToPair and flatMapToPair转换)
3 action操作,获取数据结果
一、wordcount的例子
- //单词统计
- public static void wordCount(JavaSparkContext ctx ){
- String filePath = "e://log1.log";
- JavaRDD<String> lines = ctx.textFile(filePath, 1);
- JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
- @Override
- public Iterable<String> call(String s) {
- return Arrays.asList(SPACE.split(s));
- }
- });
- JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
- @Override
- public Tuple2<String, Integer> call(String s) {
- return new Tuple2<String, Integer>(s, 1);
- }
- });
- JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
- @Override
- public Integer call(Integer i1, Integer i2) {
- return i1 + i2;
- }
- });
- List<Tuple2<String, Integer>> output = counts.collect();
- for (Tuple2<?, ?> tuple : output) {
- System.out.println(tuple._1() + " : " + tuple._2());
- }
- }
二、 各种Transformations 和action测试准备数据
- public static void testMap(JavaSparkContext ctx) {
- List<Integer> data = Arrays.asList(1, 2, 3, 4, 5,1,2,3);
- JavaRDD<Integer> distData = ctx.parallelize(data);
- List<Integer> data2 = Arrays.asList(1, 2, 3, 4, 5,6);
- JavaRDD<Integer> distData2 = ctx.parallelize(data2);
- List<String> data3 = Arrays.asList("wang zhan,xiao ming,li xin,wang qiang,e,f".split(","));
- JavaRDD<String> data3RDD = ctx.parallelize(data3);
- JavaRDD<Integer> lineLengths ;
- JavaRDD<String> returnStr;
- JavaPairRDD<String, Integer> returnStr2;
- JavaPairRDD<String, Integer> returnStr3;
- // lineLengths = map( distData);
- // lineLengths = filter(distData );
- // lineLengths = sample(distData );
- // lineLengths = union(distData, distData2) ;
- // lineLengths = intersection(distData, distData2) ;
- // lineLengths = distinct(distData ) ;
- // returnStr = flatMap(data3RDD); //数据扁平打散
- // returnStr3 = mapToPair(returnStr);// 数据变成键值对的形式
- // reduceByKey(returnStr3); //对key进行分组计算
- // List<Integer> list = lineLengths.collect();
- // WordCount.print(list);
- // List<String> list2 = returnStr.collect() ;
- // WordCount.print(list2);
- // testPersist( data3RDD );
- // List<Tuple2<String, Integer>> listTuple = returnStr2.collect();
- // printTuple( listTuple );
- // returnStr3 = reduceByKey(returnStr2 );
- // List<Tuple2<String, Integer>> listTuple2 = returnStr3.collect();
- // printTuple( listTuple2 );
- //reduceByKey(returnStr2 );
- //action
- //reduce( returnStr2);
- // count(returnStr2 );
- }
三、groupByKey
- //对key进行分组处理,但如果需要统计求和则最好不要这样处理
- private static JavaPairRDD<String, Integer> groupByKey(JavaPairRDD<String, Integer> returnStr3) {
- JavaPairRDD<String, Iterable<Integer>> rdd = returnStr3.groupByKey();
- return null;
- }
四、数据去重
- //数据去重
- private static JavaRDD<Integer> distinct(JavaRDD<Integer> distData) {
- JavaRDD<Integer> rdd3 = distData.distinct( );
- print( rdd3);
- return null;
- }
五、交集数据
- //获取rdd数据的交集 数据
- private static JavaRDD<Integer> intersection(JavaRDD<Integer> distData,
- JavaRDD<Integer> distData2) {
- JavaRDD<Integer> rdd3 = distData.intersection(distData2 );
- print(rdd3 );
- return null;
- }
六数据持久化
- /**
- * 持久化数据
- * @param data3RDD
- */
- private static void testPersist(JavaRDD<String> data3RDD) {
- System.out.println( "持久化数据到目录。。。");
- data3RDD.persist(StorageLevel.MEMORY_ONLY());
- // data3RDD.checkpoint();
- // data3RDD.isCheckpointed() ;
- }
七、count统计
- //计算统计 元素两两传入到reduce中然后计算统计
- private static void count(JavaPairRDD<String, Integer> returnStr2) {
- System.out.println("元素总数:"+returnStr2.count() ); //获取元素总数
- System.out.println("元素总数:"+returnStr2.first() );//获取第一个元素
- System.out.println("元素总数:"+returnStr2.take(2) );//获取RDD的前2个元素
- System.out.println("元素countByKey总数:"+returnStr2.countByKey( ) ); //根据key进行统计数量
- //returnStr2.saveAsTextFile("E://test.txt") ;//数据保存到文件中
- returnStr2.foreach(new VoidFunction<Tuple2<String,Integer>>() {
- @Override
- public void call(Tuple2<String, Integer> t) throws Exception {
- System.out.println( t );
- }
- });
- }
八、reduce操作
- //计算统计 元素两两传入到reduce中然后计算统计
- private static void reduce(JavaPairRDD<String, Integer> returnStr2) {
- Tuple2<String, Integer> t = returnStr2.reduce(
- new Function2<Tuple2<String,Integer>, Tuple2<String,Integer>, Tuple2<String,Integer>>() {
- @Override
- public Tuple2<String, Integer> call( Tuple2<String, Integer> v1, Tuple2<String, Integer> v2) throws Exception {
- System.out.println( v1 +" "+v2 );
- return new Tuple2<String, Integer>(v1._1+v2._1,v1._2+v2._2() );
- }
- } );
- System.out.println("reduce结果 :"+t._1 +" "+t._2);
- }
九、reduceBykey使用
- //将原来的RDD每一个行数据 变成一个数组,然后所有的数组数据存到一个总得RDD数组中
- public static JavaPairRDD<String, Integer> reduceByKey(JavaPairRDD<String, Integer> rdd ) {
- JavaPairRDD<String, Integer> counts = rdd.reduceByKey(
- new Function2<Integer, Integer, Integer>() {//泛型分别是 :两个计算参数 ,最后是返回值
- @Override
- public Integer call(Integer i1, Integer i2) {//每次把key相同的数据,与上一次执行的结果,依次传进来计算
- System.out.println(i1+" == "+i2);
- return i1 + i2;
- }
- });
- printPair(counts );
- return counts;
- }
十 flatMap使用
- //将原来的RDD每一个行数据 变成一个数组,然后所有的数组数据存到一个总得RDD数组中
- public static JavaRDD<String> flatMap(JavaRDD<String> rdd ) {
- JavaRDD<String> lineLengths = rdd.flatMap( //返回值是输出的类型
- new FlatMapFunction<String, String>() { //第一个参数是输入,第二个参数是输出
- @Override
- public Iterable<String> call(String str) throws Exception {
- return Arrays.asList(SPACE.split(str));
- }
- }) ;
- printStr(lineLengths );
- return lineLengths;
- }
十一、mapToPair使用
- // 将普通的RDD转换为 map数据的RDD方便计算处理 , a a->1
- public static JavaPairRDD<String, Integer> mapToPair(JavaRDD<String> rdd ){
- JavaPairRDD<String, Integer> ones = rdd.mapToPair( new PairFunction<String, String, Integer>() {
- @Override
- public Tuple2<String, Integer> call(String t) throws Exception {
- return new Tuple2<String, Integer>( t, 1);
- }
- }) ;
- return ones ;
- }
十二、 合并Rdd
- // 合并rdd
- public static JavaRDD<Integer> union(JavaRDD<Integer> rdd , JavaRDD<Integer> rdd2) {
- JavaRDD<Integer> lineLengths = rdd.union(rdd2) ;
- print(lineLengths);
- return lineLengths;
- }
十三、抽样
- // Return a sampled subset of this RDD. 返回一个RDD子集抽样
- public static JavaRDD<Integer> sample(JavaRDD<Integer> rdd) {
- JavaRDD<Integer> lineLengths = rdd.sample(false , 0.4 ) ;
- print(lineLengths);
- return lineLengths;
- }
十四、map使用
- // map 源中的每一个元素都进行一个函数操作,生成一个新的RDD ,即每个元素进行一次转换
- public static JavaRDD<Integer> map(JavaRDD<Integer> rdd) {
- JavaRDD<Integer> lineLengths = rdd.map(new Function<Integer, Integer>() {
- @Override
- public Integer call(Integer v1) throws Exception {
- return v1+1;
- }
- });
- print( lineLengths);
- return lineLengths;
- }
十五、打印数据
- /**
- * 如果rdd中数据过多,则调用take获取一部分数据打印
- * 1 直接rdd.foreach 打印rdd的数据,数据打印在各个executor中
- * 2 调用rdd.collect.foreach 打印数据,数据打印在driver上
- * @param lineLengths
- */
- public static void print(JavaRDD<Integer> lineLengths ){
- // lineLengths.foreach( new VoidFunction<Integer>() {
- // @Override
- // public void call(Integer t) throws Exception {
- // }
- // });
- System.out.println("开始打印");
- lineLengths.collect().forEach( new Consumer<Integer>(){
- @Override
- public void accept(Integer t) {
- System.out.println( t);
- }
- });
- System.out.println("结束打印");
- }
- public static void printStr(JavaRDD<String> lineLengths ){
- System.out.println("开始打印");
- lineLengths.collect().forEach( new Consumer<String>(){
- @Override
- public void accept(String t) {
- System.out.println( t);
- }
- });
- System.out.println("结束打印");
- }
- /**
- * 打印tuble
- * @param lineLengths
- */
- public static void printPair( JavaPairRDD<String, Integer> rdd ){
- System.out.println("开始打印");
- rdd.collect().forEach( new Consumer<Tuple2<String,Integer>>(){
- @Override
- public void accept(Tuple2<String, Integer> t) {
- System.out.println( t._1() +" "+t._2() );
- }
- });
- System.out.println("结束打印");
- }
- public static void print(List list) {
- if (list == null || list.size() == 0) {
- return;
- }
- for (int i = 0; i < list.size(); i++) {
- System.out.println(list.get(i));
- }
- }
- /**
- * 打印map的RDD
- * @param listTuple
- */
- public static void printTuple(List<Tuple2<String, Integer>> listTuple) {
- if (listTuple == null || listTuple.size() == 0) {
- return;
- }
- for (int i = 0; i < listTuple.size(); i++) {
- System.out.println(listTuple.get(i));
- }
- }
十六、过滤
- //对每一个元素进行过滤,然后返回 ,false 的数据会被过滤掉
- public static JavaRDD<Integer> filter(JavaRDD<Integer> rdd) {
- JavaRDD<Integer> lineLengths = rdd.filter(new Function<Integer, Boolean>() {
- @Override
- public Boolean call(Integer v1) throws Exception {
- if (v1 != 1) {
- return true;
- }
- return false;
- }
- });
- print( lineLengths);
- return lineLengths;
- }
0 0
- Spark基础实例
- Spark实例
- Spark基础transformation操作实例(Java版)
- Spark基础transformation操作实例(Scala版)
- Spark基础
- spark基础
- spark基础
- spark 基础
- Spark基础
- spark基础
- Spark基础
- Spark基础
- Spark基础
- Spark基础-为什么选择spark
- Spark Streaming实例分析
- Spark Streaming实例分析
- spark简单实例
- Spark使用实例
- STM32F103对于固件库V3.5.0打开和关闭总中断
- Why Uber Engineering Switched from Postgres to MySQL
- IOS iPhone开发中发送e-mail的3种方式
- spring mvc数据绑定
- iOS GCD完全解析
- Spark基础实例
- 【HD 1108】变形课
- arcgis flex 地图开发入门实践(适合完全不懂的)
- 调参建议
- liunx 安装 redis
- linux 软连接
- 十分钟玩转 jQuery、实例大全
- android6.0 power显示(亮度等)深入分析(二)DisplayManagerService
- Selenium WebDriver使用经验杂记