Spark Operators (3)
Point 1: RepartitionOperator
package com.spark.operator;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class RepartitionOperator {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RepartitionOperator")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // The repartition operator can increase or decrease the number of
        // partitions of an RDD arbitrarily; coalesce can only decrease it.
        // A classic use case: when Spark SQL queries data from Hive, it sets
        // the partition count of the loaded RDD from the number of HDFS
        // blocks backing the Hive table's files. We cannot configure that
        // default ourselves, and sometimes the automatic choice is too low.
        // To optimize, we can raise the parallelism by calling repartition
        // on the RDD.

        // The company is adding departments.
        List<String> staffList = Arrays.asList("xuruyun1", "xuruyun2", "xuruyun3",
                "xuruyun4", "xuruyun5", "xuruyun6",
                "xuruyun7", "xuruyun8", "xuruyun9",
                "xuruyun10", "xuruyun11", "xuruyun12");

        JavaRDD<String> staffRDD = sc.parallelize(staffList, 3);
        JavaRDD<String> staffRDD2 = staffRDD.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<String>, Iterator<String>>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Iterator<String> call(Integer index, Iterator<String> iterator)
                            throws Exception {
                        List<String> list = new ArrayList<String>();
                        while (iterator.hasNext()) {
                            String staff = iterator.next();
                            list.add("department[" + (index + 1) + "]" + staff);
                        }
                        return list.iterator();
                    }
                }, true);

        for (String staffInfo : staffRDD2.collect()) {
            System.out.println(staffInfo);
        }

        JavaRDD<String> staffRDD3 = staffRDD2.repartition(6);
        JavaRDD<String> staffRDD4 = staffRDD3.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<String>, Iterator<String>>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Iterator<String> call(Integer index, Iterator<String> iterator)
                            throws Exception {
                        List<String> list = new ArrayList<String>();
                        while (iterator.hasNext()) {
                            String staff = iterator.next();
                            list.add("department[" + (index + 1) + "]" + staff);
                        }
                        return list.iterator();
                    }
                }, true);

        for (String staffInfo : staffRDD4.collect()) {
            System.out.println(staffInfo);
        }

        sc.close();
    }
}
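Since the comments above stress that coalesce can only shrink the partition count, here is a minimal sketch of the contrast, reusing the 3-partition staffRDD built above; the target partition counts are illustrative assumptions, not part of the original example:

        // Hedged sketch: contrasting coalesce and repartition on the
        // 3-partition staffRDD from the example above.
        JavaRDD<String> narrowed = staffRDD.coalesce(2);    // 3 -> 2, no shuffle by default
        JavaRDD<String> widened = staffRDD.repartition(6);  // 3 -> 6, always shuffles

        System.out.println(narrowed.partitions().size());   // 2
        System.out.println(widened.partitions().size());    // 6

        // Asking coalesce to *grow* the partition count (without enabling
        // its shuffle flag) is a no-op: the RDD keeps its 3 partitions.
        JavaRDD<String> unchanged = staffRDD.coalesce(6);
        System.out.println(unchanged.partitions().size());  // 3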
Point 2: ReduceOperator
package com.spark.operator;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class ReduceOperator {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("ReduceOperator")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // A collection holding the five numbers 1 through 5; we sum them
        // with reduce.
        List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> numbers = sc.parallelize(numberList);

        // How reduce works: the first and second elements are passed into
        // the call method to compute a result; that result is then combined
        // with each subsequent element in turn, and so on.
        int sum = numbers.reduce(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        System.out.println(sum);
        sc.close();
    }
}
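To make the pairwise folding concrete: for the list above the calls proceed as call(1, 2) = 3, call(3, 3) = 6, call(6, 4) = 10, call(10, 5) = 15. Because partitions are combined in no guaranteed order, the function passed to reduce should be associative and commutative; any such function works. A minimal sketch reusing the same numbers RDD (the max computation is an illustrative addition, not part of the original post):

        // Hedged sketch: reduce with a different associative, commutative
        // function, reusing the numbers RDD from the example above.
        int max = numbers.reduce(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return Math.max(v1, v2);
            }
        });
        System.out.println(max); // prints 5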
Point 3: ReduceByKeyOperator
package com.spark.operator;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

// reduceByKey = groupByKey + reduce
// shuffle = map side + reduce side
// In Spark, reduceByKey comes with a built-in map-side combiner.
public class ReduceByKeyOperator {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("ReduceByKeyOperator")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, Integer>> scoreList = Arrays.asList(
                new Tuple2<String, Integer>("xuruyun", 150),
                new Tuple2<String, Integer>("liangyongqi", 100),
                new Tuple2<String, Integer>("wangfei", 100),
                new Tuple2<String, Integer>("wangfei", 80));

        JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(scoreList);
        rdd.reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        }).foreach(new VoidFunction<Tuple2<String, Integer>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Integer> tuple) throws Exception {
                System.out.println("name : " + tuple._1 + " score : " + tuple._2);
            }
        });

        sc.close();
    }
}
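The header comment equates reduceByKey with groupByKey followed by a reduce. As a hedged sketch of that longer path (this variant is illustrative, not in the original, and additionally needs import org.apache.spark.api.java.function.Function), the same per-name totals can be computed with groupByKey plus a manual sum. The key difference: groupByKey shuffles every raw value across the network, while reduceByKey pre-aggregates on the map side via its built-in combiner.

        // Hedged sketch: the same totals via groupByKey + mapValues. Unlike
        // reduceByKey, no map-side combine happens, so every raw score is
        // shuffled. Requires: import org.apache.spark.api.java.function.Function;
        JavaPairRDD<String, Integer> totals = rdd.groupByKey().mapValues(
                new Function<Iterable<Integer>, Integer>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Integer call(Iterable<Integer> scores) throws Exception {
                        int sum = 0;
                        for (Integer score : scores) {
                            sum += score;
                        }
                        return sum;
                    }
                });

        for (Tuple2<String, Integer> tuple : totals.collect()) {
            System.out.println("name : " + tuple._1 + " score : " + tuple._2);
        }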