Spark JavaAggregate:自定义数据处理

来源:互联网 发布:苹果mac系统更新 编辑:程序博客网 时间:2024/04/29 17:38
import org.apache.kafka.common.metrics.stats.Avg;import org.apache.spark.SparkConf;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.api.java.function.Function2;import java.io.Serializable;import java.util.Arrays;/** * 求平均值 * Created by hadoop on 17-2-23. */public class JavaAggregate {    public static void main(String[] args) throws  Exception {        SparkConf conf = new SparkConf().setAppName("Aggregate");        JavaSparkContext sc = new JavaSparkContext(conf);        JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1,2,3,4,5,6));        class AvgCount implements Serializable {            public int total;            public int num;            public AvgCount(int total, int num) {                this.total = total;                this.num = num;            }            public double avg() {                return total / (double) num;            }        }        //增加数据        Function2<AvgCount, Integer, AvgCount> addAndCount = new Function2<AvgCount, Integer, AvgCount>() {            @Override            public AvgCount call(AvgCount avgCount, Integer integer) throws Exception {                avgCount.total += integer;                avgCount.num += 1;                return avgCount;            }        };        //将数据加在一起        Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {            @Override            public AvgCount call(AvgCount avgCount, AvgCount avgCount2) throws Exception {                avgCount.total += avgCount2.total;                avgCount.num += avgCount2.num;                return avgCount;            }        };        //需要传入一个单位元用于运算的初始化        AvgCount initial = new AvgCount(0, 0);        AvgCount result = rdd.aggregate(initial, addAndCount, combine);        System.out.println(result.avg());    }}
0 0
原创粉丝点击