<spark>flatmap 和 map

来源:互联网 发布:平面设计和淘宝美工 编辑:程序博客网 时间:2024/05/25 12:22

flatmap

import org.apache.spark.SparkConf;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.api.java.function.FlatMapFunction;import java.util.Arrays;/** * Created by hadoop on 17-2-23. */public class JavaSplit {    public static void main(String[] args) throws Exception{        SparkConf conf = new SparkConf().setAppName("Split");        JavaSparkContext sc = new JavaSparkContext(conf);        JavaRDD<String> lines = sc.parallelize(Arrays.asList("Hello world","Hello Leo01"));        JavaRDD<String> words = lines.flatMap(                new FlatMapFunction<String, String>() {                    @Override                    public Iterable<String> call(String s) throws Exception {                        return Arrays.asList(s.split(" "));                    }                }        );        System.out.println(words.collect());    }}

map

import org.apache.spark.SparkConf;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.api.java.function.Function;import java.util.Arrays;/** * Created by hadoop on 17-2-23. */public class JavaPow {    public static void main(String[] args) throws Exception{        SparkConf conf = new SparkConf().setAppName("Pow");        JavaSparkContext sc = new JavaSparkContext(conf);        JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1,2,3,4,5,6));        JavaRDD<Integer> result = rdd.map(new Function<Integer,Integer>(){            public Integer call(Integer x)            {                return x*x;            }        });        //System.out.println(StringUtils.join(result.collect(),","));        System.out.println(result.collect());    }}

flatMap 会把每条输入产生的多条结果展平合并成一个扁平的集合;而 map 对每条输入只产生一条对应的输出,结果的条数与输入条数一致,嵌套结构得以保留。
如:{"1 2","3 4"} 分别经过 flatMap 和 map 处理后的数据形式是
flatMap:{"1","2","3","4"};
map:{{"1","2"},{"3","4"}};

0 0
原创粉丝点击