Spark RDD 转化

来源:互联网 发布:sqlalchemy sql语句 编辑:程序博客网 时间:2024/06/04 00:50
package com.fei.simple_project;import java.util.Arrays;import org.apache.spark.SparkConf;import org.apache.spark.api.java.JavaDoubleRDD;import org.apache.spark.api.java.JavaPairRDD;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.api.java.function.DoubleFunction;import org.apache.spark.api.java.function.FlatMapFunction;import org.apache.spark.api.java.function.Function;import org.apache.spark.storage.StorageLevel;import scala.Tuple2;public class App {    public static void main( String[] args )    {        SparkConf conf = new SparkConf().setAppName("Simple Application");        JavaSparkContext sc = new JavaSparkContext(conf);                //使用parallelize创建RDD        JavaRDD<Integer> lines1 = sc.parallelize(Arrays.asList(1,2,3,4));        lines1.persist(StorageLevel.MEMORY_ONLY());        System.out.println("222222222Count is:"+lines1.count());                JavaRDD<String> lines2 = sc.parallelize(Arrays.asList("pandas", "like","i like pandas"));        lines2.persist(StorageLevel.MEMORY_ONLY());        System.out.println("333333333333Count is:"+lines2.count());                //转化操作        //map, 每个元素返回一个对象      JavaRDD<String> str1RDD = lines2.map(new Function<String, String>(){public String call(String x)  {return x.split(" ")[0];}      });      System.out.println("44444444444likeRDD Count is:"+str1RDD.count()+" "+ str1RDD.first());       for(String it:str1RDD.collect()){      System.out.println(it+"   ");      }      //flat map,所有元素放入一个对象中返回,返回可迭代内容      JavaRDD<String> flatMapRDD = lines2.flatMap(new FlatMapFunction<String, String>(){public Iterable<String> call(String x) throws Exception {return Arrays.asList(x.split(" "));}      }) ;      System.out.println("55555555flatMapRDD Count is:"+flatMapRDD.count()+" "+ flatMapRDD.first());            for(String it:flatMapRDD.collect()){      System.out.println(it+"   ");      }            //distinct,集合去重,不常用,开销大      JavaRDD<String> distinctRDD = 
flatMapRDD.distinct();      System.out.println("6666666666distinctRDD Count is:"+distinctRDD.count()+" "+ distinctRDD.first());            for(String it:distinctRDD.collect()){      System.out.println(it+"   ");      }      //union,合并,包括重复      JavaRDD<String> unionRDD = flatMapRDD.union(str1RDD);      System.out.println("777777777777777unionRDD Count is:"+unionRDD.count()+" "+ unionRDD.first());            for(String it:unionRDD.collect()){      System.out.println(it+"   ");      }      //intersection,返回共有的      JavaRDD<String> intersectionRDD = flatMapRDD.intersection(str1RDD);      System.out.println("88888888888intersectionRDD Count is:"+intersectionRDD.count()+" "+ intersectionRDD.first());            for(String it:intersectionRDD.collect()){      System.out.println(it+"   ");      }            //subtract,返回只在第一个中      JavaRDD<String> subtractRDD = flatMapRDD.subtract(str1RDD);      System.out.println("9999999999subtractRDD Count is:"+subtractRDD.count());            for(String it:subtractRDD.collect()){      System.out.println(it+"   ");      }                  //cartesian,返回笛卡尔积      JavaPairRDD<String, String> cartesianRDD = flatMapRDD.cartesian(str1RDD);      System.out.println("aaaaaaaaacartesianRDD Count is:"+cartesianRDD.count()+" "+ cartesianRDD.first());            for(Tuple2<String, String> it:cartesianRDD.collect()){      System.out.println(it._1+"   "+it._2+"\n");      }                  //mapToDouble,RDD类型转换      //其他转换还有:flatMapToDouble,flatMapToPair, mapToPair      JavaDoubleRDD doubleRDD = lines1.mapToDouble(new DoubleFunction<Integer>(){public double call(Integer x) throws Exception {return (double)x*x;}      });      System.out.println("aaaaaaaaadoubleRDD Count is:"+doubleRDD.count()+" "+ doubleRDD.first());      for(double it:doubleRDD.collect()){      System.out.println(it+"   ");      }       //mean是平均数      System.out.println("9999999999doubleRDD mean is:"+doubleRDD.mean());          }}

222222222Count is:4
333333333333Count is:3
44444444444likeRDD Count is:3 pandas
pandas
like
i
55555555flatMapRDD Count is:5 pandas
pandas
like
i
like
pandas
6666666666distinctRDD Count is:3 pandas
pandas
i
like
777777777777777unionRDD Count is:8 pandas
pandas
like
i
like
pandas
pandas
like
i
88888888888intersectionRDD Count is:3 pandas
pandas
i
like
9999999999subtractRDD Count is:0
aaaaaaaaacartesianRDD Count is:15 (pandas,pandas)
pandas   pandas

pandas   like

pandas   i

like   pandas

like   like

like   i

i   pandas

like   pandas

pandas   pandas

i   like

like   like

pandas   like

i   i

like   i

pandas   i

aaaaaaaaadoubleRDD Count is:4 1.0
1.0
4.0
9.0
16.0
9999999999doubleRDD mean is:7.5


0 0