Spark source code reading notes: Dataset (Part 2): Actions, basic functions, and transformations on Dataset

package Dataset

import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

/**
  * Created by legotime
  */
object dataSetOperation {
  case class Person(name: String, age: Long)

  val sparkSession = SparkSession.builder().appName("data set example")
    .master("local").getOrCreate()
  import sparkSession.implicits._

  val rdd = sparkSession.sparkContext.textFile("hdfs://master:9000/src/main/resources/people.txt")
  val dataSet = rdd.map(_.split(",")).map(p => Person(p(0), p(1).trim.toLong)).toDS()

  //--------------------------------------------------------------- Actions --------------------------------------

  def dataSet_collect() = {
    // Returns an array that contains all rows in this Dataset.
    dataSet.collect().foreach(println)
    /**
      * Person(Michael,29)
      * Person(Andy,30)
      * Person(Justin,19)
      */
  }

  def dataSet_collectAsList() = {
    // Returns a Java list that contains all rows in this Dataset.
    println(dataSet.collectAsList)
    /**
      * [Person(Michael,29), Person(Andy,30), Person(Justin,19)]
      */
  }

  def dataSet_count() = {
    // Returns the number of rows in the Dataset.
    println(dataSet.count())
    /**
      * 3
      */
  }

  def dataSet_describe() = {
    // Computes statistics for numeric columns, including count, mean, stddev, min, and max.
    dataSet.describe("name", "age").show
    /** This gives the basic statistics of the columns:
      * +-------+-------+-----------------+
      * |summary|   name|              age|
      * +-------+-------+-----------------+
      * |  count|      3|                3|
      * |   mean|   null|             26.0|
      * | stddev|   null|6.082762530298219|
      * |    min|   Andy|               19|
      * |    max|Michael|               30|
      * +-------+-------+-----------------+
      */
  }

  def dataSet_first() = {
    // Returns the first row.
    println(dataSet.first())
    /**
      * Person(Michael,29)
      */
  }

  def dataSet_foreachPartition() = {
    // Applies a function f to each partition of this Dataset.
    dataSet.foreachPartition { part =>
      println(part.toList)
    }
    /**
      * List(Person(Michael,29), Person(Andy,30), Person(Justin,19))
      */
  }

  def dataSet_head() = {
    // Returns the first n rows.
    dataSet.head(2).foreach(println)
    /**
      * Person(Michael,29)
      * Person(Andy,30)
      */
  }

  def dataSet_reduce() = {
    // (Scala-specific) Reduces the elements of this Dataset using the specified binary function.
    val data: Dataset[String] = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]
    println(data.reduce(_ + _))
    /**
      * Michael, 29Andy, 30Justin, 19
      */
  }

  def dataSet_show() = {
    println("---------- show() prints the top 20 rows by default ---------------")
    dataSet.show()
    println("---------- only showing top 2 rows ---------------")
    dataSet.show(2)
    println("--------- show(true) displays the top 20 rows in a tabular form, truncating long values ------------")
    dataSet.show(true)
  }

  def dataSet_toLocalIterator() = {
    // Returns an iterator that contains all rows in this Dataset.
    val tmp = dataSet.toLocalIterator()
    while (tmp.hasNext) {
      println(tmp.next())
    }
    /**
      * Person(Michael,29)
      * Person(Andy,30)
      * Person(Justin,19)
      */
  }

  //--------------------------------------------------------------- Basic Dataset functions ---------------------

  def dataSet_as() = {
    // Returns a new Dataset where each record has been mapped on to the specified type.
    val data: DataFrame = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt")
    data.as[String]
    data.show()
    /**
      * +-----------+
      * |      value|
      * +-----------+
      * |Michael, 29|
      * |   Andy, 30|
      * | Justin, 19|
      * +-----------+
      */
  }

  def dataSet_cache() = {
    dataSet.cache()
    /**
      * Caches the Dataset with the default storage level.
      */
  }

  def dataSet_columns() = {
    // Returns all column names as an array.
    dataSet.columns.foreach(println)
    /**
      * name
      * age
      */
  }

  def dataSet_createOrReplaceTempView() = {
    // Creates a temporary view using the given name.
    dataSet.createOrReplaceTempView("myPerson")
    val dataFrame = sparkSession.sql("SELECT name, age FROM myPerson WHERE age BETWEEN 13 AND 19")
    dataFrame.show()
    /**
      * +------+---+
      * |  name|age|
      * +------+---+
      * |Justin| 19|
      * +------+---+
      */
  }

  def dataSet_createTempView() = {
    // Creates a temporary view using the given name.
    dataSet.createOrReplaceTempView("myPerson")
    val dataFrame = sparkSession.sql("SELECT name, age FROM myPerson WHERE age BETWEEN 13 AND 19")
    dataFrame.map(teenager => "Name: " + teenager(0)).show()
    /**
      * +------------+
      * |       value|
      * +------------+
      * |Name: Justin|
      * +------------+
      */
  }

  def dataSet_dtypes() = {
    // Returns all column names and their data types as an array.
    dataSet.dtypes.foreach(println)
    /**
      * (name,StringType)
      * (age,LongType)
      */
  }

  def dataSet_explain() = {
    // Prints the physical plan to the console for debugging purposes.
    //dataSet.explain()
    /**
      * == Physical Plan ==
      * Scan ExistingRDD[name#2,age#3L]
      */
    // Prints the plans (logical and physical) to the console for debugging purposes.
    dataSet.explain(true)
    /**
      * == Parsed Logical Plan ==
      * LogicalRDD [name#2, age#3L]
      *
      * == Analyzed Logical Plan ==
      * name: string, age: bigint
      * LogicalRDD [name#2, age#3L]
      *
      * == Optimized Logical Plan ==
      * LogicalRDD [name#2, age#3L]
      *
      * == Physical Plan ==
      * Scan ExistingRDD[name#2,age#3L]
      */
  }

  def dataSet_inputFiles() = {
    println(dataSet.inputFiles.toList)
    // List()
  }

  def dataSet_isLocal() = {
    // Returns true if the collect and take methods can be run locally (without any Spark executors).
    dataSet.isLocal
    // false
  }

  def dataSet_isStreaming() = {
    dataSet.isStreaming
  }

  def dataSet_javaRDD() = {
    // Returns the content of the Dataset as a JavaRDD of Ts.
    println(dataSet.toJavaRDD)
    // MapPartitionsRDD[7] at toJavaRDD at dataSetOperation.scala:222
  }

  def dataSet_persist() = {
    // Persist this Dataset with the given storage level.
    dataSet.persist()
    /**
      * The default storage level is MEMORY_AND_DISK.
      */
  }

  def dataSet_printSchema() = {
    // Prints the schema to the console in a nice tree format.
    dataSet.printSchema()
    /**
      * root
      *  |-- name: string (nullable = true)
      *  |-- age: long (nullable = false)
      */
  }

  def dataSet_rdd() = {
    // Represents the content of the Dataset as an RDD of T.
    println(dataSet.rdd)
    /** Returns the underlying RDD:
      * MapPartitionsRDD[7] at rdd at dataSetOperation.scala:243
      */
  }

  def dataSet_schema() = {
    // Returns the schema of this Dataset.
    println(dataSet.schema)
    /**
      * StructType(StructField(name,StringType,true), StructField(age,LongType,false))
      */
  }

  def dataSet_toDF() = {
    // Converts this strongly typed collection of data to a generic DataFrame.
    dataSet.toDF().show()
    /**
      * +-------+---+
      * |   name|age|
      * +-------+---+
      * |Michael| 29|
      * |   Andy| 30|
      * | Justin| 19|
      * +-------+---+
      */
    // Converts this strongly typed collection of data to a generic DataFrame with columns renamed.
    dataSet.toDF("man", "ID").show() // the number of new names must match the number of columns
    /**
      * +-------+---+
      * |    man| ID|
      * +-------+---+
      * |Michael| 29|
      * |   Andy| 30|
      * | Justin| 19|
      * +-------+---+
      */
  }

  def dataSet_toJavaRDD() = {
    // Returns the content of the Dataset as a JavaRDD of Ts.
    println(dataSet.toJavaRDD)
    // MapPartitionsRDD[7] at toJavaRDD at dataSetOperation.scala:285
  }

  def dataSet_unpersist() = {
    // Mark the Dataset as non-persistent, and remove all blocks for it from memory and disk.
    dataSet.unpersist(true)
    //dataSet.unpersist()
  }

  def dataSet_write() = {
    // Interface for saving the content of the non-streaming Dataset out into external storage.
    dataSet.write
    /**
      * Experimental.
      */
  }

  def dataSet_writeStream() = {
    // Interface for saving the content of the streaming Dataset out into external storage.
    dataSet.writeStream
    /**
      * Experimental.
      */
  }

  def dataSet_registerTempTable() = {
    // Registers this Dataset as a temporary table using the given name.
    // The lifetime of this temporary table is tied to the SparkSession that was used to create this Dataset.
    dataSet.registerTempTable("myPerson")
    val dataFrame = sparkSession.sql("SELECT name, age FROM myPerson WHERE age BETWEEN 13 AND 19")
    dataFrame.map(teenager => "Name: " + teenager(0)).show()
    /** Deprecated: superseded by createOrReplaceTempView and to be removed in a later release.
      * +------------+
      * |       value|
      * +------------+
      * |Name: Justin|
      * +------------+
      */
  }

  //--------------------------------------------------------------- Typed transformations ----------------------

  def dataSet_AS() = {
    val tmpDS: Dataset[Person] = dataSet.as("oldDataSet")
  }

  def dataSet_alias() = {
    /** alias simply delegates to as:
      * def alias(alias: Symbol): Dataset[T] = as(alias)
      * def alias(alias: String): Dataset[T] = as(alias)
      *
      * [name: string, age: bigint]
      */
  }

  def dataSet_coalesce() = {
    // Internally: Repartition(numPartitions, shuffle = false, logicalPlan)
    // Resets the number of partitions of the Dataset, just like coalesce on an RDD.
    // Because it never shuffles, coalesce can only decrease the partition count;
    // the input here has a single partition, so coalesce(2) has no effect.
    def myfunc(index: Int, iter: Iterator[(Person)]): Iterator[String] = {
      iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
    }
    dataSet.coalesce(2).toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
    /**
      * [partID:0, val: Person(Michael,29)]
      * [partID:0, val: Person(Andy,30)]
      * [partID:0, val: Person(Justin,19)]
      */
    println(dataSet.coalesce(2).toJavaRDD.rdd.partitions.length)
    /**
      * 1
      */
  }

  def dataSet_distinct() = {
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime", 100)))
    val unionedDS = dataSet.union(tmpDataSet).union(tmpDataSet)
    unionedDS.show()
    /**
      * +--------+---+
      * |    name|age|
      * +--------+---+
      * | Michael| 29|
      * |    Andy| 30|
      * |  Justin| 19|
      * |legotime|100|
      * |legotime|100|
      * +--------+---+
      */
    unionedDS.distinct().show()
    /**
      * +--------+---+
      * |    name|age|
      * +--------+---+
      * |    Andy| 30|
      * |legotime|100|
      * | Michael| 29|
      * |  Justin| 19|
      * +--------+---+
      */
    // distinct shuffles (and therefore reorders) the data internally.
  }

  def dataSet_dropDuplicates() = {
    //def dropDuplicates(): Dataset[T]
    //def distinct(): Dataset[T]
    //def dropDuplicates(colNames: Array[String]): Dataset[T]
    // Returns a new Dataset with duplicate rows removed, considering only the subset of columns.
val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100),Person("lego",19)))    val unionedDS = dataSet.union(tmpDataSet).union(tmpDataSet)    unionedDS.show()    /**      * +--------+---+        |    name|age|        +--------+---+        | Michael| 29|        |    Andy| 30|        |  Justin| 19|        |legotime|100|        |    lego| 19|        |legotime|100|        |    lego| 19|        +--------+---+      */    unionedDS.dropDuplicates().show()    /**      * +--------+---+        |    name|age|        +--------+---+        |    Andy| 30|        |    lego| 19|        |legotime|100|        | Michael| 29|        |  Justin| 19|        +--------+---+      */    unionedDS.dropDuplicates("name").show()    /**      * +--------+---+        |    name|age|        +--------+---+        | Michael| 29|        |    Andy| 30|        |    lego| 19|        |legotime|100|        |  Justin| 19|        +--------+---+      */    unionedDS.dropDuplicates("age").show()    /**      * +--------+---+        |    name|age|        +--------+---+        | Michael| 29|        |  Justin| 19|        |legotime|100|        |    Andy| 30|        +--------+---+      */    unionedDS.dropDuplicates(Array("name","age")).show()    /**      * +--------+---+        |    name|age|        +--------+---+        |    Andy| 30|        |    lego| 19|        |legotime|100|        | Michael| 29|        |  Justin| 19|        +--------+---+      */  }  def dataSet_except()={    //Returns a new Dataset containing rows in this Dataset but not in another Dataset. This is equivalent to EXCEPT in SQL.    val tmpDataSet = sparkSession.createDataset(Seq(Person("Andy",30),Person("lego",19)))    dataSet.except(tmpDataSet).show()    /**      * +-------+---+        |   name|age|        +-------+---+        |Michael| 29|        | Justin| 19|        +-------+---+      */  }  def dataSet_filter()={    //def filter(func: (T) ⇒ Boolean): Dataset[T]    //def filter(conditionExpr: String): Dataset[T]    //def filter(condition: Column): Dataset[T]    dataSet.filter($"age" > 20).show()    dataSet.filter("age > 20").show()    /**      * +-------+---+        |   name|age|        +-------+---+        |Michael| 29|        |   Andy| 30|        +-------+---+      */  }  def dataSet_flatMap()={    //def flatMap[U](func: (T) ⇒ TraversableOnce[U])(implicit arg0: Encoder[U]): Dataset[U]    /**因为序列化问题,报错,期待后续开发      * dataSet.flatMap{ P =>          P.toString        }.show()      */    val tmpDataSet = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]    val words = tmpDataSet.flatMap(line => line.split(","))    words.show()    /**      * +-------+        |  value|        +-------+        |Michael|        |     29|        |   Andy|        |     30|        | Justin|        |     19|        +-------+      */    //可以说,只要你想对内部具体的值进行动刀,都离不开flatMap ,flatMap之后可以实现很多要求,比如如下:    words.map((word) =>(word,1)).show()    /**      * +-------+---+        |     _1| _2|        +-------+---+        |Michael|  1|        |     29|  1|        |   Andy|  1|        |     30|  1|        | Justin|  1|        |     19|  1|        +-------+---+      */    //在如下:    words.map((word) =>(word,1)).groupByKey(value => value).count().show()    /**      * +-----------+--------+        |        key|count(1)|        +-----------+--------+        |   [Andy,1]|       1|        |[Michael,1]|       1|        |    [ 29,1]|       1|        | [Justin,1]|       1|        |    [ 30,1]|       1|        |    [ 19,1]|       1|        +-----------+--------+      */  
    /**
      * Experimental.
      */
  }

  def dataSet_groupByKey() = {
    val tmpDataSet = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]
    val words = tmpDataSet.flatMap(line => line.split(","))
    words.groupByKey(_.toLowerCase).count().show()
    /**
      * +-------+--------+
      * |  value|count(1)|
      * +-------+--------+
      * |     29|       1|
      * |   andy|       1|
      * |michael|       1|
      * | justin|       1|
      * |     19|       1|
      * |     30|       1|
      * +-------+--------+
      */
    /**
      * Note: groupByKey is one of the most expensive operators in Spark; replace it with other operators whenever possible.
      */
  }

  def dataSet_intersect() = {
    // Returns a new Dataset containing rows only in both this Dataset and another Dataset.
    // This is equivalent to INTERSECT in SQL.
    val tmpDataSet = sparkSession.createDataset(Seq(Person("Andy", 30), Person("lego", 19)))
    dataSet.show()
    /**
      * +-------+---+
      * |   name|age|
      * +-------+---+
      * |Michael| 29|
      * |   Andy| 30|
      * | Justin| 19|
      * +-------+---+
      */
    dataSet.intersect(tmpDataSet).show()
    /**
      * +----+---+
      * |name|age|
      * +----+---+
      * |Andy| 30|
      * +----+---+
      */
  }

  def dataSet_joinWith() = {
    /**
      * Experimental.
      */
    // Using inner equi-join to join this Dataset returning a Tuple2 for each pair where condition evaluates to true.
    val tmpDataSet = sparkSession.createDataset(Seq(Person("Andy", 30), Person("lego", 19)))
    dataSet.joinWith(tmpDataSet, tmpDataSet("name") === dataSet("name")).show()
    /**
      * +---------+---------+
      * |       _1|       _2|
      * +---------+---------+
      * |[Andy,30]|[Andy,30]|
      * +---------+---------+
      */
    dataSet.joinWith(tmpDataSet, tmpDataSet("age") === dataSet("age")).show()
    /**
      * +-----------+---------+
      * |         _1|       _2|
      * +-----------+---------+
      * |  [Andy,30]|[Andy,30]|
      * |[Justin,19]|[lego,19]|
      * +-----------+---------+
      */
    dataSet.joinWith(tmpDataSet, tmpDataSet("age") === 19).show()
    /**
      * +------------+---------+
      * |          _1|       _2|
      * +------------+---------+
      * |[Michael,29]|[lego,19]|
      * |   [Andy,30]|[lego,19]|
      * | [Justin,19]|[lego,19]|
      * +------------+---------+
      */
  }

  def dataSet_limit() = {
    /**
      * Returns a new Dataset by taking the first n rows.
      * The difference between this function and head is that head is an action and returns an array
      * (by triggering query execution) while limit returns a new Dataset.
      */
    dataSet.show()
    /**
      * +-------+---+
      * |   name|age|
      * +-------+---+
      * |Michael| 29|
      * |   Andy| 30|
      * | Justin| 19|
      * +-------+---+
      */
    dataSet.limit(2).show()
    /**
      * +-------+---+
      * |   name|age|
      * +-------+---+
      * |Michael| 29|
      * |   Andy| 30|
      * +-------+---+
      */
  }

  def dataSet_map() = {
    // Returns a new Dataset that contains the result of applying func to each element.
    dataSet.map(person => person.age).show()
    /**
      * +-----+
      * |value|
      * +-----+
      * |   29|
      * |   30|
      * |   19|
      * +-----+
      */
  }

  def dataSet_mapPartitions() = {
    // Pairs up consecutive elements within each partition.
    def myfunc[T](iter: Iterator[T]): Iterator[(T, T)] = {
      var res = List[(T, T)]()
      var pre = iter.next
      while (iter.hasNext) {
        val cur = iter.next
        res .::= (pre, cur)
        pre = cur
      }
      res.iterator
    }
    dataSet.mapPartitions(myfunc).show()
    /**
      * +------------+-----------+
      * |          _1|         _2|
      * +------------+-----------+
      * |   [Andy,30]|[Justin,19]|
      * |[Michael,29]|  [Andy,30]|
      * +------------+-----------+
      */
    /**
      * Experimental.
      */
  }

  def dataSet_orderBy() = {
    dataSet.show()
    /**
      * +-------+---+
      * |   name|age|
      * +-------+---+
      * |Michael| 29|
      * |   Andy| 30|
      * | Justin| 19|
      * +-------+---+
      */
    dataSet.orderBy($"age").show()
    /**
      * +-------+---+
      * |   name|age|
      * +-------+---+
      * | Justin| 19|
      * |Michael| 29|
      * |   Andy| 30|
      * +-------+---+
      */
  }

  def dataSet_randomSplit() = {
    dataSet.randomSplit(Array(0.6, 0.4), 0L).foreach { ds =>
      ds.show()
    }
    /**
      * +-------+---+
      * |   name|age|
      * +-------+---+
      * |   Andy| 30|
      * |Michael| 29|
      * +-------+---+
      */
    /**
      * +------+---+
      * |  name|age|
      * +------+---+
      * |Justin| 19|
      * +------+---+
      */
  }

  def dataSet_randomSplitAsList() = {
    // Returns a Java list that contains randomly split Dataset with the provided weights.
    println(dataSet.randomSplitAsList(Array(0.6, 0.4), 0L).size())
    // 2
  }

  def dataSet_repartition() = {
    def myfunc(index: Int, iter: Iterator[(Person)]): Iterator[String] = {
      iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
    }
    dataSet.repartition(2).toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
    /**
      * [partID:0, val: Person(Michael,29)]
      * [partID:0, val: Person(Justin,19)]
      * [partID:1, val: Person(Andy,30)]
      */
    dataSet.repartition($"name").toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
    /**
      * [partID:71, val: Person(Michael,29)]
      * [partID:164, val: Person(Andy,30)]
      * [partID:169, val: Person(Justin,19)]
      */
    dataSet.repartition(2, $"name").toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
    /**
      * [partID:0, val: Person(Andy,30)]
      * [partID:1, val: Person(Michael,29)]
      * [partID:1, val: Person(Justin,19)]
      */
  }

  def dataSet_sample() = {
    // Returns a new Dataset by sampling a fraction of rows, using a random seed.
    dataSet.sample(withReplacement = true, 0.6, 0L).show()
    /**
      * +----+---+
      * |name|age|
      * +----+---+
      * |Andy| 30|
      * +----+---+
      */
  }

  def dataSet_select() = {
    dataSet.select($"name").show()
    /**
      * +-------+
      * |   name|
      * +-------+
      * |Michael|
      * |   Andy|
      * | Justin|
      * +-------+
      */
  }

  def dataSet_sort() = {
    dataSet.show()
    /**
      * +-------+---+
      * |   name|age|
      * +-------+---+
      * |Michael| 29|
      * |   Andy| 30|
      * | Justin| 19|
      * +-------+---+
      */
    dataSet.sort($"name", $"age".desc).show()
    /**
      * +-------+---+
      * |   name|age|
      * +-------+---+
      * |   Andy| 30|
      * | Justin| 19|
      * |Michael| 29|
      * +-------+---+
      */
  }

  def dataSet_sortWithinPartitions() = {
    def myfunc(index: Int, iter: Iterator[(Person)]): Iterator[String] = {
      iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
    }
    dataSet.repartition(2).toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
    /**
      * [partID:0, val: Person(Michael,29)]
      * [partID:0, val: Person(Justin,19)]
      * [partID:1, val: Person(Andy,30)]
      */
    dataSet.repartition(2).sortWithinPartitions($"age").toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
    /**
      * [partID:0, val: Person(Justin,19)]
      * [partID:0, val: Person(Michael,29)]
      * [partID:1, val: Person(Andy,30)]
      */
  }

  def dataSet_transform() = {
    // Concise syntax for chaining custom transformations.
    dataSet.show()
    /**
      * +-------+---+
      * |   name|age|
      * +-------+---+
      * |Michael| 29|
      * |   Andy| 30|
      * | Justin| 19|
      * +-------+---+
      */
    dataSet.transform { p => p.sort($"age".desc) }.show()
    /**
      * +-------+---+
      * |   name|age|
      * +-------+---+
      * |   Andy| 30|
      * |Michael| 29|
      * | Justin| 19|
      * +-------+---+
      */
  }

  def dataSet_union() = {
    // Returns a new Dataset containing union of rows in this Dataset and another Dataset.
    // This is equivalent to UNION ALL in SQL.
val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100),Person("lego",19)))    val unionedDS = dataSet.union(tmpDataSet).union(tmpDataSet)    unionedDS.show()    /**      * +--------+---+        |    name|age|        +--------+---+        | Michael| 29|        |    Andy| 30|        |  Justin| 19|        |legotime|100|        |    lego| 19|        |legotime|100|        |    lego| 19|        +--------+---+      */  }  def dataSet_where()={    dataSet.where($"age">20).show()    dataSet.where("age > 20").show()    dataSet.filter($"age">20).show()    dataSet.filter("age >20").show()    /**      * +-------+---+        |   name|age|        +-------+---+        |Michael| 29|        |   Andy| 30|        +-------+---+      */  }  def dataSet_unionAll()={    /**      * Annotation  @deprecated       Deprecate (Since version 2.0.0) use union()      */    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100),Person("lego",19)))    val unionedDS = dataSet.unionAll(tmpDataSet).union(tmpDataSet)    unionedDS.show()    /**      * +--------+---+        |    name|age|        +--------+---+        | Michael| 29|        |    Andy| 30|        |  Justin| 19|        |legotime|100|        |    lego| 19|        |legotime|100|        |    lego| 19|        +--------+---+      */  }  //---------------------------------------------------------------Untyped transformations---------------------  def dataSet_agg()={    // import org.apache.spark.sql.functions._    dataSet.groupBy($"age",$"name").agg(max($"name"), avg($"age")).show()    /**      * +---+-------+---------+--------+        |age|   name|max(name)|avg(age)|        +---+-------+---------+--------+        | 29|Michael|  Michael|    29.0|        | 30|   Andy|     Andy|    30.0|        | 19| Justin|   Justin|    19.0|        +---+-------+---------+--------+      */    dataSet.groupBy().agg(max($"name"), avg($"age")).show()    dataSet.agg(max($"name"), avg($"age")).show()    // dataSet.agg(...) is a shorthand for dataSet.groupBy().agg(...)    /**      * +---------+--------+        |max(name)|avg(age)|        +---------+--------+        |  Michael|    26.0|        +---------+--------+      */  }  def dataSet_apply()={    //Selects column based on the column name and return it as a Column. Note that the column name can also reference to a nested column like a.b.    println(dataSet.apply("age"))    //age  }  def dataSet_col()={    //Selects column based on the column name and return it as a Column.    dataSet.select(col("age")).show()    /**      * +---+        |age|        +---+        | 29|        | 30|        | 19|        +---+      */  }  def dataSet_cube()={    //Create a multi-dimensional cube for the current Dataset using the specified columns, so we can run aggregation on them.    dataSet.cube("age","name").agg(avg($"age")).show()    /**      * +----+-------+--------+        | age|   name|max(age)|        +----+-------+--------+        |null|Michael|      29|        |null|   null|      30|        |  29|Michael|      29|        |  19|   null|      19|        |  30|   Andy|      30|        |  30|   null|      30|        |null|   Andy|      30|        |  19| Justin|      19|        |  29|   null|      29|        |null| Justin|      19|        +----+-------+--------+      */  }  def dataSet_drop()={    //Returns a new Dataset with columns dropped. This is a no-op if schema doesn't contain column name(s).    dataSet.drop("age").show()    //Returns a new Dataset with a column dropped. 
    // This version of drop accepts a Column rather than a name.
    // This is a no-op if the Dataset doesn't have a column with an equivalent expression.
    dataSet.drop(col = col("age")).show()
    /**
      * +-------+
      * |   name|
      * +-------+
      * |Michael|
      * |   Andy|
      * | Justin|
      * +-------+
      */
  }

  def dataSet_groupBy() = {
    dataSet.groupBy(col("age")).agg(Map(
      "age" -> "avg",
      "name" -> "max"
    )).show()
    dataSet.groupBy($"age").agg(Map(
      "age" -> "avg",
      "name" -> "max"
    )).show()
    /**
      * +---+--------+---------+
      * |age|avg(age)|max(name)|
      * +---+--------+---------+
      * | 29|    29.0|  Michael|
      * | 19|    19.0|   Justin|
      * | 30|    30.0|     Andy|
      * +---+--------+---------+
      */
  }

  def dataSet_join() = {
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime", 100), Person("lego", 19)))
    dataSet.join(tmpDataSet).show()
    /**
      * +-------+---+--------+---+
      * |   name|age|    name|age|
      * +-------+---+--------+---+
      * |Michael| 29|legotime|100|
      * |Michael| 29|    lego| 19|
      * |   Andy| 30|legotime|100|
      * |   Andy| 30|    lego| 19|
      * | Justin| 19|legotime|100|
      * | Justin| 19|    lego| 19|
      * +-------+---+--------+---+
      */
    dataSet.join(tmpDataSet, "age").show()
    /**
      * +---+------+----+
      * |age|  name|name|
      * +---+------+----+
      * | 19|Justin|lego|
      * +---+------+----+
      */
    dataSet.join(tmpDataSet, Seq("age", "name")).show()
    /**
      * +---+----+
      * |age|name|
      * +---+----+
      * +---+----+
      */
  }

  def dataSet_na() = {
    // Returns a DataFrameNaFunctions for working with missing data.
    dataSet.na.drop("all").show()
    /**
      * +-------+---+
      * |   name|age|
      * +-------+---+
      * |Michael| 29|
      * |   Andy| 30|
      * | Justin| 19|
      * +-------+---+
      */
  }

  def dataSet_rollup() = {
    // Create a multi-dimensional rollup for the current Dataset using the specified columns,
    // so we can run aggregation on them.
    dataSet.rollup("age", "name").avg().show()
    /**
      * +----+-------+--------+
      * | age|   name|avg(age)|
      * +----+-------+--------+
      * |null|   null|    26.0|
      * |  29|Michael|    29.0|
      * |  19|   null|    19.0|
      * |  30|   Andy|    30.0|
      * |  30|   null|    30.0|
      * |  19| Justin|    19.0|
      * |  29|   null|    29.0|
      * +----+-------+--------+
      */
  }

  def dataSet_select_2() = {
    dataSet.select("age", "name", "age").show()
    /**
      * +---+-------+---+
      * |age|   name|age|
      * +---+-------+---+
      * | 29|Michael| 29|
      * | 30|   Andy| 30|
      * | 19| Justin| 19|
      * +---+-------+---+
      */
  }

  def dataSet_selectExpr() = {
    // Selects a set of SQL expressions. This is a variant of select that accepts SQL expressions.
    dataSet.selectExpr("name", "age+1", "name as NAME", "age as AGE").show()
    dataSet.select(expr("name"), expr("age+1"), expr("name as NAME"), expr("age as AGE"))
    /**
      * +-------+---------+-------+---+
      * |   name|(age + 1)|   NAME|AGE|
      * +-------+---------+-------+---+
      * |Michael|       30|Michael| 29|
      * |   Andy|       31|   Andy| 30|
      * | Justin|       20| Justin| 19|
      * +-------+---------+-------+---+
      */
  }

  def dataSet_stat() = {
    // Returns a DataFrameStatFunctions for statistic functions support.
    // sampleBy does stratified sampling: the fraction map keys must be actual values of the "age" column
    // (e.g. 19L -> 0.5), so with the string keys used here nothing matches and the result is empty.
    dataSet.stat.sampleBy("age", Map("age" -> 0.5, "name" -> 0.5), 0L).show()
    /**
      * +----+---+
      * |name|age|
      * +----+---+
      * +----+---+
      */
  }

  def dataSet_withColumn() = {
    // Returns a new Dataset by adding a column or replacing the existing column that has the same name.
    dataSet.withColumn("NAME", col("name")).show()
    /**
      * +-------+---+
      * |   NAME|age|
      * +-------+---+
      * |Michael| 29|
      * |   Andy| 30|
      * | Justin| 19|
      * +-------+---+
      */
  }

  def dataSet_withColumnRenamed() = {
    dataSet.withColumnRenamed("name", "newName").show()
    /**
      * +-------+---+
      * |newName|age|
      * +-------+---+
      * |Michael| 29|
      * |   Andy| 30|
      * | Justin| 19|
      * +-------+---+
      */
  }

  def dataSet_explode() = {
    /**
      * Annotation: @deprecated (Since version 2.0.0) use flatMap() or select() with functions.explode() instead.
      */
    dataSet.explain()
  }

  def main(args: Array[String]) {
    //dataSet_collect()
    //dataSet_collectAsList()
    //dataSet_count()
    //dataSet_describe()
    //dataSet_first()
    //dataSet_foreachPartition()
    //dataSet_head()
    //dataSet_reduce()
    //dataSet_show()
    //dataSet_toLocalIterator()
    //dataSet_as()
    //dataSet_cache()
    //dataSet_columns()
    //dataSet_createOrReplaceTempView()
    //dataSet_createTempView()
    //dataSet_dtypes()
    //dataSet_explain()
    //dataSet_inputFiles()
    //println(dataSet_isLocal())
    //dataSet_isStreaming()
    //dataSet_javaRDD()
    //dataSet_persist()
    //dataSet_printSchema()
    //dataSet_rdd()
    //dataSet_schema()
    //dataSet_toDF()
    //dataSet_toJavaRDD()
    //dataSet_unpersist()
    //dataSet_write()
    //dataSet_writeStream()
    //dataSet_registerTempTable()
    //dataSet_alias()
    //dataSet_coalesce()
    //dataSet_distinct()
    //dataSet_dropDuplicates()
    //dataSet_except()
    //dataSet_filter()
    //dataSet_flatMap()
    //dataSet_groupByKey()
    //dataSet_intersect()
    //dataSet_joinWith()
    //dataSet_limit()
    //dataSet_map()
    dataSet_mapPartitions()
    //dataSet_orderBy()
    //dataSet_randomSplit()
    //dataSet_randomSplitAsList()
    //dataSet_repartition()
    //dataSet_sample()
    //dataSet_select()
    //dataSet_sort()
    //dataSet_sortWithinPartitions()
    //dataSet_transform()
    //dataSet_union()
    //dataSet_where()
    //dataSet_unionAll()
    //dataSet_agg()
    //dataSet_apply()
    //dataSet_col()
    //dataSet_cube()
    //dataSet_drop()
    //dataSet_groupBy()
    //dataSet_join()
    //dataSet_na()
    //dataSet_rollup()
    //dataSet_select_2()
    //dataSet_selectExpr()
    //dataSet_stat()
    //dataSet_withColumn()
    //dataSet_withColumnRenamed()
    //dataSet_explode()
  }
}
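The listing above reads people.txt from HDFS ("hdfs://master:9000/src/main/resources/people.txt"). Judging from the dataSet_as and dataSet_reduce outputs, that file simply contains the three lines "Michael, 29", "Andy, 30" and "Justin, 19". If you want to try these operators without an HDFS cluster, the following is a minimal sketch that builds the same Dataset in memory and exercises a few of the calls discussed above; the object name dataSetLocalDemo and the local[*] master are my own choices, not part of the original code.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object dataSetLocalDemo {
  case class Person(name: String, age: Long)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("data set local demo")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // The same three records that people.txt contains, built in memory
    // instead of being read from HDFS.
    val dataSet = spark.createDataset(Seq(
      Person("Michael", 29),
      Person("Andy", 30),
      Person("Justin", 19)
    ))

    dataSet.show()                                  // same table as in dataSet_show()
    println(dataSet.count())                        // 3
    dataSet.filter($"age" > 20).show()              // Michael and Andy
    dataSet.agg(max($"name"), avg($"age")).show()   // Michael, 26.0

    spark.stop()
  }
}

Because the records are created with createDataset instead of textFile + map, the schema (name: string, age: bigint) and the outputs match the ones shown in the listing.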