Lesson 44: Spark 2.0 Programming in Practice - A Dataset Case Study


people.json

{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

peopleScores.json

{"n":"Michael", "score":88}

{"n":"Andy", "score":100}

{"n":"Justin", "score":89}

 

Run output

 

16/09/16 17:48:51 INFO CodeGenerator: Code generated in 13.484901 ms
+---+----+----+-----+
|age|name|   n|score|
+---+----+----+-----+
| 30|Andy|Andy|  100|
+---+----+----+-----+
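
Only Andy survives the pipeline: the filter age > 20 removes Justin (age 19) and Michael, whose record has no age field at all (a null age cannot satisfy age > 20), and the join then matches Andy's remaining row against the score row with n = "Andy".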

 

 

== Physical Plan ==
*HashAggregate(keys=[name#1], functions=[avg(score#25L), avg(age#0L)])
+- Exchange hashpartitioning(name#1, 200)
   +- *HashAggregate(keys=[name#1], functions=[partial_avg(score#25L), partial_avg(age#0L)])
      +- *Project [age#0L, name#1, score#25L]
         +- *BroadcastHashJoin [name#1], [n#24], Inner, BuildRight
            :- *Project [age#0L, name#1]
            :  +- *Filter ((isnotnull(age#0L) && (age#0L > 20)) && isnotnull(name#1))
            :     +- *Scan json [age#0L,name#1] Format: JSON, InputPaths: file:/G:/IMFBigDataSpark2016/spark-2.0.0-bin-hadoop2.6/examples/src/main/resources/people.json, PushedFilters: [IsNotNull(age), GreaterThan(age,20), IsNotNull(name)], ReadSchema: struct<age:bigint,name:string>
            +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]))
               +- *Project [n#24, score#25L]
                  +- *Filter isnotnull(n#24)
                     +- *Scan json [n#24,score#25L] Format: JSON, InputPaths: file:/G:/IMFBigDataSpark2016/spark-2.0.0-bin-hadoop2.6/examples/src/main/resources/peopleScores.json, PushedFilters: [IsNotNull(n)], ReadSchema: struct<n:string,score:bigint>
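
Reading the plan bottom-up: both JSON scans push their filters down to the data source (the PushedFilters entries, IsNotNull and GreaterThan(age,20)); the small score table is shipped to every task through BroadcastExchange, so the join executes as a BroadcastHashJoin with no shuffle on the build side; and each average is computed in two phases, a partial_avg before the Exchange hashpartitioning(name#1, 200) shuffle and the final avg after it. When the optimizer does not choose a broadcast join by itself, one can be requested explicitly with the broadcast hint from org.apache.spark.sql.functions; a minimal sketch, reusing personDF and personScoresDF from the source code below:

import org.apache.spark.sql.functions.broadcast

// Mark personScoresDF as small enough to replicate to every executor,
// steering the planner toward a BroadcastHashJoin.
personDF.filter("age > 20")
  .join(broadcast(personScoresDF), $"name" === $"n")
  .explain()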

 

 

Source code

package com.dt.spark200

import org.apache.spark.sql.SparkSession

object DataSetsops {

  case class Person(name: String, age: Long)

  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder()
      .appName("DatasetOps")
      .master("local")
      .config("spark.sql.warehouse.dir", "file:///G:/IMFBigDataSpark2016/IMFScalaWorkspace_spark200/Spark200/spark-warehouse")
      .getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Read the JSON Lines file into a DataFrame; the schema (age, name) is inferred.
    val personDF = spark.read.json("G:\\IMFBigDataSpark2016\\spark-2.0.0-bin-hadoop2.6\\examples\\src\\main\\resources\\people.json")
    // personDF.show()
    // personDF.collect().foreach(println)
    // println(personDF.count())

    // Convert the untyped DataFrame into a typed Dataset[Person].
    val personDS = personDF.as[Person]
    // personDS.show()
    // personDS.printSchema()
    // val dataframe = personDS.toDF()

    // Register a temporary view so the data can be queried with SQL.
    personDF.createOrReplaceTempView("persons")
    spark.sql("select * from persons where age > 20").show()
    spark.sql("select * from persons where age > 20").explain()

    val personScoresDF = spark.read.json("G:\\IMFBigDataSpark2016\\spark-2.0.0-bin-hadoop2.6\\examples\\src\\main\\resources\\peopleScores.json")
    // personDF.join(personScoresDF, $"name" === $"n").show()

    // Filter out ages <= 20, then join the two DataFrames on name === n;
    // this produces the run output shown above.
    personDF.filter("age > 20").join(personScoresDF, $"name" === $"n").show()

    // Group the joined result by name and compute the average score and
    // average age; explain() prints the physical plan shown above.
    personDF.filter("age > 20")
      .join(personScoresDF, $"name" === $"n")
      .groupBy(personDF("name"))
      .agg(avg(personScoresDF("score")), avg(personDF("age")))
      .explain()
    // .show()

    // Keep the application alive so the Spark Web UI (http://localhost:4040)
    // can still be inspected; note that spark.stop() below is never reached.
    while (true) {}

    spark.stop()
  }
}
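
The joins above all run on untyped DataFrames. Since the lesson is about Datasets, the same query can also be expressed against typed Datasets with joinWith, which returns pairs of case-class objects instead of flat rows. A minimal sketch under one assumption: the case class PersonScore is illustrative and not part of the original code.

// Defined next to Person at the object level so Spark can derive its encoder.
case class PersonScore(n: String, score: Long)

// Inside main, after personScoresDF has been created:
val personScoresDS = personScoresDF.as[PersonScore]

// Filter with a SQL expression string (evaluated before deserialization, so
// Michael's missing age does not break Person's non-nullable Long field),
// then let joinWith pair the typed objects: Dataset[(Person, PersonScore)].
personDS.filter("age > 20")
  .joinWith(personScoresDS, $"name" === $"n")
  .show()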

 
