第45课 Spark 2.0实战之Dataset:map、flatMap、mapPartitions、dropDuplicates、coalesce、repartition等

来源:互联网 发布:36芯三网合一网络箱 编辑:程序博客网 时间:2024/06/06 18:16

第45课 Spark 2.0实战之Dataset:map、flatMap、mapPartitions、dropDuplicates、coalesce、repartition等

 

package com.dt.spark200

import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ArrayBuffer

/**
 * Lesson demo for Spark 2.0 Dataset operations.
 *
 * The active code prints the partition count of a Dataset before and after
 * `repartition` and `coalesce`, then shows the coalesced Dataset. The
 * commented-out sections keep the other experiments from the lesson (map,
 * mapPartitions, dropDuplicates, distinct, temp views, joins, aggregations)
 * so they can be re-enabled one at a time.
 */
object DataSetsops {
  /** Record type the rows of the people JSON file are mapped to via `as[Person]`. */
  case class Person(name:String,age:Long)

  /**
   * Builds a local SparkSession, loads the sample JSON files, runs the
   * repartition/coalesce demo, then waits for Enter on stdin before stopping
   * the session.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder()
      .appName("DatasetOps")
      .master("local")
      .config("spark.sql.warehouse.dir", "file:///G:/IMFBigDataSpark2016/IMFScalaWorkspace_spark200/Spark200/spark-warehouse")
      .getOrCreate()

    // try/finally guarantees the session is stopped even when reading the
    // input files or running the demo throws.
    try {
      import spark.implicits._
      import org.apache.spark.sql.functions._ // needed by the disabled agg(avg(...)) experiment below

      val personDF = spark.read.json("G:\\IMFBigDataSpark2016\\spark-2.0.0-bin-hadoop2.6\\examples\\src\\main\\resources\\people.json")
      // Only referenced by the disabled join experiments below; still loaded
      // here so the program performs the same reads as before.
      val personScoresDF = spark.read.json("G:\\IMFBigDataSpark2016\\spark-2.0.0-bin-hadoop2.6\\examples\\src\\main\\resources\\peopleScores.json")
      val personDS = personDF.as[Person]

      // ---- Disabled experiments: map / mapPartitions / dropDuplicates / distinct ----
      // NOTE(review): Person.age is a Long, so `person.age == null` below is
      // always false; a record without an age would presumably fail while being
      // decoded into Person -- confirm against the input data before re-enabling.
      /*
      personDS.map{person=>
        (person.name,if (person.age == null) 0 else person.age +100 )
      }.show()

      personDS.mapPartitions{persons =>
        val result = ArrayBuffer[(String,Long)]()
        while(persons.hasNext){
          val person = persons.next()
          result +=((person.name,person.age+10000))
        }
        result.iterator
      }.show

      personDS.dropDuplicates("name").show
      personDS.distinct().show
      */

      // ---- Active demo: partition counts before/after repartition and coalesce ----
      println(personDS.rdd.partitions.size)
      val repartitionDs = personDS.repartition(4)
      println(repartitionDs.rdd.partitions.size)
      val coalesced = repartitionDs.coalesce(2)
      println(coalesced.rdd.partitions.size)
      coalesced.show()

      // ---- Disabled experiments: basic DataFrame / Dataset inspection ----
      // personDF.show()
      // personDF.collect().foreach (println)
      // println(personDF.count())

      // personDS.show()
      // personDS.printSchema()
      // val dataframe=personDS.toDF()

      // ---- Disabled experiments: SQL over a temp view ----
      /*
      personDF.createOrReplaceTempView("persons")
      spark.sql("select * from persons where age > 20").show()
      spark.sql("select * from persons where age > 20").explain()
      */

      // ---- Disabled experiments: join and aggregation ----
      // personDF.join(personScoresDF,$"name"===$"n").show()
      /*
      personDF.filter("age > 20").join(personScoresDF,$"name"===$"n").show()

      personDF.filter("age > 20")
        .join(personScoresDF,$"name"===$"n")
        .groupBy(personDF("name"))
        .agg(avg(personScoresDF("score")),avg(personDF("age")))
        .explain()
        //.show()
      */

      // Keep the application alive (e.g. to inspect it while it is running)
      // without spinning a CPU core the way an empty `while(true) {}` does.
      // readLine() returns null immediately when stdin is closed, in which
      // case the program simply proceeds to shut down.
      println("Press Enter to stop the SparkSession and exit ...")
      scala.io.StdIn.readLine()
    } finally {
      spark.stop()
    }
  }
}

 

运行结果

 

16/09/16 18:55:03 INFO CodeGenerator: Code generated in 299.363907 ms
1
16/09/16 18:55:03 INFO FileSourceStrategy: Pruning directories with:
16/09/16 18:55:03 INFO FileSourceStrategy: Post-Scan Filters:
16/09/16 18:55:03 INFO FileSourceStrategy: Pruned Data Schema: struct<age: bigint, name: string>
16/09/16 18:55:03 INFO FileSourceStrategy: Pushed Filters:
16/09/16 18:55:03 INFO MemoryStore: Block broadcast_5 stored as values in memory (estimated size 194.6 KB, free 413.5 MB)
16/09/16 18:55:03 INFO MemoryStore: Block broadcast_5_piece0 stored as bytes in memory (estimated size 20.5 KB, free 413.5 MB)
16/09/16 18:55:03 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on 192.168.3.5:64223 (size: 20.5 KB, free: 413.9 MB)
16/09/16 18:55:03 INFO SparkContext: Created broadcast 5 from rdd at DataSetsops.scala:43
16/09/16 18:55:03 INFO FileSourceStrategy: Planning scan with bin packing, max size: 4194387 bytes, open cost is considered as scanning 4194304 bytes.
4
16/09/16 18:55:03 INFO FileSourceStrategy: Pruning directories with:
16/09/16 18:55:03 INFO FileSourceStrategy: Post-Scan Filters:
16/09/16 18:55:03 INFO FileSourceStrategy: Pruned Data Schema: struct<age: bigint, name: string>
16/09/16 18:55:03 INFO FileSourceStrategy: Pushed Filters:
16/09/16 18:55:03 INFO MemoryStore: Block broadcast_6 stored as values in memory (estimated size 194.6 KB, free 413.3 MB)
16/09/16 18:55:03 INFO MemoryStore: Block broadcast_6_piece0 stored as bytes in memory (estimated size 20.5 KB, free 413.3 MB)
16/09/16 18:55:03 INFO BlockManagerInfo: Added broadcast_6_piece0 in memory on 192.168.3.5:64223 (size: 20.5 KB, free: 413.9 MB)
16/09/16 18:55:03 INFO SparkContext: Created broadcast 6 from rdd at DataSetsops.scala:45
16/09/16 18:55:03 INFO FileSourceStrategy: Planning scan with bin packing, max size: 4194387 bytes, open cost is considered as scanning 4194304 bytes.
1
16/09/16 18:55:03 INFO FileSourceStrategy: Pruning directories with:

 

 

0 0