Spark 2.2.0 Learning Notes 4: SparkSQLDemo


Info

Spark SQL: Spark's package for working with structured and semi-structured data.

  • Reads data from a wide range of data sources and supports several ways of issuing SQL queries
    • Hive queries
    • User-defined functions (UDFs); a registration sketch follows the query below
  • SchemaRDD (DataFrame): an RDD of Row objects, where each Row represents one record, e.g.

    SELECT SUM(user.favouritesCount), SUM(retweetCount), user.id FROM tweets
    GROUP BY user.id
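
To illustrate the UDF bullet above, here is a minimal, self-contained sketch of registering and calling a UDF in Spark 2.2. The function name strLen, the sample data, and the object name are made up for illustration; they are not part of the demo code below.

    import org.apache.spark.sql.SparkSession

    object UdfSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("udf-sketch").master("local[*]").getOrCreate()
        import spark.implicits._

        // Hypothetical data, registered as a temp view so it can be queried with SQL
        Seq(("Michael", 29), ("Andy", 30)).toDF("name", "age").createOrReplaceTempView("people")

        // Register a user-defined function; "strLen" is an illustrative name
        spark.udf.register("strLen", (s: String) => s.length)

        // The registered UDF can now be used inside any SQL query on this session
        spark.sql("SELECT name, strLen(name) AS name_len FROM people").show()

        spark.stop()
      }
    }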

Examples

  • Reading from a data source to create a DataFrame and a Dataset
  • Creating a Dataset by hand and converting between DataFrame and Dataset (see the short sketch after this list)
  • Inferring the schema implicitly via reflection
  • Specifying the schema programmatically
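
Before the full listing, a minimal sketch of the DataFrame/Dataset interconversion mentioned in the second bullet. The object name and sample data are illustrative, not taken from the demo; the demo below only shows the DataFrame-to-Dataset direction.

    import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

    object DfDsConversionSketch {
      case class Person(name: String, age: Long)

      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("df-ds-sketch").master("local[*]").getOrCreate()
        import spark.implicits._

        // Manually created, strongly typed Dataset
        val ds: Dataset[Person] = Seq(Person("Andy", 32), Person("Justin", 19)).toDS()

        // Dataset -> DataFrame: drop the static type, keep the schema
        val df: DataFrame = ds.toDF()

        // DataFrame -> Dataset: map columns back onto the case class by name
        val ds2: Dataset[Person] = df.as[Person]

        ds2.show()
        spark.stop()
      }
    }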

Code

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package spark30.sql

import org.apache.spark.sql.Row
import spark30.basic.SparkContextUtil
// $example on:init_session$
import org.apache.spark.sql.SparkSession
// $example off:init_session$
// $example on:programmatic_schema$
// $example on:data_types$
import org.apache.spark.sql.types._
// $example off:data_types$
// $example off:programmatic_schema$

object SparkSQLDemo {

  val personJsonPath: String = "example-scala-learn\\src\\spark30\\basic\\people.json"
  val personTextPath: String = "example-scala-learn\\src\\spark30\\basic\\people.txt"

  // $example on:create_ds$
  // Note: Case classes in Scala 2.10 can support only up to 22 fields. To work around this limit,
  // you can use custom classes that implement the Product interface
  case class Person(name: String, age: Long)
  // $example off:create_ds$

  def main(args: Array[String]) {
    // $example on:init_session$
    val spark = SparkContextUtil.getSparkSession("Spark SQL basic example")
    /* SparkSession
      .builder()
      .appName("Spark SQL basic example")
      .master("local")
      .config("spark.some.config.option", "some-value")
      .getOrCreate() */
    // For implicit conversions like converting RDDs to DataFrames
    // $example off:init_session$

    runBasicDataFrameExample(spark)
    runDatasetCreationExample(spark)
    runInferSchemaExample(spark)
    runProgrammaticSchemaExample(spark)

    spark.stop()
  }

  private def runBasicDataFrameExample(spark: SparkSession): Unit = {
    // $example on:create_df$
    val df = spark.read.json(personJsonPath)

    // Displays the content of the DataFrame to stdout
    df.show()
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+
    // $example off:create_df$

    // $example on:untyped_ops$
    // This import is needed to use the $-notation
    import spark.implicits._
    // Print the schema in a tree format
    df.printSchema()
    // root
    // |-- age: long (nullable = true)
    // |-- name: string (nullable = true)

    // Select only the "name" column
    df.select("name").show()
    // +-------+
    // |   name|
    // +-------+
    // |Michael|
    // |   Andy|
    // | Justin|
    // +-------+

    // Select everybody, but increment the age by 1
    df.select($"name", $"age" + 1).show()
    // +-------+---------+
    // |   name|(age + 1)|
    // +-------+---------+
    // |Michael|     null|
    // |   Andy|       31|
    // | Justin|       20|
    // +-------+---------+

    // Select people older than 21
    df.filter($"age" > 21).show()
    // +---+----+
    // |age|name|
    // +---+----+
    // | 30|Andy|
    // +---+----+

    // Count people by age
    df.groupBy("age").count().show()
    // +----+-----+
    // | age|count|
    // +----+-----+
    // |  19|    1|
    // |null|    1|
    // |  30|    1|
    // +----+-----+
    // $example off:untyped_ops$

    // $example on:run_sql$
    // Register the DataFrame as a SQL temporary view
    df.createOrReplaceTempView("people")

    val sqlDF = spark.sql("SELECT * FROM people")
    sqlDF.show()
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+
    // $example off:run_sql$

    // $example on:global_temp_view$
    // Register the DataFrame as a global temporary view
    df.createGlobalTempView("people")

    // Global temporary view is tied to a system preserved database `global_temp`
    spark.sql("SELECT * FROM global_temp.people").show()
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+

    // Global temporary view is cross-session
    spark.newSession().sql("SELECT * FROM global_temp.people").show()
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+
    // $example off:global_temp_view$
  }

  private def runDatasetCreationExample(spark: SparkSession): Unit = {
    import spark.implicits._
    // $example on:create_ds$
    // Encoders are created for case classes
    val caseClassDS = Seq(Person("Andy", 32)).toDS()
    caseClassDS.show()
    // +----+---+
    // |name|age|
    // +----+---+
    // |Andy| 32|
    // +----+---+

    // Encoders for most common types are automatically provided by importing spark.implicits._
    val primitiveDS = Seq(1, 2, 3).toDS()
    primitiveDS.map(_ + 1).collect() // Returns: Array(2, 3, 4)

    // DataFrames can be converted to a Dataset by providing a class. Mapping will be done by name
    val path = personJsonPath
    val peopleDS = spark.read.json(path).as[Person]
    peopleDS.show()
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+
    // $example off:create_ds$
  }

  private def runInferSchemaExample(spark: SparkSession): Unit = {
    // $example on:schema_inferring$
    // For implicit conversions from RDDs to DataFrames
    import spark.implicits._

    // Create an RDD of Person objects from a text file, convert it to a Dataframe
    val peopleDF = spark.sparkContext
      .textFile(personTextPath)
      .map(_.split(","))
      .map(attributes => Person(attributes(0), attributes(1).trim.toInt))
      .toDF()
    // Register the DataFrame as a temporary view
    peopleDF.createOrReplaceTempView("people")

    // SQL statements can be run by using the sql methods provided by Spark
    val teenagersDF = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")

    // The columns of a row in the result can be accessed by field index
    teenagersDF.map(teenager => "Name: " + teenager(0)).show()
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+

    // or by field name
    teenagersDF.map(teenager => "Name: " + teenager.getAs[String]("name")).show()
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+

    // No pre-defined encoders for Dataset[Map[K,V]], define explicitly
    implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]
    // Primitive types and case classes can be also defined as
    // implicit val stringIntMapEncoder: Encoder[Map[String, Any]] = ExpressionEncoder()

    // row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T]
    teenagersDF.map(teenager => teenager.getValuesMap[Any](List("name", "age"))).collect()
    // Array(Map("name" -> "Justin", "age" -> 19))
    // $example off:schema_inferring$
  }

  private def runProgrammaticSchemaExample(spark: SparkSession): Unit = {
    import spark.implicits._
    // $example on:programmatic_schema$
    // Create an RDD
    val peopleRDD = spark.sparkContext.textFile(personTextPath)

    // The schema is encoded in a string
    val schemaString = "name age"

    // Generate the schema based on the string of schema
    val fields = schemaString.split(" ")
      .map(fieldName => StructField(fieldName, StringType, nullable = true))
    val schema = StructType(fields)

    // Convert records of the RDD (people) to Rows
    val rowRDD = peopleRDD
      .map(_.split(","))
      .map(attributes => Row(attributes(0), attributes(1).trim))

    // Apply the schema to the RDD
    val peopleDF = spark.createDataFrame(rowRDD, schema)

    // Creates a temporary view using the DataFrame
    peopleDF.createOrReplaceTempView("people")

    // SQL can be run over a temporary view created using DataFrames
    val results = spark.sql("SELECT name FROM people")

    // The results of SQL queries are DataFrames and support all the normal RDD operations
    // The columns of a row in the result can be accessed by field index or by field name
    results.map(attributes => "Name: " + attributes(0)).show()
    // +-------------+
    // |        value|
    // +-------------+
    // |Name: Michael|
    // |   Name: Andy|
    // | Name: Justin|
    // +-------------+
    // $example off:programmatic_schema$
  }
}
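
The listing calls SparkContextUtil.getSparkSession, a helper from the author's spark30.basic package that is not shown here. Judging from the commented-out builder in main, it presumably just wraps SparkSession.builder(); the following is only a guess at its shape under that assumption, not the repository's actual code.

    package spark30.basic

    import org.apache.spark.sql.SparkSession

    // Hypothetical helper matching the call site in SparkSQLDemo.main;
    // the real implementation lives in the linked repository and may differ.
    object SparkContextUtil {
      def getSparkSession(appName: String): SparkSession =
        SparkSession
          .builder()
          .appName(appName)
          .master("local") // local mode, as in the commented-out builder in main()
          .getOrCreate()
    }

The people.json and people.txt paths appear to point at copies of the sample files shipped with the Spark distribution under examples/src/main/resources (Michael with no age, Andy 30, Justin 19), which is what the show() output in the comments reflects.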

Source

  • https://github.com/undergrowthlinear/bigdata-learn.git
    • spark30.sql.SparkSQLDemo