Converting RDDs to DataFrames

1. Method 1: the reflection-based approach. It is not recommended in production, because a case class can define at most 22 fields (a limit in Scala 2.10 and earlier), so wide schemas cannot be modeled this way.
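The core of the reflection approach is just a case class plus toDF(). Below is a condensed sketch of the full program further down, assuming an existing SparkSession named spark and a comma-separated file with lines like "Michael, 29":

case class Person(name: String, age: Long)

import spark.implicits._  // brings in the implicit conversions that provide toDF()

val peopleDF = spark.sparkContext
  .textFile("file:///D:/ruoze/people.txt")
  .map(_.split(","))
  .map(a => Person(a(0), a(1).trim.toLong))
  .toDF()  // column names and types are inferred from Person by reflection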

2. Method 2: the programmatic approach, which generally takes three steps (see the sketch after this list):
1. Create an RDD[Row] from the original RDD;
2. Build a StructType schema that matches the structure of the Rows created in step 1;
3. Apply the schema to the RDD[Row] via the createDataFrame method provided by SparkSession.
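A minimal sketch mapping each step to code, again assuming an existing SparkSession named spark (the full program below wraps both methods):

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Step 1: create an RDD[Row] from the raw text RDD
val rowRDD = spark.sparkContext
  .textFile("file:///D:/ruoze/people.txt")
  .map(_.split(","))
  .map(a => Row(a(0), a(1).trim))

// Step 2: build a StructType that matches the Rows above
val schema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", StringType, nullable = true)))

// Step 3: apply the schema to the RDD[Row]
val peopleDF = spark.createDataFrame(rowRDD, schema)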

package com.spark.sql

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

/**
  * Created with IntelliJ IDEA.
  * Author: A_ChunUnique
  * Date: 2017/11/28
  * Time: 14:27
  **/
object CovertRdd {

  case class Person(name: String, age: Long)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("RDD Covert DF").getOrCreate()
    //runInferSchemaExample(spark)
    runProgrammaticSchemaExample(spark)
    spark.stop()
  }

  /*
   * Method 1: convert an RDD to a DataFrame via reflection.
   */
  private def runInferSchemaExample(spark: SparkSession): Unit = {
    import spark.implicits._
    val peopleDF = spark.sparkContext
      .textFile("file:///D:/ruoze/people.txt")
      .map(_.split(","))
      .map(attributes => Person(attributes(0), attributes(1).trim.toLong))
      .toDF()
    peopleDF.createOrReplaceTempView("people")
    val teenagersDF = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")
    teenagersDF.map(teenager => "Name: " + teenager(0) + ", Age: " + teenager(1)).show()
  }

  /*
   * Method 2: convert an RDD to a DataFrame programmatically.
   */
  private def runProgrammaticSchemaExample(spark: SparkSession): Unit = {
    import spark.implicits._
    // Create an RDD
    val peopleRDD = spark.sparkContext.textFile("file:///D:/ruoze/people.txt")
    // The schema is encoded in a string
    val schemaString = "name age"
    // Generate the schema based on the string of schema
    val fields = schemaString.split(" ")
      .map(fieldName => StructField(fieldName, StringType, nullable = true))
    val schema = StructType(fields)
    // Convert records of the RDD (people) to Rows
    val rowRDD = peopleRDD
      .map(_.split(","))
      .map(attributes => Row(attributes(0), attributes(1).trim))
    // Apply the schema to the RDD
    val peopleDF = spark.createDataFrame(rowRDD, schema)
    // Create a temporary view using the DataFrame
    peopleDF.createOrReplaceTempView("people")
    // SQL can be run over a temporary view created using DataFrames
    val results = spark.sql("SELECT name FROM people")
    // The results of SQL queries are DataFrames; row columns can be
    // accessed by field index or by field name
    results.map(attributes => "Name: " + attributes(0)).show()
  }
}
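The post does not show the contents of D:/ruoze/people.txt. Assuming the three-line sample that ships with Spark's own examples:

Michael, 29
Andy, 30
Justin, 19

runProgrammaticSchemaExample would then print something like:

+-------------+
|        value|
+-------------+
|Name: Michael|
|   Name: Andy|
| Name: Justin|
+-------------+

and the commented-out runInferSchemaExample would list only Justin, the one record whose age falls between 13 and 19.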