Spark SQL Source Code Reading 001: Core Classes of the sql.core Package, Part 1: Executing SQL Parsing


My recent work has me dealing with SQL engines every day, so over a free weekend I wrote down how Spark SQL parses SQL syntax and turns it into Spark jobs.

Spark SQL implements its own SQL parser, which frees it from having to track Hive releases. Let's look at the core of Spark SQL (the sql.core package), starting from the API, using the RDDRelation example shipped with Spark:

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.sql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

// One method for defining the schema of an RDD is to make a case class with the desired column
// names and types.
case class Record(key: Int, value: String)

object RDDRelation {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("RDDRelation")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)

    // Importing the SQL context gives access to all the SQL functions and implicit conversions.
    import sqlContext.implicits._

    val df = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))).toDF()
    // Any RDD containing case classes can be registered as a table.  The schema of the table is
    // automatically inferred using scala reflection.
    df.registerTempTable("records")

    // Once tables have been registered, you can run SQL queries over them.
    println("Result of SELECT *:")
    sqlContext.sql("SELECT * FROM records").collect().foreach(println)

    // Aggregation queries are also supported.
    val count = sqlContext.sql("SELECT COUNT(*) FROM records").collect().head.getLong(0)
    println(s"COUNT(*): $count")

    // The results of SQL queries are themselves RDDs and support all normal RDD functions.  The
    // items in the RDD are of type Row, which allows you to access each column by ordinal.
    val rddFromSql = sqlContext.sql("SELECT key, value FROM records WHERE key < 10")

    println("Result of RDD.map:")
    rddFromSql.map(row => s"Key: ${row(0)}, Value: ${row(1)}").collect().foreach(println)

    // Queries can also be written using a LINQ-like Scala DSL.
    df.where($"key" === 1).orderBy($"value".asc).select($"key").collect().foreach(println)

    // Write out an RDD as a parquet file.
    df.saveAsParquetFile("pair.parquet")

    // Read in parquet file.  Parquet files are self-describing so the schema is preserved.
    val parquetFile = sqlContext.parquetFile("pair.parquet")

    // Queries can be run using the DSL on parquet files just like the original RDD.
    parquetFile.where($"key" === 1).select($"value".as("a")).collect().foreach(println)

    // These files can also be registered as tables.
    parquetFile.registerTempTable("parquetFile")
    sqlContext.sql("SELECT * FROM parquetFile").collect().foreach(println)

    sc.stop()
  }
}

From the API, usage looks straightforward. The example mixes two styles: SQL statements handed to Spark SQL's parser (the sqlContext.sql(...) calls) and the equivalent DataFrame DSL calls (df.where(...), orderBy(...), select(...)). Here we focus on how the SQL statements are parsed.


sqlContext.sql("SELECT * FROM records")

org.apache.spark.sql.SQLContext.scala
/**
 * Executes a SQL query using Spark, returning the result as a [[DataFrame]]. The dialect that is
 * used for SQL parsing can be configured with 'spark.sql.dialect'.
 *
 * @group basic
 */
def sql(sqlText: String): DataFrame = {
  if (conf.dialect == "sql") {
    DataFrame(this, parseSql(sqlText))
  } else {
    sys.error(s"Unsupported SQL dialect: ${conf.dialect}")
  }
}
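As the dialect check above suggests, a plain SQLContext only understands the "sql" dialect, configured through the key spark.sql.dialect. A minimal usage sketch (assuming the Spark 1.x SQLContext named sqlContext and the records table from the example at the top):

// Assumption: sqlContext and the "records" table come from the RDDRelation example above.
// The dialect that sql() checks lives in SQLConf and can be set at runtime.
sqlContext.setConf("spark.sql.dialect", "sql")       // any other value makes sql() fail fast
val df = sqlContext.sql("SELECT key FROM records")   // parseSql -> LogicalPlan -> DataFrame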

001: Executing SQL parsing. parseSql returns a LogicalPlan, the logical execution plan.

protected[sql] def parseSql(sql: String): LogicalPlan = {
  ddlParser(sql, false).getOrElse(sqlParser(sql))
}
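parseSql tries the DDL parser first; it returns an Option[LogicalPlan], and only when that is None does the general SQL parser run. A hypothetical sketch of the same fallback shape (the ddl/sql functions below are stand-ins for illustration, not the real Spark parsers):

// Hypothetical stand-ins for ddlParser and sqlParser, just to show the getOrElse fallback.
def ddl(input: String): Option[String] =
  if (input.trim.toUpperCase.startsWith("CREATE")) Some("ddl-plan") else None

def sql(input: String): String = "sql-plan"

def parse(input: String): String = ddl(input).getOrElse(sql(input))

parse("CREATE TEMPORARY TABLE t USING json")  // "ddl-plan"
parse("SELECT * FROM records")                // "sql-plan"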

org.apache.spark.sql.sources.DDLParser.scala
def apply(input: String, exceptionOnError: Boolean): Option[LogicalPlan] = {
  try {
    Some(apply(input))
  } catch {
    case ddlException: DDLException => throw ddlException
    case _ if !exceptionOnError => None
    case x: Throwable => throw x
  }
}


 
scala.Option
final case class Some[+A](x: A) extends Option[A] {
  def isEmpty = false
  def get = x
}

org.apache.spark.sql.catalyst.AbstractSparkSQLParser
The following apply function is where SQL parsing actually begins.
def apply(input: String): LogicalPlan = {
  // Initialize the Keywords.
  lexical.initialize(reservedWords)
  phrase(start)(new lexical.Scanner(input)) match {
    case Success(plan, _) => plan
    case failureOrError => sys.error(failureOrError.toString)
  }
}
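This apply follows the standard Scala parser-combinator pattern: load the lexer's keywords, wrap the input in a lexical Scanner, and run phrase(start) over the resulting token stream. A hypothetical, greatly reduced parser showing the same shape (MiniParser, its two keywords, and its grammar are made up for illustration):

import scala.util.parsing.combinator.syntactical.StandardTokenParsers

// Hypothetical toy parser mirroring the structure of AbstractSparkSQLParser.apply.
object MiniParser extends StandardTokenParsers {
  lexical.reserved ++= Seq("SELECT", "FROM")   // the "keywords" of this toy grammar

  // start: SELECT <ident> FROM <ident>, reduced to a (column, table) pair
  def start: Parser[(String, String)] =
    "SELECT" ~> ident ~ ("FROM" ~> ident) ^^ { case col ~ table => (col, table) }

  def apply(input: String): (String, String) =
    phrase(start)(new lexical.Scanner(input)) match {
      case Success(result, _) => result
      case failureOrError     => sys.error(failureOrError.toString)
    }
}

// MiniParser("SELECT key FROM records") returns ("key", "records")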

org.apache.spark.sql.catalyst.SqlLexical
/* This is a work around to support the lazy setting */
def initialize(keywords: Seq[String]): Unit = {
  reserved.clear()
  reserved ++= keywords
}

scala.util.parsing.combinator.lexical.StdLexical

/** This component provides a standard lexical parser for a simple,
 *  [[http://scala-lang.org Scala]]-like language. It parses keywords and
 *  identifiers, numeric literals (integers), strings, and delimiters.
 *
 *  To distinguish between identifiers and keywords, it uses a set of
 *  reserved identifiers:  every string contained in `reserved` is returned
 *  as a keyword token. (Note that `=>` is hard-coded as a keyword.)
 *  Additionally, the kinds of delimiters can be specified by the
 *  `delimiters` set.
 *
 *  Usually this component is used to break character-based input into
 *  bigger tokens, which are then passed to a token-parser (see
 *  [[scala.util.parsing.combinator.syntactical.TokenParsers]].)
 *
 * @author Martin Odersky
 * @author Iulian Dragos
 * @author Adriaan Moors
 */
class StdLexical extends Lexical with StdTokens {
/** The set of reserved identifiers: these will be returned as `Keyword`s. */
val reserved = new mutable.HashSet[String]

scala.util.parsing.combinator.lexical.Scanners.Scanner
class Scanner(in: Reader[Char]) extends Reader[Token] {
  /** Convenience constructor (makes a character reader out of the given string) */
  def this(in: String) = this(new CharArrayReader(in.toCharArray()))

  private val (tok, rest1, rest2) = whitespace(in) match {
    case Success(_, in1) =>
      token(in1) match {
        case Success(tok, in2) => (tok, in1, in2)
        case ns: NoSuccess => (errorToken(ns.msg), ns.next, skip(ns.next))
      }
    case ns: NoSuccess => (errorToken(ns.msg), ns.next, skip(ns.next))
  }
  private def skip(in: Reader[Char]) = if (in.atEnd) in else in.rest

  override def source: java.lang.CharSequence = in.source
  override def offset: Int = in.offset
  def first = tok
  def rest = new Scanner(rest2)
  def pos = rest1.pos
  def atEnd = in.atEnd || (whitespace(in) match { case Success(_, in1) => in1.atEnd case _ => false })
}
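To make the lexer and Scanner concrete, the sketch below (hypothetical, using the standard library's StdLexical directly rather than Spark's SqlLexical) populates the reserved set the way initialize does, then walks the token stream via first/rest/atEnd:

import scala.util.parsing.combinator.lexical.StdLexical

// Hypothetical demo of how `reserved` and Scanner interact.
val lexer = new StdLexical
lexer.reserved ++= Seq("SELECT", "FROM")   // what SqlLexical.initialize does with Spark's keywords

var scanner: lexer.Scanner = new lexer.Scanner("SELECT key FROM records")
while (!scanner.atEnd) {
  println(scanner.first)   // SELECT and FROM come back as Keyword tokens, key/records as identifiers
  scanner = scanner.rest
}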

In other words, parseSql() produces a LogicalPlan, which sql() wraps in a DataFrame. From the base class declarations below, a LogicalPlan is ultimately a TreeNode, i.e. a tree structure.

abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {

abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanType]
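Because the parsed result is just a tree of plan nodes, it can be inspected straight from the DataFrame. A sketch, assuming the Spark 1.3-era developer API (queryExecution) and the records table registered in the example at the top:

val df = sqlContext.sql("SELECT key, value FROM records WHERE key < 10")

// .logical is the unresolved LogicalPlan produced by parseSql, before analysis and optimization.
println(df.queryExecution.logical.treeString)   // roughly: Project -> Filter -> UnresolvedRelation

// explain(true) prints the parsed, analyzed, optimized and physical plans in one go.
df.explain(true)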
