spark sql 例子
来源:互联网 发布:2016十大网络主播 编辑:程序博客网 时间:2024/05/29 16:20
该文主要展示的是spark sql 例子
(内容是找了份oracle的例子,翻译成spark sql的)
1、需要准备好四张表,既四个文本文件逗号分隔
2、为这四张表创建好schema,并注册成表
3、时间处理有小部分改动
准备的四张表
表(一)Student (学生表)
字段名
数据类型
可否为空
含 义
Sno
Varchar2(3)
否
学号(主键)
Sname
Varchar2(8)
否
学生姓名
Ssex
Varchar2(2)
否
学生性别
Sbirthday
Date
可
学生出生年月
SClass
Varchar2(5)
可
学生所在班级
表(二)Course(课程表)
属性名
数据类型
可否为空
含 义
Cno
Varchar2(5)
否
课程号(主键)
Cname
Varchar(10)
否
课程名称
Tno
Varchar2(3)
否
教工编号(外键)
表(三)Score(成绩表)
属性名
数据类型
可否为空
含 义
Sno
Varchar2(3)
否
学号(外键)
Cno
Varchar2(5)
否
课程号(外键)
Degree
Number(4,1)
可
成绩
表(四)Teacher(教师表)
属性名
数据类型
可否为空
含 义
Tno
Varchar2(3)
否
教工编号(主键)
Tname
Varchar2(4)
否
教工姓名
Tsex
Varchar2(2)
否
教工性别
Tbirthday
Date
可
教工出生年月
Prof
Varchar2(6)
可
职称
Depart
Varchar(10)
否
教工所在部门
四张表中的数据
例子代码,粘贴可用,注意注释掉不需要的地方即可
package com.cdpsql1import org.apache.spark.sql.{Row, SparkSession}import org.apache.spark.sql.types._import scala.collection.mutableimport java.text.SimpleDateFormat/** * 2017/05/26 * CDP */object SparkSqlExample1 { def main(args: Array[String]): Unit = { /** *************************************************************************************************************** * sparksession */ val spark = SparkSession .builder() .master("local") .appName("test") .config("spark.sql.shuffle.partitions", "5") .getOrCreate() /** *************************************************************************************************************** * 表结构 */ val StudentSchema: StructType = StructType(mutable.ArraySeq( //学生表 StructField("Sno", StringType, nullable = false), //学号 StructField("Sname", StringType, nullable = false), //学生姓名 StructField("Ssex", StringType, nullable = false), //学生性别 StructField("Sbirthday", StringType, nullable = true), //学生出生年月 StructField("SClass", StringType, nullable = true) //学生所在班级 )) val CourseSchema: StructType = StructType(mutable.ArraySeq( //课程表 StructField("Cno", StringType, nullable = false), //课程号 StructField("Cname", StringType, nullable = false), //课程名称 StructField("Tno", StringType, nullable = false) //教工编号 )) val ScoreSchema: StructType = StructType(mutable.ArraySeq( //成绩表 StructField("Sno", StringType, nullable = false), //学号(外键) StructField("Cno", StringType, nullable = false), //课程号(外键) StructField("Degree", IntegerType, nullable = true) //成绩 )) val TeacherSchema: StructType = StructType(mutable.ArraySeq( //教师表 StructField("Tno", StringType, nullable = false), //教工编号(主键) StructField("Tname", StringType, nullable = false), //教工姓名 StructField("Tsex", StringType, nullable = false), //教工性别 StructField("Tbirthday", StringType, nullable = true), //教工出生年月 StructField("Prof", StringType, nullable = true), //职称 StructField("Depart", StringType, nullable = false) //教工所在部门 )) /** *************************************************************************************************************** * 获取当前时间函数 */ def getDate(time: String) = { val now: Long=System.currentTimeMillis() var df: SimpleDateFormat = new SimpleDateFormat(time) df.format(now) } /** *************************************************************************************************************** * 读取数据 */ val StudentData = spark.sparkContext.textFile("input/sqltable/Student").map{ lines => val line = lines.split(",") Row(line(0),line(1),line(2),line(3),line(4)) } val CourseData = spark.sparkContext.textFile("input/sqltable/Course").map{ lines => val line = lines.split(",") Row(line(0),line(1),line(2)) } val ScoreData = spark.sparkContext.textFile("input/sqltable/Score").map{ lines => val line = lines.split(",") Row(line(0),line(1),line(2).toInt) } val TeacherData = spark.sparkContext.textFile("input/sqltable/Teacher").map{ lines => val line = lines.split(",") Row(line(0),line(1),line(2),line(3),line(4),line(5)) } /** *************************************************************************************************************** * 转换成表 */ val StudentTable = spark.createDataFrame(StudentData, StudentSchema) StudentTable.createOrReplaceTempView("Student") val CourseTable = spark.createDataFrame(CourseData, CourseSchema) CourseTable.createOrReplaceTempView("Course") val ScoreTable = spark.createDataFrame(ScoreData, ScoreSchema) ScoreTable.createOrReplaceTempView("Score") val TeacherTable = spark.createDataFrame(TeacherData, TeacherSchema) TeacherTable.createOrReplaceTempView("Teacher") /** *************************************************************************************************************** * 走sql节奏 * 表名,字段名,区分大小写 */ ////1、 查询Student表中的所有记录的Sname、Ssex和Class列。 spark.sql("SELECT sname, ssex, sclass FROM Student").show() ////2、 查询教师所有的单位即不重复的Depart列。 spark.sql("SELECT DISTINCT depart FROM Teacher").show() ////3、 查询Student表的所有记录 spark.sql("SELECT * FROM Student").show() ////4、 查询Score表中成绩在60到80之间的所有记录。 //spark.sql("SELECT * FROM Score WHERE degree BETWEEN 60 and 80").show() spark.sql("SELECT * FROM Score WHERE degree >= 60 and degree <= 80").show() ////5、 查询Score表中成绩为85,86或88的记录。 spark.sql("SELECT * FROM Score WHERE degree = '85' OR degree = '86' OR degree = '88'").show() ////6、 查询Student表中“95031”班或性别为“女”的同学记录。 spark.sql("SELECT * FROM Student WHERE sclass = '95031' OR ssex = 'female'").show() ////7、 以Class降序,升序查询Student表的所有记录。 spark.sql("SELECT * FROM Student ORDER BY sclass DESC").show() spark.sql("SELECT * FROM Student ORDER BY sclass").show() ////8、 以Cno升序、Degree降序查询Score表的所有记录。 spark.sql("SELECT * FROM Score t ORDER BY t.sno ASC, t.degree DESC").show() ////9、 查询“95031”班的学生人数。 spark.sql("SELECT t.sclass totalnum FROM Student t WHERE sclass = '95031'").show() spark.sql("SELECT t.sclass AS totalnum FROM Student t WHERE sclass = '95031'").show() ////10、 查询Score表中的最高分的学生学号和课程号。(子查询或者排序) //// oracle => WHERE rownum = 1 //// spark sql => LIMIT 1 spark.sql("SELECT * FROM (SELECT * FROM Score ORDER BY degree DESC LIMIT 1)").show() spark.sql("SELECT t.sno, t.cno FROM Score t ORDER BY degree DESC").show() spark.sql("SELECT * FROM Score WHERE degree IN(SELECT MAX(degree) FROM Score t)").show() ////11、 查询每门课的平均成绩。 spark.sql("SELECT AVG(degree) average FROM Score t WHERE cno = '3-245'").show() spark.sql("SELECT AVG(degree) average FROM Score WHERE cno = '3-105'").show() spark.sql("SELECT AVG(degree) average FROM Score WHERE cno = '6-166'").show() spark.sql("SELECT cno, AVG(degree) FROM Score t GROUP BY cno").show() ////12、查询Score表中至少有5名学生选修的并以3开头的课程的平均分数。 spark.sql("SELECT cno, AVG(degree) FROM Score WHERE cno LIKE '3%' GROUP BY cno HAVING COUNT(1) >= 5").show() ////13、查询分数大于70,小于90的Sno列。 spark.sql("SELECT sno FROM Score WHERE degree BETWEEN 70 AND 90").show() ////14、查询所有学生的Sname、Cno和Degree列。 spark.sql("SELECT s.sname, t.cno, t.degree FROM Score t, Student s WHERE t.sno = s.sno").show() spark.sql("SELECT s.sname, t.cno, t.degree FROM Score t JOIN Student s ON t.sno = s.sno").show() ////15、查询所有学生的Sno、Cname和Degree列。 spark.sql("SELECT s.sname, t.cno, t.degree FROM Score t JOIN Student s ON t.sno = s.sno").show() ////16、查询所有学生的Sname、Cname和Degree列。 spark.sql("SELECT s.sname, t.degree, c.cname FROM Score t, Student s, Course c WHERE t.sno = s.sno AND t.cno = c.cno").show() spark.sql("SELECT s.sname, t.degree, c.cname FROM Score t " + "JOIN Student s on t.sno = s.sno " + "JOIN Course c on c.cno = t.cno").show() ////17、 查询“95033”班学生的平均分。 spark.sql("SELECT AVG(degree) average FROM Score WHERE sno IN (SELECT sno FROM Student WHERE sclass = '95033')").show() ////19、 查询选修“3-105”课程的成绩高于“109”号同学成绩的所有同学的记录。 spark.sql("SELECT * FROM Score WHERE cno = '3-105' AND degree > (SELECT degree FROM score WHERE sno = '109' AND cno = '3-105')").show() ////20、查询score中选学多门课程的同学中分数为非最高分成绩的记录。 spark.sql("SELECT * FROM Score WHERE sno IN " + "(SELECT sno FROM Score t GROUP BY t.sno HAVING COUNT(1) > 1) AND degree != (SELECT MAX(degree) FROM Score)").show() spark.sql("SELECT * FROM Score WHERE degree != (SELECT MAX(degree) FROM Score)").show() ////21、 查询成绩高于学号为“109”、课程号为“3-105”的成绩的所有记录。 spark.sql("SELECT * FROM Score t WHERE t.degree > (SELECT degree FROM Score WHERE sno = '109' AND cno = '3-105')").show() ////22、查询和学号为108的同学同年出生的所有学生的Sno、Sname和Sbirthday列。 //// oracle => to_char(t.sbirthday,'yyyy') //// spark sql => substring(t.sbirthday, 0, 4) spark.sql("SELECT sno, sname , sbirthday " + "FROM Student " + "WHERE substring(sbirthday, 0, 4) = ( " + "SELECT substring(t.sbirthday, 0, 4) " + "FROM Student t " + "WHERE sno = '108')").show() ////23、查询“张旭“教师任课的学生成绩。 spark.sql("SELECT t.tno, c.cno, c.cname, s.degree FROM Teacher t " + "JOIN Course c ON t.tno = c.tno " + "JOIN Score s ON c.cno = s.cno WHERE t.tname = 'Zhang xu'").show() ////24、查询选修某课程的同学人数多于5人的教师姓名。 spark.sql("SELECT tname FROM Teacher e " + "JOIN Course c ON e.tno = c.tno " + "JOIN(SELECT cno FROM Score GROUP BY cno HAVING COUNT(cno) > 5) t ON c.cno = t.cno").show() ////25、查询95033班和95031班全体学生的记录。 spark.sql("SELECT * FROM Student WHERE sclass IN('95031', '95033')").show() spark.sql("SELECT * FROM Student WHERE sclass LIKE '9503%'").show() ////26、 查询存在有85分以上成绩的课程Cno. spark.sql("SELECT cno FROM Score WHERE degree > 85 GROUP BY cno").show() ////27、查询出“计算机系“教师所教课程的成绩表。 spark.sql("SELECT t.sno, t.cno, t.degree FROM Score t " + "JOIN Course c ON t.cno = c.cno " + "JOIN Teacher e ON c.tno = e.tno WHERE e.depart = 'department of computer'").show() ////28、查询“计算机系”与“电子工程系“不同职称的教师的Tname和Prof。 spark.sql("SELECT tname, prof " + "FROM Teacher " + "WHERE prof NOT IN (SELECT a.prof " + "FROM (SELECT prof " + "FROM Teacher " + "WHERE depart = 'department of computer' " + ") a " + "JOIN (SELECT prof " + "FROM Teacher " + "WHERE depart = 'department of electronic engineering' " + ") b ON a.prof = b.prof) ").show() spark.sql("SELECT tname, prof " + "FROM Teacher " + "WHERE depart = 'department of electronic engineering' " + "AND prof NOT IN (SELECT prof " + "FROM Teacher " + "WHERE depart = 'department of computer') " + "OR depart = 'department of computer' " + "AND prof NOT IN (SELECT prof " + "FROM Teacher " + "WHERE depart = 'department of electronic engineering')").show() ////29、查询选修编号为“3-105“课程且成绩至少高于选修编号为“3-245”的同学的Cno、Sno和Degree,并按Degree从高到低次序排序。 spark.sql("SELECT t.sno, t.cno, degree " + "FROM SCORE t " + "WHERE degree > ( " + "SELECT MIN(degree) " + "FROM score " + "WHERE cno = '3-245' " + ") " + "AND t.cno = '3-105' " + "ORDER BY degree DESC").show() ////30、查询选修编号为“3-105”且成绩高于选修编号为“3-245”课程的同学的Cno、Sno和Degree. // oracle方式 spark.sql("select t.sno, t.cno, t.degree from SCORE t where t.degree > select degree from score where cno='3-245' or cno='3-105'").show() spark.sql("SELECT t.sno, t.cno, t.degree FROM Score t WHERE t.degree > (SELECT MAX(degree) FROM Score WHERE cno = '3-245' ) AND t.cno = '3-105'").show() ////31、 查询所有教师和同学的name、sex和birthday. spark.sql("SELECT sname, ssex, sbirthday FROM Student " + "UNION SELECT tname, tsex, tbirthday FROM Teacher").show() //// 32、查询所有“女”教师和“女”同学的name、sex和birthday. union spark.sql("SELECT sname, ssex, sbirthday " + "FROM Student " + "WHERE ssex = 'female' " + "UNION " + "SELECT tname, tsex, tbirthday " + "FROM Teacher " + "WHERE tsex = 'female'").show() ////33、 查询成绩比该课程平均成绩低的同学的成绩表。 spark.sql("SELECT s.* " + "FROM score s " + "WHERE s.degree < ( " + "SELECT AVG(degree) " + "FROM score c " + "WHERE s.cno = c.cno)").show() ////34、 查询所有任课教师的Tname和Depart. in spark.sql("SELECT tname, depart " + "FROM teacher t " + "WHERE t.tno IN ( " + "SELECT tno " + "FROM course c " + "WHERE c.cno IN (" + "SELECT cno " + "FROM score))").show() ////35 、 查询所有未讲课的教师的Tname和Depart. not in spark.sql("SELECT tname, depart " + "FROM teacher t " + "WHERE t.tno NOT IN ( " + "SELECT tno " + "FROM course c " + "WHERE c.cno IN ( " + "SELECT cno " + "FROM score))").show() ////36、查询至少有2名男生的班号。 group by, having count spark.sql("SELECT SClass " + "FROM Student t " + "WHERE Ssex = 'male' " + "GROUP BY SClass " + "HAVING COUNT(Ssex) >= 2").show() ////37、查询Student表中不姓“王”的同学记录。 not like spark.sql("SELECT * FROM Student t WHERE Sname NOT LIKE('Wang%')").show() ////38、查询Student表中每个学生的姓名和年龄。 ////将函数运用到spark sql中去计算,可以直接拿String的类型计算不需要再转换成数值型 默认是会转换成Double类型计算 spark.sql("SELECT Sname, ("+ getDate("yyyy") +" - substring(sbirthday, 0, 4)) AS age FROM STUDENT t").show() ////浮点型转整型 spark.sql("SELECT Sname, (CAST("+ getDate("yyyy") +" AS INT) - CAST(substring(sbirthday, 0, 4) AS INT)) AS age " + "FROM Student t").show() ////39、查询Student表中最大和最小的Sbirthday日期值。 时间格式最大值,最小值 spark.sql("SELECT MAX(t.sbirthday) AS maximum FROM Student t").show() spark.sql("SELECT MIN(t.sbirthday) AS minimum FROM Student t").show() ////40、以班号和年龄从大到小的顺序查询Student表中的全部记录。 查询结果排序 spark.sql("SELECT * " + "FROM Student " + "ORDER BY SClass DESC, CAST("+ getDate("yyyy") +" AS INT) - CAST(substring(Sbirthday, 0, 4) AS INT) DESC").show() ////41、查询“男”教师及其所上的课程。 select join spark.sql("SELECT TSex, CName " + "FROM Teacher t " + "JOIN course c ON t.tno = c.tno " + "WHERE TSex = 'male'").show() ////42、查询最高分同学的Sno、Cno和Degree列。 子查询 spark.sql("SELECT * " + "FROM Score " + "WHERE degree = ( " + "SELECT MAX(degree) " + "FROM SCORE t)").show() ////43、查询和“李军”同性别的所有同学的Sname. spark.sql("SELECT sname " + "FROM STUDENT t " + "WHERE ssex IN ( " + "SELECT ssex " + "FROM student " + "WHERE sname = 'Liu Jun')").show() ////44、查询和“李军”同性别并同班的同学Sname. spark.sql("SELECT sname " + "FROM Student t " + "WHERE ssex IN ( " + "SELECT ssex " + "FROM student " + "WHERE sname = 'Liu Jun') " + "AND sclass IN (SELECT sclass " + "FROM student " + "WHERE sname = 'Liu Jun')").show() ////45、查询所有选修“计算机导论”课程的“男”同学的成绩表。 spark.sql("SELECT t.sno, t.cno, t.degree " + "FROM Score t " + "JOIN Course c ON t.cno = c.cno " + "JOIN Student s ON s.sno = t.sno " + "WHERE s.SSex = 'male' " + "AND c.CName = 'Introduction to computer'").show() }}
阅读全文
0 0
- Spark SQL的例子
- spark sql例子
- Spark SQL的例子
- spark sql 例子
- Spark SQL 最简单例子
- Spark 之 Spark SQL源码函数解读及UDF/UDAF例子 spark研习第六集
- spark例子
- Spark SQL源码函数解读及UDF/UDAF例子 spark研习第六集
- spark 例子运行- spark pi
- 【spark】spark word count例子
- Spark Streaming+Spark SQL
- spark sql
- Spark SQL
- Spark SQL
- spark-sql
- spark sql
- spark sql
- spark sql
- Cordova微信支付插件开发
- ClassLoader加载Class的过程 解析
- Xcodeproject详解
- C#集合之SortedList
- linux tar
- spark sql 例子
- 详解js闭包
- ubuntu 更改东西后,终端无法打开,图形界面还在
- Spring学习问题整理
- JAVA-JDBC: (4) DAO设计思想及骨架搭建
- CentOS5和CentOS7上的crontab计划任务
- shell脚本刷流量
- Java中文分词hanlp使用
- 开题报告