Spark SQL 两表关联
来源:互联网 发布:c语言实现post请求 编辑:程序博客网 时间:2024/05/16 08:35
import org.apache.spark.sql.SQLContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkContext, SparkConf}

/** A platform user record, parsed from one space-separated text line. */
case class User(userID: String, gender: String, age: Int, registerDate: String,
                role: String, region: String)

/** A single order record, parsed from one space-separated text line. */
case class Order(orderID: String, orderDate: String, productID: Int, price: Int,
                 userID: String)

/**
 * Joins user data with consuming (order) data via Spark SQL and prints a few
 * consumption statistics.
 *
 * Usage: UserConsumingDataStatistics userDataFilePath consumingDataFilePath
 */
object UserConsumingDataStatistics {
  def main(args: Array[String]): Unit = {
    // BUG FIX: the usage string requires TWO paths and args(1) is read below,
    // but the original only checked `args.length < 1`, so a single argument
    // crashed with ArrayIndexOutOfBoundsException instead of printing usage.
    if (args.length < 2) {
      println("Usage:UserConsumingDataStatistics userDataFilePath consumingDataFilePath")
      System.exit(1)
    }
    val conf = new SparkConf().setAppName("Spark Exercise:User Consuming Data Statistics")
    // Kryo serialization is faster than the default Java serializer.
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val ctx = new SparkContext(conf)
    val sqlCtx = new SQLContext(ctx)
    import sqlCtx.implicits._

    // Convert the user data RDD to a DataFrame and register it as a temp table.
    // NOTE(review): assumes strictly space-separated, well-formed lines; a
    // malformed line (missing field / non-numeric age) will fail the job.
    val userDF = ctx.textFile(args(0)).map(_.split(" ")).map(u =>
      User(u(0), u(1), u(2).toInt, u(3), u(4), u(5))).toDF()
    userDF.registerTempTable("user")

    // Convert the consuming data RDD to a DataFrame and register it as a temp table.
    val orderDF = ctx.textFile(args(1)).map(_.split(" ")).map(o =>
      Order(o(0), o(1), o(2).toInt, o(3).toInt, o(4))).toDF()
    orderDF.registerTempTable("orders")

    // Cache both DataFrames (serialized, in memory): each one is scanned by
    // several of the queries below.
    userDF.persist(StorageLevel.MEMORY_ONLY_SER)
    orderDF.persist(StorageLevel.MEMORY_ONLY_SER)

    // Number of DISTINCT users who placed at least one order in 2015.
    // BUG FIX: the original counted the joined rows (i.e. orders), not people;
    // deduplicate on userID so the number matches the printed description.
    val count = orderDF.filter(orderDF("orderDate").contains("2015"))
      .join(userDF, orderDF("userID").equalTo(userDF("userID")))
      .select(userDF("userID")).distinct().count()
    println("The number of people who have orders in the year 2015:" + count)

    // Total orders produced in the year 2014.
    val countOfOrders2014 =
      sqlCtx.sql("SELECT * FROM orders where orderDate like '2014%'").count()
    println("total orders produced in the year 2014:" + countOfOrders2014)

    // Overview of the orders placed by the user with ID 1.
    // CLEANUP: show() returns Unit, so the original's
    // `val countOfOrdersForUser1 = ...` bound nothing useful; the misleading
    // assignment was dropped.
    sqlCtx.sql("SELECT o.orderID,o.productID, o.price,u.userID FROM orders o,user u where u.userID = 1 and u.userID = o.userID").show()
    println("Orders produced by user with ID 1 showed.")

    // Max/min/avg order price for the user with ID 10.
    val orderStatsForUser10 = sqlCtx.sql("SELECT max(o.price) as maxPrice, min(o.price) as minPrice,avg(o.price) as avgPrice,u.userID FROM orders o, user u where u.userID = 10 and u.userID = o.userID group by u.userID")
    println("Order statistic result for user with ID 10:")
    orderStatsForUser10.collect().map(order =>
      "Minimum Price=" + order.getAs("minPrice") +
        ";Maximum Price=" + order.getAs("maxPrice") +
        ";Average Price=" + order.getAs("avgPrice")
    ).foreach(result => println(result))

    // Release the cluster resources held by the driver before exiting.
    ctx.stop()
  }
}
0 0
- Spark SQL 两表关联
- 两表关联修改SQL
- spark-sql 关联问题
- SQL/Oracle 两表关联更新
- oracle与sql两表关联更新
- sql 两表关联查询加分组
- SQL/Oracle 两表关联更新
- mysql 两表关联更新sql
- sql语句delete----关联两表删除两表内容
- SQL查询案例---两表关联更新使用VIEW
- 两表关联更新
- 【SQL两表关联级联】两张表关联,统计级联数量ORACLE
- 两表(多表)关联update
- 关联两表的update
- Oracle两表关联更新
- oracle两表关联更新
- Oracle 两表关联更新
- oracle两表关联更新
- caffe安装系列——综述
- 4.7 服务器上的 Git - GitWeb
- 职场28岁现象:最好的关于职业规划的文章
- C# File
- JSP 遍历迭代 Enum 枚举
- Spark SQL 两表关联
- snprintf()函数使用方法
- 内存映射文件总结
- 记录一下Xcode8的一些混蛋事
- J2SE第二章——Java基本语法(一)
- 【JavaCV】JavaCV的人脸检测
- iOS贝塞尔曲线学习笔记(一)
- 凭证 90000000 保存(帐户确定出错)
- angularjs scope rootscope 区别