spark 实战笔记case2
来源:互联网 发布:阿里云增值税专用发票 编辑:程序博客网 时间:2024/06/01 12:12
// Spark-shell walkthrough (case 2): join per-order "created", "picked" and "shipped"
// events and list the orders whose created -> picked delay breaks a 2-hour SLA.
// Assumes `sc` is the SparkContext that spark-shell provides.
// Statements are laid out so they can be pasted line by line or with :paste.

// An event is a (tag, timestamp) pair, e.g. ("created", "2014-05-01 06:01:12").
type Event = (String, String)

// Each input line is split on tabs into an array of fields.
val t1 = sc.textFile("/tmp/db_case1/order_created/*").map(line => line.split("\t"))
val t2 = sc.textFile("/tmp/db_case1/order_picked/*").map(line => line.split("\t"))
val t3 = sc.textFile("/tmp/db_case1/order_shipped/*").map(line => line.split("\t"))

// Key every record by field 0 (presumably the order id -- confirm against the data)
// and tag it with its event type. Field 1 is cut to its first 19 characters, which is
// the length of the "yyyy-MM-dd HH:mm:ss" pattern that filterSLA parses.
val t1kv = t1.map(fields => (fields(0), ("created", fields(1).substring(0, 19))))
val t2kv = t2.map(fields => (fields(0), ("picked", fields(1).substring(0, 19))))
val t3kv = t3.map(fields => (fields(0), ("shipped", fields(1).substring(0, 19))))

// ---------------------------------------------------------------------------
// Step 1: flatten the nested result of two left outer joins.
// ---------------------------------------------------------------------------

/** Flattens a double left-outer-join row to (key, createdTs, pickedTs, shippedTs).
  *
  * A missing picked/shipped event is rendered as an empty string.
  * (This was the first of three definitions originally all named `flatValues`.)
  */
def flatTimestamps(
    events: (String, ((Event, Option[Event]), Option[Event]))
): (String, String, String, String) = {
  val (key, ((created, picked), shipped)) = events
  (key, created._2, picked.getOrElse(("", ""))._2, shipped.getOrElse(("", ""))._2)
}

val joinedAll = t1kv.leftOuterJoin(t2kv).leftOuterJoin(t3kv)
joinedAll.map(flatTimestamps).take(10).foreach(println)

/** Same flattening as [[flatTimestamps]] but keeps the whole (tag, timestamp) pairs.
  *
  * A missing event is rendered as ("", "").
  * (This was the second definition originally named `flatValues`.)
  */
def flatEventPairs(
    events: (String, ((Event, Option[Event]), Option[Event]))
): (String, Event, Event, Event) = {
  val (key, ((created, picked), shipped)) = events
  (key, created, picked.getOrElse(("", "")), shipped.getOrElse(("", "")))
}

joinedAll.map(flatEventPairs).take(10).foreach(println)

// ---------------------------------------------------------------------------
// Step 2: accumulate events into a Seq so further joins can be appended.
// ---------------------------------------------------------------------------

/** Turns a single left-outer-join row into (key, Seq(left, right)).
  *
  * A missing right-hand event becomes the placeholder ("", "") inside the Seq.
  */
def flatValues(events: (String, (Event, Option[Event]))): (String, Seq[Event]) = {
  val (key, (left, right)) = events
  (key, Seq(left, right.getOrElse(("", ""))))
}

val createdPicked = t1kv.leftOuterJoin(t2kv).map(flatValues)
createdPicked.take(10).foreach(println)

/** Appends the optional right-hand event of a join row to the events gathered so far.
  *
  * When the right-hand side is None the Seq is returned unchanged.
  */
def flatList(events: (String, (Seq[Event], Option[Event]))): (String, Seq[Event]) = {
  val (key, (soFar, next)) = events
  (key, next.fold(soFar)(soFar :+ _))
}

val createdPickedShipped = createdPicked.leftOuterJoin(t3kv).map(flatList)
createdPickedShipped.take(10).foreach(println)

// ---------------------------------------------------------------------------
// Step 3: keep only the orders that break the SLA.
// ---------------------------------------------------------------------------

/** Returns true when an order should be reported.
  *
  * An order is reported when it lacks a "created" or a "picked" event, or when the
  * picked timestamp is 2 hours or more after the created timestamp. Timestamps must
  * match "yyyy-MM-dd HH:mm:ss"; `SimpleDateFormat.parse` throws `ParseException`
  * otherwise.
  *
  * @param events (tag, timestamp) pairs of one order
  * @return false only when both events exist and picking happened within 2 hours
  */
def filterSLA(events: Seq[(String, String)]): Boolean = {
  // Kept local so the closure shipped to executors does not capture shell state.
  val slaMillis = 2L * 60 * 60 * 1000 // 7,200,000 ms
  val eventMap = events.toMap
  (eventMap.get("created"), eventMap.get("picked")) match {
    case (Some(created), Some(picked)) =>
      // SimpleDateFormat is not thread-safe, so a fresh instance is built per call.
      val format = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      val createdAt = format.parse(created).getTime
      val pickedAt = format.parse(picked).getTime
      pickedAt - createdAt >= slaMillis
    case _ => true
  }
}

createdPickedShipped.filter(kv => filterSLA(kv._2)).collect().foreach(println)

// Same report, but starting from Seq(created) so flatList can be used for both joins
// and no ("", "") placeholder ends up in the Seq.
val createdSeq = t1kv.map(kv => (kv._1, Seq(kv._2)))
val timeline = createdSeq.leftOuterJoin(t2kv).map(flatList).leftOuterJoin(t3kv).map(flatList)
timeline.filter(kv => filterSLA(kv._2)).collect().foreach(println)

// ---------------------------------------------------------------------------
// Step 4: de-duplicate repeated picked/shipped events before joining.
// ---------------------------------------------------------------------------

// Inline version: keep the earliest event per key.
// (The original sorted the group and went through `toMap`; because every pair in a
// group carries the same tag, `toMap` kept only the LAST, i.e. latest, timestamp.)
val firstPicked = t2kv.groupByKey().map(kv => (kv._1, kv._2.minBy(_._2)))
val firstShipped = t3kv.groupByKey().map(kv => (kv._1, kv._2.minBy(_._2)))
val inlineDeduped =
  createdSeq.leftOuterJoin(firstPicked).map(flatList).leftOuterJoin(firstShipped).map(flatList)
inlineDeduped.collect().foreach(println)

/** Collapses all events grouped under one key to the earliest one.
  *
  * Timestamps are compared as strings, which orders "yyyy-MM-dd HH:mm:ss" values
  * chronologically.
  *
  * @param groups (key, all (tag, timestamp) pairs for that key); as produced by
  *               `groupByKey` the collection is non-empty -- `minBy` throws on empty
  * @return (key, pair with the smallest timestamp)
  */
def flatGroup(groups: (String, Iterable[(String, String)])): (String, (String, String)) =
  (groups._1, groups._2.minBy(_._2))

val pickedOnce = t2kv.groupByKey().map(flatGroup)
val shippedOnce = t3kv.groupByKey().map(flatGroup)
val dedupedTimeline =
  createdSeq.leftOuterJoin(pickedOnce).map(flatList).leftOuterJoin(shippedOnce).map(flatList)

dedupedTimeline.collect().foreach(println)
dedupedTimeline.filter(kv => filterSLA(kv._2)).collect().foreach(println)
0 0
- spark 实战笔记case2
- hive 实战笔记case2
- case2
- spark 实战笔记case1
- spark 实战笔记case3
- spark 实战笔记case4
- Spark调研笔记第6篇 - Spark编程实战FAQ
- 大数据Spark企业级实战版【学习笔记】---Spark简介
- 大数据Spark企业级实战版【学习笔记】----Spark术语
- 大数据Spark企业级实战版【学习笔记】----Spark Streaming
- 大数据Spark企业级实战版【学习笔记】----Spark Streaming
- Spark实战
- Spark入门实战系列--8.Spark MLlib(下)--SparkMLlib实战 学习笔记
- 大数据Spark企业级实战版【学习笔记】----Spark技术生态系统之Spark Core
- 大数据Spark企业级实战版【学习笔记】----Spark Shark& Spark SQL
- [hadoop+spark+python]大数据实战随手笔记
- 蜗龙徒行-Spark学习笔记【三】Spark集群中worker节点扩展实战经验
- Spark调研笔记第7篇 - 应用实战: 如何利用Spark集群计算物品相似度
- 欲了解Android Studio,必先知道Gradle
- Jenkins插件升级后丢失权限,版本1.646
- 海思3536 —— common VB
- android studio无线真机调试
- spark 实战笔记case1
- spark 实战笔记case2
- 调序的业务处理方法
- Hibernate主键生成策略解读
- 八大排序算法之直接选择排序
- 定位 new 运算符
- spark 实战笔记case3
- linux系统的安装
- spark 实战笔记case4
- Android Studio主要目录及文件简介