Spark 实战笔记 case2

来源:互联网 发布:阿里云增值税专用发票 编辑:程序博客网 时间:2024/06/01 12:12
// spark-shell session notes: join three tab-separated event logs on their first
// column and look for records whose "picked" event came 2+ hours after "created".
//
// The statements are meant to be entered in order in spark-shell (`sc` is the
// SparkContext the shell provides). `flatValues` is intentionally redefined
// several times; each pipeline uses the definition that immediately precedes it.
// Multi-line expressions are broken only inside open brackets so they can also
// be typed line by line.

// Each input line is split on tabs into an Array[String].
val t1 = sc.textFile("/tmp/db_case1/order_created/*").map(line => line.split("\t"))
val t2 = sc.textFile("/tmp/db_case1/order_picked/*").map(line => line.split("\t"))
val t3 = sc.textFile("/tmp/db_case1/order_shipped/*").map(line => line.split("\t"))

// Key every record by column 0 (presumably the order id -- confirm against the
// data) and tag it with its event type plus the first 19 characters of column 1,
// which filterSLA later parses as "yyyy-MM-dd HH:mm:ss".
val t1kv = t1.map(line => (line(0), ("created", line(1).substring(0, 19))))
val t2kv = t2.map(line => (line(0), ("picked", line(1).substring(0, 19))))
val t3kv = t3.map(line => (line(0), ("shipped", line(1).substring(0, 19))))

/** Version 1: flattens the doubly left-joined record to
  * (key, created time, picked time, shipped time).
  * A missing picked/shipped event becomes the empty string.
  */
def flatValues(
    events: (String, (((String, String), Option[(String, String)]), Option[(String, String)]))
): (String, String, String, String) = {
  val (key, ((created, picked), shipped)) = events
  (key, created._2, picked.getOrElse(("", ""))._2, shipped.getOrElse(("", ""))._2)
}

t1kv.leftOuterJoin(t2kv).leftOuterJoin(t3kv).map(flatValues).take(10).foreach(println)

/** Version 2: same flattening, but keeps each (event type, time) pair intact.
  * A missing picked/shipped event becomes ("", "").
  */
def flatValues(
    events: (String, (((String, String), Option[(String, String)]), Option[(String, String)]))
): (String, (String, String), (String, String), (String, String)) = {
  val (key, ((created, picked), shipped)) = events
  (key, created, picked.getOrElse(("", "")), shipped.getOrElse(("", "")))
}

t1kv.leftOuterJoin(t2kv).leftOuterJoin(t3kv).map(flatValues).take(10).foreach(println)

/** Version 3: turns a single left-joined record into (key, Seq of events) so
  * that further joins can keep appending to the sequence (see flatList).
  * A missing right-hand event is represented by ("", "").
  */
def flatValues(
    events: (String, ((String, String), Option[(String, String)]))
): (String, Seq[(String, String)]) = {
  val (key, (first, second)) = events
  (key, Seq(first, second.getOrElse(("", ""))))
}

t1kv.leftOuterJoin(t2kv).map(flatValues).take(10).foreach(println)

/** Appends the optional right-hand event of a left join to the events collected
  * so far; the sequence is returned unchanged when the right side is absent.
  */
def flatList(
    events: (String, (Seq[(String, String)], Option[(String, String)]))
): (String, Seq[(String, String)]) =
  events match {
    case (key, (collected, Some(next))) => (key, collected :+ next)
    case (key, (collected, None))       => (key, collected)
  }

t1kv.leftOuterJoin(t2kv).map(flatValues).leftOuterJoin(t3kv).map(flatList).take(10).foreach(println)

/** Filter predicate: returns true for records that should be KEPT, i.e. records
  * where "picked" happened 7,200,000 ms (2 hours) or more after "created", and
  * records that lack a "created" or a "picked" event. Returns false only when
  * both events exist and are less than 2 hours apart.
  *
  * Throws java.text.ParseException if either timestamp does not match
  * "yyyy-MM-dd HH:mm:ss".
  */
def filterSLA(events: Seq[(String, String)]): Boolean = {
  val eventMap = events.toMap
  (eventMap.get("created"), eventMap.get("picked")) match {
    case (Some(created), Some(picked)) =>
      // SimpleDateFormat is not thread-safe, so a fresh instance is built per call.
      val format = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      val ts1 = format.parse(created).getTime
      val ts2 = format.parse(picked).getTime
      ts2 - ts1 >= 7200000
    case _ =>
      true
  }
}

t1kv.leftOuterJoin(t2kv).map(flatValues).leftOuterJoin(t3kv).map(flatList).filter(kv => filterSLA(kv._2)).collect.foreach(println)

// Same result without the ("", "") placeholder: start from a one-element Seq
// and let flatList append only the events that actually exist.
t1kv.map(kv => (kv._1, Seq(kv._2))).leftOuterJoin(t2kv).map(flatList).leftOuterJoin(t3kv).map(flatList).filter(kv => filterSLA(kv._2)).collect.foreach(println)

// Collapse duplicate picked/shipped events per key before joining.
// The sorted events are turned into a Map keyed by event type; toMap keeps the
// LAST value for a repeated key, so when every event in the group has the same
// type the surviving entry carries the greatest timestamp string.
// groupByKey never produces an empty group, so `.head` is safe here.
t1kv.map(kv => (kv._1, Seq(kv._2))).leftOuterJoin(
  t2kv.groupByKey.map { case (key, grouped) =>
    val byType = grouped.toSeq.sortBy(_._2).toMap
    (key, byType.head)
  }
).map(flatList).leftOuterJoin(
  t3kv.groupByKey.map { case (key, grouped) =>
    val byType = grouped.toSeq.sortBy(_._2).toMap
    (key, byType.head)
  }
).map(flatList).collect.foreach(println)

/** Reduces all events grouped under one key to a single (event type, time) pair:
  * the events are sorted by time, converted to a Map keyed by event type, and
  * the map's first entry is returned.
  *
  * NOTE(review): because toMap keeps the last value per key, a group whose
  * events all share one type yields the LATEST time, not the earliest. If the
  * earliest was intended, use `groups._2.minBy(_._2)` instead -- confirm.
  */
def flatGroup(groups: (String, Iterable[(String, String)])): (String, (String, String)) = {
  val groupsMap = groups._2.toSeq.sortBy(_._2).toMap
  (groups._1, groupsMap.head)
}

t1kv.map(kv => (kv._1, Seq(kv._2))).leftOuterJoin(
  t2kv.groupByKey.map(flatGroup)
).map(flatList).leftOuterJoin(
  t3kv.groupByKey.map(flatGroup)
).map(flatList).collect.foreach(println)

t1kv.map(kv => (kv._1, Seq(kv._2))).leftOuterJoin(
  t2kv.groupByKey.map(flatGroup)
).map(flatList).leftOuterJoin(
  t3kv.groupByKey.map(flatGroup)
).map(flatList).filter(kv => filterSLA(kv._2)).collect.foreach(println)
0 0
原创粉丝点击