【Data Algorithms: Recipes for Scaling Up with Hadoop and Spark】Chapter 4: Left Outer Join


Scala version: the listing below implements the left outer join in two ways. Version one tags user and transaction records, unions them, and groups by user_id; version two uses Spark's built-in join() transformation.

package com.bbw5.dataalgorithms.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * This class provides a basic implementation of the "left outer join"
 * operation for two given tables. It is provided as an educational
 * tool for understanding the concept of a "left outer join".
 *
 * users table: (user_id, location_id)
 * transactions table: (transaction_id, product_id, user_id, quantity, amount)
 *
 * Note that the Spark API already provides JavaPairRDD.leftOuterJoin().
 *
 * @author baibaw5
 */
object SparkLeftOuterJoin {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkLeftOuterJoin")
    val sc = new SparkContext(sparkConf)

    val userFilename = "G:/temp/data/user.txt"
    val tranFilename = "G:/temp/data/transaction.txt"
    val userFile = sc.textFile(userFilename)
    val tranFile = sc.textFile(tranFilename)

    // (user_id, ("L", location_id)) -- user records tagged with "L"
    val userRDD = userFile.map(_.split(",")).map(d => (d(0), ("L", d(1))))
    // (user_id, ("P", product_id)) -- transaction records tagged with "P"
    val tranRDD = tranFile.map(_.split(",")).map(d => (d(2), ("P", d(1))))

    // version one: union the tagged RDDs and group by user_id
    val groupRDD = tranRDD.union(userRDD).groupByKey()
    groupRDD.foreach(println)

    // pair every product with the user's location; a user with no
    // location record is kept with "UNKNOWN" (left outer join semantics)
    val plRDD = groupRDD.mapValues { iter =>
      val location = iter.find(p => p._1 == "L").map(_._2).getOrElse("UNKNOWN")
      iter.filter(p => p._1 == "P").map(p => (p._2, location))
    }.flatMap(a => a._2)
    // count locations per product (use x.toSet.size to count distinct locations)
    plRDD.groupByKey().mapValues(x => x.size).foreach(println)

    // version two: use Spark's built-in join(); note that join() is an
    // inner join, so transactions without a matching user record are dropped
    // (user_id, location_id)
    val userRDD2 = userFile.map(_.split(",")).map(d => (d(0), d(1)))
    // (user_id, product_id)
    val tranRDD2 = tranFile.map(_.split(",")).map(d => (d(2), d(1)))

    val groupRDD2 = tranRDD2.join(userRDD2)
    groupRDD2.map(a => a._2).groupByKey().mapValues(x => x.size).foreach(println)
  }
}
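The header comment notes that Spark's API already provides leftOuterJoin(). For comparison, here is a minimal sketch of version two rewritten around that transformation; the file paths and the "UNKNOWN" placeholder are carried over from the listing above, and the object name SparkBuiltinLeftOuterJoin is an assumption, not part of the original post.

package com.bbw5.dataalgorithms.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object SparkBuiltinLeftOuterJoin {
  def main(args: Array[String]) {
    val sc = new SparkContext(
      new SparkConf().setAppName("SparkBuiltinLeftOuterJoin"))

    // (user_id, product_id) from transactions, (user_id, location_id) from users
    val tranRDD = sc.textFile("G:/temp/data/transaction.txt")
      .map(_.split(",")).map(d => (d(2), d(1)))
    val userRDD = sc.textFile("G:/temp/data/user.txt")
      .map(_.split(",")).map(d => (d(0), d(1)))

    // leftOuterJoin keeps every transaction; the result type is
    // (user_id, (product_id, Option[location_id])), so a missing user
    // record shows up as None instead of the row being dropped
    val joined = tranRDD.leftOuterJoin(userRDD)

    joined
      .map { case (_, (product, locationOpt)) =>
        (product, locationOpt.getOrElse("UNKNOWN"))
      }
      .groupByKey()
      .mapValues(_.size)
      .foreach(println)
  }
}

Unlike join(), which silently discards transactions whose user_id has no matching user record, leftOuterJoin() keeps them and surfaces the missing location as None, so the "no match" case is explicit in the types rather than handled through a sentinel tag.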

