【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter4 LeftOuterJoin
来源:互联网 发布:python txt转xml 编辑:程序博客网 时间:2024/05/20 14:41
Scala 版：
package com.bbw5.dataalgorithms.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Basic implementation of a "left outer join" between two tables,
 * provided as an educational tool to understand the concept of the
 * "left outer join" functionality.
 *
 * users table:        (user_id, location_id)
 * transactions table: (transaction_id, product_id, user_id, quantity, amount)
 *
 * Note that Spark's PairRDDFunctions already provides leftOuterJoin().
 *
 * @author baibaw5
 */
object SparkLeftOuterJoin {

  def main(args: Array[String]): Unit = {
    // Fixed: the app name previously said "SparkTop10UsingTakeOrdered",
    // a copy-paste leftover from the chapter-3 example.
    val sparkConf = new SparkConf().setAppName("SparkLeftOuterJoin")
    val sc = new SparkContext(sparkConf)

    // Input paths may be overridden on the command line:
    //   args(0) = users file, args(1) = transactions file.
    // The original hard-coded paths remain the defaults for compatibility.
    val userFilename = if (args.length > 0) args(0) else "G:/temp/data/user.txt"
    val tranFilename = if (args.length > 1) args(1) else "G:/temp/data/transaction.txt"

    val userFile = sc.textFile(userFilename)
    val tranFile = sc.textFile(tranFilename)

    // Tag each record so the two tables can share one keyed RDD:
    //   users        -> (user_id, ("L", location_id))
    //   transactions -> (user_id, ("P", product_id))   // user_id is column 2
    val userRDD = userFile.map(_.split(",")).map(d => (d(0), ("L", d(1))))
    val tranRDD = tranFile.map(_.split(",")).map(d => (d(2), ("P", d(1))))

    // Version one: emulate the join by union + groupByKey on user_id.
    val groupRDD = tranRDD.union(userRDD).groupByKey()
    groupRDD.foreach(println)

    // For each user, pair every purchased product with the user's location.
    val plRDD = groupRDD.mapValues { records =>
      // Robustness fix: the original used .toArray.apply(0), which throws
      // ArrayIndexOutOfBoundsException when a transaction references a user
      // that has no row in the users table. A left outer join must tolerate
      // exactly that case, so fall back to "UNKNOWN" instead of crashing.
      val location = records.find(_._1 == "L").map(_._2).getOrElse("UNKNOWN")
      records.filter(_._1 == "P").map(p => (p._2, location))
    }.flatMap(_._2)

    // (product_id, number of (product, location) records for that product).
    plRDD.groupByKey().mapValues(_.size).foreach(println)

    // Version two: use Spark's built-in join.
    // NOTE(review): join() is an INNER join — transactions whose user is
    // missing from the users table are silently dropped here, unlike version
    // one above. PairRDDFunctions.leftOuterJoin() would be the true
    // equivalent; kept as join() to preserve the original behavior.
    val userRDD2 = userFile.map(_.split(",")).map(d => (d(0), d(1)))
    val tranRDD2 = tranFile.map(_.split(",")).map(d => (d(2), d(1)))
    val groupRDD2 = tranRDD2.join(userRDD2)
    groupRDD2.map(_._2).groupByKey().mapValues(_.size).foreach(println)

    // Release cluster resources (missing in the original).
    sc.stop()
  }
}
0 0
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter4 LeftOuterJoin
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter1 Secondary Sort
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter3 Top 10 List
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter3 Top 10 NonUniqueList
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter5 Order Inversion Pattern
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter6 MovingAverage
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter 7 Market Basket Analysis
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter 8 Common Friends
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter 9 Recommendation Items
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter 9 Recommendation People
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter 10 Content-Based Recommend
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter 11 Smarter Email Marketing wit
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter 13 k-Nearest Neighbors
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter 12. K-Means Clustering
- Scaling Up And Out
- Building the Enterprise Fabric for Big Data with Vertica and Spark Integration
- Starting up PySpark for using python with Spark in eclipse
- Developer Training for Spark and Hadoop
- 怎么看java程序的运行时间
- AudioPolicyService 的分析之一
- Android需要精通的技能总结
- Redis持久化
- Android Studio引用mipmap中引用.9图片报错
- 【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter4 LeftOuterJoin
- 好RESTful API的设计原则
- 使用公开密钥机制进行会话密钥
- AWS S3 resource访问:使用 S3 URI 地址格式
- redis持久化2
- Eclipse launch configuration ---自动执行
- jquery queue
- git 配色方案
- 理解 Scroll Views