Spark RDD 实现电影点评用户行为分析 (Scala)
来源:互联网 发布:大数据新手视频 编辑:程序博客网 时间:2024/04/20 08:52
package com.xh.movies

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
import org.apache.log4j.{Level, Logger}

/**
 * Movie-review user-behavior analysis over a MovieLens-style data set.
 *
 * The small occupations table is a natural broadcast candidate; a production
 * deployment would normally keep these tables in Parquet (less human-readable,
 * far more efficient).
 *
 * Input files (fields separated by "::"):
 *   1. ratings.dat     UserID::MovieID::Rating::Timestamp
 *   2. users.dat       UserID::Gender::Age::OccupationID::Zip-code
 *   3. movies.dat      MovieID::Title::Genres
 *   4. occupations.dat OccupationID::OccupationName
 */
object MovieReviewsSystemUserBehaviorAnalysis {

  /**
   * Top-n movies ranked by average rating, highest first.
   *
   * Shared by the overall ranking and the per-gender rankings (the original
   * code duplicated this pipeline three times, once with a truncating
   * integer division).
   *
   * @param ratings (movieId, rating) pairs
   * @param n       how many results to keep
   * @return        (movieId, averageRating), descending by average
   */
  private def topMoviesByAvgRating(ratings: RDD[(String, Double)], n: Int): Array[(String, Double)] =
    ratings
      .mapValues(r => (r, 1))
      .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2)) // (ratingSum, raterCount) per movie
      .map { case (movieId, (sum, count)) => (sum / count, movieId) } // Double division — no truncation
      .sortByKey(ascending = false)
      .map(_.swap)
      .take(n)

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR) // only surface error logs

    // Defaults for local development; production spark-submit overrides with
    // at least two arguments: <masterUrl> <dataPath>.
    var masterUrl = "local[4]"
    var dataPath = "data/medium/"
    if (args.length > 1) {
      masterUrl = args(0)
      dataPath = args(1)
    }

    val conf = new SparkConf().setMaster(masterUrl).setAppName("userAnalysis")
    val sscc = new SparkContext(conf)

    // ---- 0000: load the raw tables ---------------------------------------
    val usersRDD: RDD[String] = sscc.textFile(dataPath + "users.dat")
    val moviesRDD: RDD[String] = sscc.textFile(dataPath + "movies.dat")
    val occupationsRDD: RDD[String] = sscc.textFile(dataPath + "occupations.dat")
    val ratingsRDD: RDD[String] = sscc.textFile(dataPath + "ratings.dat")

    // ---- 0001: profile of the users who rated movie 1139 -----------------
    // Users keyed by occupation id, joined with the occupation names:
    // (occupationId, ((userId, gender, age), occupationName))
    val basicUserRDD = usersRDD.map(_.split("::")).map(u => (u(3), (u(0), u(1), u(2))))
    val occupation = occupationsRDD.map(_.split("::")).map(job => (job(0), job(1)))
    val userInfo = basicUserRDD.join(occupation)

    val targetMovie = ratingsRDD.map(_.split("::")).map(x => (x(0), x(1))).filter(_._2 == "1139")
    val targetUser = userInfo.map(x => (x._2._1._1, x._2)) // re-key by userId
    // e.g. (3518,(1139,((3518,F,18),executive/managerial)))
    val finalInfo = targetMovie.join(targetUser)
    println(finalInfo.count())

    // ---- 0002: movie popularity ------------------------------------------
    // Highest average rating across all users (Double average, so 4.5 > 4).
    val topRated = topMoviesByAvgRating(
      ratingsRDD.map(_.split("::")).map(x => (x(1), x(2).toDouble)), 15)

    // Most watched = most rating rows per movie.
    val mostWatched = ratingsRDD.map(_.split("::")).map(x => (x(1), 1))
      .reduceByKey(_ + _).sortBy(_._2, ascending = false).take(15)
    mostWatched.foreach(println) // e.g. (2858,3428) (260,2991)

    // ---- 0003: top-rated movies split by gender --------------------------
    // Build a (userId -> gender) table and join it onto the ratings.
    // NOTE(review): a broadcast ("map-side") join of the user table would
    // avoid this shuffle — shuffle is the killer in a distributed system.
    val userGender = usersRDD.map(_.split("::")).map(x => (x(0), x(1)))
    val genderWideInfo = ratingsRDD.map(_.split("::"))
      .map(x => (x(0), (x(0), x(1), x(2))))
      .join(userGender) // (userId, ((userId, movieId, rating), gender))
    genderWideInfo.cache() // reused once per gender below

    def topRatedByGender(gender: String): Array[(String, Double)] =
      topMoviesByAvgRating(
        genderWideInfo.filter(_._2._2 == gender)
          .map(x => (x._2._1._2, x._2._1._3.toDouble)), 15)

    topRatedByGender("M").foreach(println)
    topRatedByGender("F").foreach(println)

    // ---- 0004: top movies for one age bracket ----------------------------
    // Ages are already bucketed in users.dat:
    //   1: "Under 18", 18: "18-24", 25: "25-34", 35: "35-44",
    //   45: "45-49", 50: "50-55", 56: "56+"
    val userAgeInterval = usersRDD.map(_.split("::")).map(x => (x(0), x(2))).filter(_._2 == "25")

    // The filtered id set is small, so collect it on the driver and broadcast
    // it once to every executor instead of shuffling a join.
    // (Explicit [String]: an untyped HashSet() infers HashSet[Nothing].)
    val finalUserIdSet = mutable.HashSet[String]() ++ userAgeInterval.map(_._1).collect()
    val finalUserIdSetBroadCast = sscc.broadcast(finalUserIdSet)

    // movies.dat is small too; a plain driver-side Map is fine here.
    val movieId2Name = moviesRDD.map(_.split("::")).map(x => (x(0), x(1))).collect().toMap
    val finalUserInfos = ratingsRDD.map(_.split("::")).map(x => (x(0), x(1)))
      .filter(x => finalUserIdSetBroadCast.value.contains(x._1))
      .map(x => (x._2, 1)) // count ratings per movie within the bracket
      .reduceByKey(_ + _).sortBy(_._2, ascending = false).take(19)
      .map(x => (movieId2Name.getOrElse(x._1, x._1), x._2)) // fall back to the id — never null

    println("top n by age: ")
    finalUserInfos.foreach(println) // e.g. (Men in Black (1997),971)

    // To keep the driver alive for Web-UI inspection, block on input instead
    // of the old `while (true) {}` (which made sscc.stop() unreachable):
    // scala.io.StdIn.readLine("press ENTER to exit")
    sscc.stop()
  }
}
0 0
- Spark RDD 实现电影点评用户行为分析 (Scala)
- 《Spark商业案例与性能调优实战100课》第1课:商业案例之通过RDD实现分析大数据电影点评系统中电影的用户行为信息
- 《Spark商业案例与性能调优实战100课》第2课:商业案例之通过RDD实现分析大数据电影点评系统中电影流行度分析
- 《Spark商业案例与性能调优实战100课》第2课:商业案例之通过RDD实现分析大数据电影点评系统中电影流行度分析
- Spark日志分析项目Demo(4)--RDD使用,用户行为统计分析
- 《Spark商业案例与性能调优实战100课》第6课:商业案例之通过Spark SQL实现大数据电影用户行为分析
- Spark RDD算子/SparkSQL分别实现对电影数据集的简单数据分析
- spark core之java和scala电影受众分析系统--用户和年龄性别分布
- spark sql之java和scala电影受众分析系统--用户和年龄性别分布
- 《Spark商业案例与性能调优实战100课》第3课:商业案例之通过RDD分析大数据电影点评系统各种类型的最喜爱电影TopN及性能优化技巧
- Spark的RDD检查点实现分析
- Spark 的键值对(pair RDD)操作,Scala实现
- 基于Spark的用户行为路径分析
- spark rdd scala相关使用
- Spark RDD 源码分析
- 《Spark商业案例与性能调优实战100课》第11课:商业案例之通过纯粹通过DataFrame分析大数据电影点评系统仿QQ和微信、淘宝等用户群分析与实战
- Spark-Scala-RDD 入门问题汇总
- Spark项目:知名手机厂商用户行为实时分析系统
- Chrome开发者工具不完全指南(六、插件篇)
- Linux usb子系统(一) _写一个usb鼠标驱动
- 编程思想——程序设计
- 设计模式——工厂方法模式
- solrj的使用,环境准备,工程搭建,索引创建,添加\修改索引,删除索引,查询
- Spark RDD 实现电影点评用户行为分析 (Scala)
- 编写原生android应用
- Linux下MPI环境的安装配置及MPI程序的编译运行
- Site Mesh页面框架
- JavaEE 的EJB容器搭建的错误与解决方法
- 小五音乐
- LeetCode 2. Add Two Numbers
- 使用国内镜像库安装TensorFlow
- ASCII码排序