1.SparkRDD join案例

来源:互联网 发布:淘宝无线视频看不到了 编辑:程序博客网 时间:2024/06/05 04:51

1.本实例主要假设有两个数据源,一个是使用了SparkSQL的API读取JSON文件,另一个数据源来自于自己构造的数据信息(实例中的peopleInformation)。

2.实例演示了JSON文件的读取,和RDD的JOIN操作,以及Schema,RDD和ROW之间的有趣关系;

3.具体实例如下:

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * Demonstrates joining two data sources with Spark 1.6:
 *  1. a JSON file read via SQLContext (name + score),
 *  2. an in-memory array of JSON strings (name + age).
 *
 * Students with score > 90 are selected from the file, their ages are looked
 * up in the second source via an RDD join, and the (name, age, score) result
 * is turned back into a DataFrame with an explicit schema and printed.
 */
object SparkSQLWithJoin {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("SparkSQLWithJoin")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // 1. Read the JSON file into a DataFrame (path is a placeholder — adjust locally).
    val df = sqlContext.read.json("xxxxxx\\spark-1.6.0-bin-hadoop2.6\\examples\\src\\main\\resources\\peoples.json")

    // 2. Register it as a temporary table so it can be queried with SQL.
    df.registerTempTable("peopleScores")

    // 3. Names and scores of students scoring above 90.
    val excellentStudentDF = sqlContext.sql("select name,score from peopleScores where score > 90")

    // 4. Collect just the names to the driver for building the IN clause below.
    val excellentStudent = excellentStudentDF.rdd.map(row => row(0)).collect()

    // 5. Simulated second data source: one JSON record per element.
    val peopleInformation = Array(
      "{\"name\":\"Michael\", \"age\":20}",
      "{\"name\":\"Andy\", \"age\":17}",
      "{\"name\":\"Justin\", \"age\":19}",
      "{\"name\":\"zhangsan\", \"age\":25}")
    peopleInformation.foreach(println)

    // 6.-7. Parallelize the JSON strings and parse them into a DataFrame.
    val peopleInformationRDD = sc.parallelize(peopleInformation)
    val peopleInformationDF = sqlContext.read.json(peopleInformationRDD)

    // 8. Register the second source as a temporary table.
    peopleInformationDF.registerTempTable("peopleInformations")

    // 9. Build the IN-list with mkString instead of a mutable var + index loop.
    //    NOTE(security): string-built SQL is injection-prone; embedded single
    //    quotes are escaped here, but in real code prefer a DataFrame filter
    //    (e.g. col("name").isin(...)) over concatenated SQL.
    //    NOTE: if no student scored > 90 this produces "in ()", which is
    //    invalid SQL — same limitation as the original example.
    val inList = excellentStudent
      .map(name => "'" + name.toString.replace("'", "''") + "'")
      .mkString(",")
    val sqltext = s"select name,age from peopleInformations where name in ($inList)"
    println(sqltext)

    // 10. Look up (name, age) for the excellent students.
    val excellentNameAgeDF = sqlContext.sql(sqltext)

    // 11. Convert both DataFrames to pair RDDs keyed by name and join them.
    //     JSON numbers parse as Long, hence getLong(...).toInt.
    val excellentNameAgeRDD =
      excellentNameAgeDF.rdd.map(row => (row.getString(0), row.getLong(1).toInt))
    val excellentNameScoreRDD =
      excellentStudentDF.rdd.map(row => (row.getString(0), row.getLong(1).toInt))
    val resultRDD = excellentNameAgeRDD.join(excellentNameScoreRDD)

    // BUG FIX: the original built Row(item._1, item._2._1, item._2._1),
    // emitting the age twice — the "score" column held the age value.
    // The score is the second element of the joined pair.
    val resultRowRDD = resultRDD.map { case (name, (age, score)) => Row(name, age, score) }

    // 12. Explicit schema for the joined result.
    val schema = StructType(
      StructField("name", StringType, nullable = true) ::
      StructField("age", IntegerType, nullable = true) ::
      StructField("score", IntegerType, nullable = true) :: Nil)

    val personInfoDF = sqlContext.createDataFrame(resultRowRDD, schema)
    personInfoDF.show()

    // Collect and print each resulting row on the driver.
    personInfoDF.rdd.collect().foreach(println)
  }
}


2 0
原创粉丝点击