Spark RDD API Basic Operations

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object RddTest {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("rdd test")
    val sc = new SparkContext(conf)
    //mapTest(sc)
    //val file = sc.textFile("/user-logs-large.txt")
    //file.flatMap(_.split("\\s")).foreach(println)
    //distinctTest(sc)
    //filterTest(sc)
    //keyByTest(sc)
    //sortByTest(sc)
    //rePartitionTest(sc)
    groupBy(sc)
    sc.stop()
  }

  //map, mapPartitions, flatMap
  def mapTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt")
    val mapResult = file.map(x => {
      val info = x.split("\\s")
      (info(0), info(1))
    })
    //mapResult.take(10).foreach(println)

    //partition-wise transformation
    val mapPartitionResult = file.mapPartitions(x => {
      var info = new Array[String](3)
      for (line <- x) yield {
        info = line.split("\\s")
        (info(0), info(1))
      }
    })
    //mapPartitionResult.take(10).foreach(println)

    //turn one new_tweet record into two login records (info(1) is the action field)
    val flatMapResult = file.flatMap(x => {
      val info = x.split("\\s")
      info(1) match {
        case "new_tweet" => for (i <- 1 to 2) yield s"${info(0)} login ${info(2)}"
        case _ => Array(x)
      }
    })
    //flatMapResult.take(10).foreach(println)
  }

  //distinct: deduplicate
  def distinctTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt", 3)
    val userRdd = file.map(_.split("\\t")(0)).distinct()
    userRdd.foreach(println)
  }

  //filter
  def filterTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt", 3)
    val loginFilter = file.filter(_.split("\\s")(1) == "login")
    loginFilter.foreach(println)
    println(loginFilter.count())
  }

  //keyBy: the key is computed by the given function, the value is the original element x
  def keyByTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt", 3)
    val userActionType = file.keyBy(x => {
      val info = x.split("\\s")
      s"${info(0)}-----${info(1)}"
    })
    userActionType.foreach(println)
  }

  //sortBy
  def sortByTest(sc: SparkContext) = {
    val file = sc.textFile("/spark/part-00001")
    val sortByResult = file.sortBy(x => x.split("\\s+")(1).toInt)
    sortByResult.foreach(println)
  }

  //topN
  def topNTest(sc: SparkContext) = {
    val list = List(1, 2, 5, 11, 545, 22, 12, 55)
    val rdd = sc.parallelize(list, 2)
    val takeOrdered = rdd.takeOrdered(3)
    takeOrdered.foreach(println) //ascending by default
    val topN = rdd.top(3)
    topN.foreach(println) //descending by default
  }

  //repartition
  def rePartitionTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt")
    val result = file.repartition(5)
    file.foreachPartition(x => {
      println(s"fileRdd partition, records in this partition: ${x.size}")
    })
    //repartition: wide dependency (full shuffle)
    result.foreachPartition(x => {
      var sum = 0
      x.foreach(_ => sum += 1)
      println(s"resultRdd partition, records in this partition: $sum")
    })
    //coalesce: narrow dependency (no shuffle when reducing the partition count)
    val coalResult = file.coalesce(3)
    coalResult.foreachPartition(x => {
      println(s"coalResultRdd: ${x.size}")
    })
  }

  //groupBy
  def groupBy(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt")
    val groupByRdd = file.groupBy(x => x.split("\\s")(0))
    groupByRdd.foreachPartition(x => {
      println(s"groupByRdd partition, records in this partition: ${x.size}")
    })
    groupByRdd.foreach(x => {
      println(s"one groupByRdd record, key: ${x._1}, records in its group: ${x._2.size}")
    })
    //count the number of logins per user
    groupByRdd.foreach(x => {
      var sum = 0
      x._2.foreach(line => {
        line.split("\\s")(1) match {
          case "login" => sum += 1
          case _ =>
        }
      })
      println(s"user: ${x._1}, login times: $sum")
    })
  }

  //reduce, fold and aggregate
  def aggSumTest(sc: SparkContext) = {
    val list = List(3, 5, 8, 9, 12, 55)
    val rdd = sc.parallelize(list, 3)
    //reduce: compute the sum
    val reduceSum = rdd.reduce(_ + _)
    //fold: compute the sum with a zero value
    val foldResult = rdd.fold(0)(_ + _)
    //aggregate: concatenate the elements into one string
    val aggregateResult = rdd.aggregate("")((c, v) => {
      c match {
        case "" => v.toString
        case _ => s"$c,$v"
      }
    }, (c1, c2) => {
      c1 match {
        case "" => c2
        case _ => s"$c1,$c2"
      }
    })
    println(s"reduceSum: $reduceSum")
    println(s"foldResult: $foldResult")
    println(s"aggregateResult: $aggregateResult")
  }

  //persist / cache
  def persistTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt")
    file.cache() //cache() is shorthand for persist(StorageLevel.MEMORY_ONLY)
    file.persist(StorageLevel.MEMORY_ONLY)
    //count the number of distinct users
    file.map(x => x.split("\\s")(0)).distinct().count
    //count the number of distinct ips
    file.map(x => x.split("\\s")(2)).distinct().count
  }
}
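The code above reads /user-logs-large.txt from the file system, and the split("\\s") calls assume each line is whitespace-separated as "user action ip". The file itself is not shown in the post, so the snippet below is only a minimal sketch with a hypothetical in-memory sample (parallelized instead of read from disk) that lets you try the filter and groupBy steps locally:

import org.apache.spark.{SparkConf, SparkContext}

object RddSampleTest {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("rdd sample test")
    val sc = new SparkContext(conf)

    //hypothetical sample lines in the assumed "user action ip" layout
    val sample = List(
      "alice login 10.0.0.1",
      "alice new_tweet 10.0.0.1",
      "bob login 10.0.0.2",
      "bob logout 10.0.0.2",
      "alice login 10.0.0.3"
    )
    val file = sc.parallelize(sample, 2)

    //same filter as filterTest: keep only login records
    val loginFilter = file.filter(_.split("\\s")(1) == "login")
    println(loginFilter.count()) //prints 3 for this sample

    //same per-user login count as in groupBy
    file.groupBy(_.split("\\s")(0)).foreach { case (user, lines) =>
      val logins = lines.count(_.split("\\s")(1) == "login")
      println(s"user: $user, login times: $logins")
    }

    sc.stop()
  }
}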
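A note on the login-count step in groupBy (my addition, not from the original code): grouping every record per user just to count the login lines shuffles the full text of every line. Mapping to (user, 1) pairs for login records and then using reduceByKey moves only the counts across the network. A sketch of an alternative method in the same style, assuming the same file and field layout and that it is added inside RddTest:

  //count logins per user without materializing the whole group for each key
  def loginCountTest(sc: SparkContext) = {
    val file = sc.textFile("/user-logs-large.txt")
    val loginCounts = file
      .map(_.split("\\s"))
      .filter(info => info.length > 1 && info(1) == "login")
      .map(info => (info(0), 1))
      .reduceByKey(_ + _)
    loginCounts.foreach { case (user, times) =>
      println(s"user: $user, login times: $times")
    }
  }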