Spark Scala Secondary Sort

The example below reads the MovieLens ratings file (data/medium/ratings.dat, fields separated by ::), keeps (user id, movie id) pairs, groups them by user, sorts the keys in descending order, sorts each user's values in descending order, and finally flattens the result back into individual pairs.

package com.second.sortbyspark

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by xxxxx on 3/14/2017.
  */
object SecondarySort {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("second sort").setMaster("local[4]")
    val sc = new SparkContext(sparkConf)

    // Input records such as "1::1193::5::978300760" become pairs like (1,1193), (1,661), ...
    val dataRDD = sc.textFile("data/medium/ratings.dat")
    val splittedRDD = dataRDD.map(_.split("::")).map(x => (x(0), x(1).toInt))

    // Question: what if the value list for a single key is too large to hold in memory?
    // Question: how would the comparison work if the values were strings?
    val groupSort = splittedRDD.groupByKey().sortByKey(false)
      .map(x => (x._1, x._2.toList.sortWith(_ > _)))
    groupSort.take(10).foreach(println)

    // Flatten each (key, sorted values) pair back into individual (key, value) records.
    val finalRDD = groupSort.flatMap { x =>
      val length = x._2.length
      val array = new Array[(String, Int)](length)
      for (elem <- 0 until length) {
        array(elem) = (x._1, x._2(elem))
      }
      array
    }.take(10)
    finalRDD.foreach(println)
  }
}
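On the first question left in the code (what to do when one key's value list is too large): a common answer is to put both fields into a composite key and let the shuffle sort it, so no per-key List is ever built in memory. The sketch below is not from the original post; it assumes the same ratings.dat input, and the object name ScalableSecondarySort, the UserPartitioner class, the 4-partition count, and the descending ordering are all illustrative choices. It partitions by user id only and uses repartitionAndSortWithinPartitions with a custom Ordering on the composite key; note the result is sorted within each partition (records for one user are contiguous and sorted), not globally sorted by key as sortByKey would give.

package com.second.sortbyspark

import org.apache.spark.{Partitioner, SparkConf, SparkContext}

object ScalableSecondarySort {

  // Partition only on the user id so that all records for one user land in
  // the same partition; the value part of the composite key is ignored here.
  class UserPartitioner(override val numPartitions: Int) extends Partitioner {
    override def getPartition(key: Any): Int = key match {
      case (user: String, _) =>
        val mod = user.hashCode % numPartitions
        if (mod < 0) mod + numPartitions else mod // keep the index non-negative
      case _ => 0
    }
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("scalable second sort").setMaster("local[4]"))

    // Build a composite key (user, value); the pair's value side is just a placeholder.
    val pairs = sc.textFile("data/medium/ratings.dat")
      .map(_.split("::"))
      .map(x => ((x(0), x(1).toInt), ()))

    // Order composite keys: user descending, then value descending, mirroring
    // the ordering produced by the groupByKey version above.
    implicit val compositeOrdering: Ordering[(String, Int)] =
      new Ordering[(String, Int)] {
        override def compare(a: (String, Int), b: (String, Int)): Int = {
          val byUser = b._1.compareTo(a._1)
          if (byUser != 0) byUser else b._2.compareTo(a._2)
        }
      }

    // The shuffle sorts each partition by the composite key, so no per-key
    // in-memory list is ever materialized.
    val sorted = pairs
      .repartitionAndSortWithinPartitions(new UserPartitioner(4))
      .map { case ((user, value), _) => (user, value) }

    sorted.take(10).foreach(println)
    sc.stop()
  }
}

On the second question (comparing string values): the original sortWith(_ > _) also works when the values are strings, because Scala strings support lexicographic comparison with >, so only the element type of the pair needs to change.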
