Spark: sortBy, sortByKey, and Secondary Sort


Sample data

Each record is (exam-room number, class number, student number) --> sort by exam-room number ascending, class number ascending, student number descending.

1 1 3
1 1 4
1 2 8
1 3 7
3 2 9
3 5 11
1 4 13
1 5 12
2 1 14
2 1 10
2 4 1
2 3 5
2 4 6
3 5 2
3 2 15
1 1 16
2 2 17
3 3 18
2 2 19
3 3 20

sortBy

package com.spark.sort

import org.apache.spark.{SparkConf, SparkContext}

import scala.reflect.ClassTag

/**
  * Created by wqh on 2017/9/11.
  */
object TestsortBy extends App {
    val conf = new SparkConf()
    conf.setAppName("TestsortBy").setMaster("local[4]")
    val sc = new SparkContext(conf)

    val rdd1 = sc.textFile("/Users/wqh/Desktop/data/s.txt", 4)
    // split each line into its three fields
    val rdd2 = rdd1.map(_.split(" ")).map(t => (t(0), t(1), t(2)))

    // custom Ordering: first field ascending, second ascending, third descending;
    // compare the fields numerically so that e.g. "10" does not sort before "2"
    val mysortBy = new Ordering[Tuple3[String, String, String]] {
        override def compare(x: (String, String, String), y: (String, String, String)): Int = {
            val r = x._1.toInt.compare(y._1.toInt)
            val r2 = x._2.toInt.compare(y._2.toInt)
            if (r == 0) {
                if (r2 == 0) y._3.toInt - x._3.toInt else r2
            } else r
        }
    }

    val rdd3 = rdd2.sortBy(x => x)(mysortBy, ClassTag.apply[Tuple3[String, String, String]](classOf[Tuple3[String, String, String]]))
    rdd3.collect().foreach(println)
}
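The explicit Ordering and ClassTag can be avoided entirely: sortBy only needs an ordering for the key it extracts, so converting the fields to Int and negating the third one lets the default Tuple3 ordering express the same ascending/ascending/descending sort. A minimal sketch, not from the original post (rdd2 is the tuple RDD built above):

// same ordering with no hand-written Ordering instance: ascending on the
// first two fields, descending on the third via negation
val rdd3b = rdd2.sortBy(t => (t._1.toInt, t._2.toInt, -t._3.toInt))
rdd3b.collect().foreach(println)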

sortByKey

package com.spark.sort

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by wqh on 2017/9/11.
  */
object TestsortByKey extends App {
    val conf = new SparkConf()
    conf.setAppName("TestsortByKey").setMaster("local[4]")
    val sc = new SparkContext(conf)

    val rdd1 = sc.textFile("/Users/wqh/Desktop/data/s.txt", 4)
    // the key must be a Tuple3[Int, Int, Int]: that is the type the implicit
    // Ordering below is defined for, so sortByKey can pick it up
    val rdd2 = rdd1.map(_.split(" ")).map(t => ((t(0).toInt, t(1).toInt, t(2).toInt), 1))

    // implicit Ordering: first field ascending, second ascending, third descending
    implicit val mysort = new Ordering[Tuple3[Int, Int, Int]] {
        override def compare(x: (Int, Int, Int), y: (Int, Int, Int)): Int = {
            val r = x._1.compare(y._1)
            val r2 = x._2.compare(y._2)
            if (r == 0) {
                if (r2 == 0) y._3 - x._3 else r2
            } else r
        }
    }

    val rdd3 = rdd2.sortByKey().collect()
    for ((k, v) <- rdd3) { println(k) }
    // rdd3.keys.foreach(println) does not compile: after collect(), rdd3 is an
    // Array of (key, value) pairs, and Array has no .keys method
}
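Note that the implicit Ordering is resolved against the key type when Spark converts the pair RDD via OrderedRDDFunctions, so it must match that type exactly: with (String, String, String) keys, an Ordering[(Int, Int, Int)] would be silently ignored in favor of the default string-tuple ordering, which is why the map above converts the fields to Int. To print only the keys, project them while the data is still an RDD, before collect(); a one-line sketch:

// .keys exists on the pair RDD itself, so project first, collect after
rdd2.sortByKey().keys.collect().foreach(println)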

SecondarySort

Secondary sort with sortByKey: rebuild the fields that drive the sort into a composite key, and keep the whole line as the value.

package com.spark.sort

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by wqh on 2017/9/11.
  */
object SecondarySort extends App {
    val conf = new SparkConf()
    conf.setAppName("SecondarySort").setMaster("local[4]")
    val sc = new SparkContext(conf)

    val rdd1 = sc.textFile("/Users/wqh/Desktop/data/s.txt", 4)
    // wrap the three fields in a composite key and keep the whole line as the value
    val rdd2 = rdd1.map(line => {
        val r = line.split(" ")
        val key = new SecondarySortKey(r(0).toInt, r(1).toInt, r(2).toInt)
        (key, line)
    })

    // sort by the composite key, then drop the key and keep the original line
    val res = rdd2.sortByKey().map(t => t._2)
    res.collect().foreach(println)
}

The composite key class: SecondarySortKey

package com.spark.sort

/**
  * Created by wqh on 2017/9/12.
  */
class SecondarySortKey(val first: Int, val second: Int, val third: Int)
        extends Ordered[SecondarySortKey] with Serializable {

    // first field ascending, second ascending, third descending; the key must be
    // Serializable because it travels across the shuffle during the sort
    override def compare(other: SecondarySortKey): Int = {
        val r = first.compare(other.first)
        val r2 = second.compare(other.second)
        if (r == 0) {
            if (r2 == 0) other.third - this.third else r2
        } else r
    }
}
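An alternative worth noting: a case class is Serializable by default and gets equals/hashCode for free, and Ordering.by keeps the three-level comparison declarative. A sketch under those assumptions (the name SortKey is illustrative, not from the original post):

// case-class key with a declarative ordering: ascending on the first two
// fields, descending on the third via negation
case class SortKey(first: Int, second: Int, third: Int)

object SortKey {
    // placed in the companion object so sortByKey finds it implicitly
    implicit val ordering: Ordering[SortKey] =
        Ordering.by((k: SortKey) => (k.first, k.second, -k.third))
}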

Result

1 1 16
1 1 4
1 1 3
1 2 8
1 3 7
1 4 13
1 5 12
2 1 14
2 1 10
2 2 19
2 2 17
2 3 5
2 4 6
2 4 1
3 2 15
3 2 9
3 3 20
3 3 18
3 5 11
3 5 2