Advanced Sorting in Spark (Secondary Sort)


To sort on multiple dimensions, several fields have to be compared together, which requires defining a custom key.

1. Test data

1 23
3 22
3 31
1 12
2 11
4 45
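For reference, sorting first by the first field and then by the second should produce the output shown in section 3:

1 12
1 23
2 11
3 22
3 31
4 45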

2. Java implementation

2.1 Custom key

The key implements the scala.math.Ordered interface together with the Serializable interface.

package com.chb.sparkDemo.secondarySort;

import java.io.Serializable;

import scala.math.Ordered;

/**
 * Custom key for Spark secondary sort.
 * Implements scala.math.Ordered so that sortByKey can compare keys.
 * @author 12285
 */
public class MyKey implements Ordered<MyKey>, Serializable {
    private int firstKey;
    private int secondKey;

    public MyKey(int firstKey, int secondKey) {
        super();
        this.firstKey = firstKey;
        this.secondKey = secondKey;
    }

    public int getFirstKey() {
        return firstKey;
    }
    public int getSecondKey() {
        return secondKey;
    }
    public void setFirstKey(int firstKey) {
        this.firstKey = firstKey;
    }
    public void setSecondKey(int secondKey) {
        this.secondKey = secondKey;
    }

    public boolean $greater(MyKey other) {
        if (this.getFirstKey() > other.getFirstKey()) {
            return true;
        } else if (this.getFirstKey() == other.getFirstKey() && this.getSecondKey() > other.getSecondKey()) {
            return true;
        } else {
            return false;
        }
    }

    public boolean $greater$eq(MyKey other) {
        if ($greater(other) || (this.getFirstKey() == other.getFirstKey() && this.getSecondKey() == other.getSecondKey())) {
            return true;
        }
        return false;
    }

    public boolean $less(MyKey other) {
        if (this.getFirstKey() < other.getFirstKey()) {
            return true;
        } else if (this.getFirstKey() == other.getFirstKey() && this.getSecondKey() < other.getSecondKey()) {
            return true;
        } else {
            return false;
        }
    }

    public boolean $less$eq(MyKey other) {
        if ($less(other) || (this.getFirstKey() == other.getFirstKey() && this.getSecondKey() == other.getSecondKey())) {
            return true;
        }
        return false;
    }

    public int compare(MyKey other) {
        if (this.getFirstKey() != other.getFirstKey()) {
            return this.getFirstKey() - other.getFirstKey();
        } else {
            return this.getSecondKey() - other.getSecondKey();
        }
    }

    public int compareTo(MyKey other) {
        if (this.getFirstKey() != other.getFirstKey()) {
            return this.getFirstKey() - other.getFirstKey();
        } else {
            return this.getSecondKey() - other.getSecondKey();
        }
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + firstKey;
        result = prime * result + secondKey;
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        MyKey other = (MyKey) obj;
        if (firstKey != other.firstKey)
            return false;
        if (secondKey != other.secondKey)
            return false;
        return true;
    }
}
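As a minimal sketch (not part of the original post), the ordering can be checked directly with two keys taken from the sample data:

// Hypothetical quick check of MyKey ordering
public class MyKeyCheck {
    public static void main(String[] args) {
        MyKey a = new MyKey(1, 23);
        MyKey b = new MyKey(1, 12);
        System.out.println(a.compareTo(b) > 0); // true: same firstKey, 23 > 12
        System.out.println(a.$greater(b));      // true: Ordered's ">" maps to $greater
    }
}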

2.2 Implementation steps

Step 1: define a custom key that implements the scala.math.Ordered and Serializable interfaces.
Step 2: load the data to be sorted into an RDD of <key, value> pairs.
Step 3: call sortByKey to perform the secondary sort on the custom key.
Step 4: drop the key and keep only the sorted values.

2.2.1 Step 2: load the data into an RDD of <key, value> pairs (the custom key from step 1 is the MyKey class above)

JavaPairRDD<MyKey, String> mykeyPairs = lines.mapToPair(new PairFunction<String, MyKey, String>() {
    private static final long serialVersionUID = 1L;
    public Tuple2<MyKey, String> call(String line) throws Exception {
        int firstKey = Integer.valueOf(line.split(" ")[0]);
        int secondKey = Integer.valueOf(line.split(" ")[1]);
        MyKey mykey = new MyKey(firstKey, secondKey);
        return new Tuple2<MyKey, String>(mykey, line);
    }
});

2.2.2 Step 3: use sortByKey to perform the secondary sort on the custom key

    JavaPairRDD<MyKey, String> sortPairs = mykeyPairs.sortByKey();
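sortByKey() sorts in ascending order by default. As a minimal sketch (not part of the original post), passing false reverses the order, which is what the Scala version in section 4 does:

// Hypothetical: descending secondary sort (ascending is the default)
JavaPairRDD<MyKey, String> sortPairsDesc = mykeyPairs.sortByKey(false);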

2.2.3 Step 4: drop the key and keep only the sorted values

JavaRDD<String> result = sortPairs.map(new Function<Tuple2<MyKey, String>, String>() {
    private static final long serialVersionUID = 1L;
    public String call(Tuple2<MyKey, String> tuple) throws Exception {
        return tuple._2; // the original line
    }
});

// print the sorted result
result.foreach(new VoidFunction<String>() {
    private static final long serialVersionUID = 1L;
    public void call(String line) throws Exception {
        System.out.println(line);
    }
});
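Note that foreach runs on the executors, so on a real cluster the println output lands in the executor logs rather than on the driver console. An alternative sketch (not from the original post), assuming the sorted result is small enough to fit in driver memory, is to collect it first:

// Collect the sorted result to the driver and print it there
for (String line : result.collect()) {
    System.out.println(line);
}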

3. Complete Java code

package com.chb.sparkDemo.secondarySort;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Spark secondary sort, step by step:
 * Step 1: define a custom key that implements scala.math.Ordered and Serializable
 * Step 2: load the data to be sorted into an RDD of <key, value> pairs
 * Step 3: call sortByKey to sort by the custom key
 * Step 4: drop the key and keep only the sorted values
 * @author 12285
 */
public class SecordSortTest {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("WordCount");
        // JavaSparkContext wraps the underlying SparkContext
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // read the input file, one record per line (backed by a HadoopRDD)
        JavaRDD<String> lines = jsc.textFile("C:\\Users\\12285\\Desktop\\test");

        // Step 2: load the data into an RDD of <key, value> pairs
        JavaPairRDD<MyKey, String> mykeyPairs = lines.mapToPair(new PairFunction<String, MyKey, String>() {
            private static final long serialVersionUID = 1L;
            public Tuple2<MyKey, String> call(String line) throws Exception {
                int firstKey = Integer.valueOf(line.split(" ")[0]);
                int secondKey = Integer.valueOf(line.split(" ")[1]);
                MyKey mykey = new MyKey(firstKey, secondKey);
                return new Tuple2<MyKey, String>(mykey, line);
            }
        });

        // Step 3: sort by the custom key
        JavaPairRDD<MyKey, String> sortPairs = mykeyPairs.sortByKey();

        // Step 4: drop the key and keep only the sorted values
        JavaRDD<String> result = sortPairs.map(new Function<Tuple2<MyKey, String>, String>() {
            private static final long serialVersionUID = 1L;
            public String call(Tuple2<MyKey, String> tuple) throws Exception {
                return tuple._2; // the original line
            }
        });

        // print the sorted result
        result.foreach(new VoidFunction<String>() {
            private static final long serialVersionUID = 1L;
            public void call(String line) throws Exception {
                System.out.println(line);
            }
        });
    }
}

Result:

1 12
1 23
2 11
3 22
3 31
4 45

4. Scala implementation

4.1 Custom key

class SecordSortKey(val firstKey: Int, val secondKey: Int)
    extends Ordered[SecordSortKey] with Serializable {

  override def compare(that: SecordSortKey): Int = {
    if (this.firstKey != that.firstKey) {
      this.firstKey - that.firstKey
    } else {
      this.secondKey - that.secondKey
    }
  }
}

4.2 Implementation

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object SecordSortTest {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("SecordSort")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("C:\\Users\\12285\\Desktop\\test")

    // Step 2: load the data into an RDD of <key, value> pairs
    val pairSortKey = lines.map { line =>
      (new SecordSortKey(line.split(" ")(0).toInt, line.split(" ")(1).toInt), line)
    }

    // Step 3: sort by the custom key; false means descending order here
    val sortPair = pairSortKey.sortByKey(false)

    // Step 4: keep only the original lines and print them, one per line
    val sortResult = sortPair.map(line => line._2)
    sortResult.collect().foreach { x => println(x) }
  }
}