Day19 实现二次排序

来源:互联网 发布:金风科技二期淘宝地址 编辑:程序博客网 时间:2024/05/16 04:45

本文来自王家林大数据梦工厂整理:http://weibo.com/ilovepains

分别用 Java 和 Scala 实现二次排序

分析:

1. 实现 Ordered 和 Serializable 接口,定义用于排序的自定义 key;
2. 加载要进行二次排序的文件,生成 (key, value) 类型的 RDD;
3. 使用 sortByKey 基于自定义的 key 进行排序;
4. 去掉排序用的 key,只保留排序后的 value 作为最终结果。

实现自定义key:

import java.io.Serializable;import scala.math.Ordered;/**  * @author 作者 E-mail:  * @version 创建时间:2016年2月20日 上午12:13:57  * 类说明  */public class SecondSortByKey implements Ordered<SecondSortByKey> , Serializable {    public int getFirst() {        return first;    }    public void setFirst(int first) {        this.first = first;    }    public int getSecond() {        return second;    }    public void setSecond(int second) {        this.second = second;    }    @Override    public boolean equals(Object o) {        if (this == o) return true;        if (o == null || getClass() != o.getClass()) return false;        SecondSortByKey that = (SecondSortByKey) o;        if (first != that.first) return false;        return second == that.second;    }    @Override    public int hashCode() {        int result = first;        result = 31 * result + second;        return result;    }    private  int first, second;    public SecondSortByKey(int first, int second){        this.first = first;        this.second = second;    }    public int compare(SecondSortByKey other) {        if (this.first - other.getFirst() != 0){            return this.first - other.getFirst();        } else {            return  this.second - other.getSecond();        }    }      public boolean $less(SecondSortByKey other) {        if (this.first < other.getFirst() ){            return  true;        } else if (this.first == other.getFirst() && this.second < other.getSecond()){            return true;        }       return  false;    }        public boolean $greater(SecondSortByKey other) {        if (this.first > other.getFirst()){            return  true;        } else if ( this.first == other.getFirst() && this.second > other.getSecond()){ // first equals and second bigger            return true;        }        return false;    }        public boolean $less$eq(SecondSortByKey other) {        if (this.$less(other)){            return true;        } else if (this.first == other.getFirst() && this.second == 
other.getSecond()){            return true;        }        return false;    }       public boolean $greater$eq(SecondSortByKey other) {   //是否相等 每次进行比较        if (this.$greater(other)){            return true;        } else if (this.first == other.getFirst() && this.second == other.getSecond()){            return true;        }        return false;    }        public int compareTo(SecondSortByKey other) {        if (this.first - other.getFirst() != 0){            return this.first - other.getFirst();        } else {            return  this.second - other.getSecond();        }    }}

 实现类:

import org.apache.spark.SparkConf;import org.apache.spark.api.java.JavaPairRDD;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.api.java.function.Function;import org.apache.spark.api.java.function.PairFunction;import org.apache.spark.api.java.function.VoidFunction;import scala.Tuple2;/**  * @author 作者 E-mail:  * @version 创建时间:2016年2月20日 上午6:54:29  * 类说明  */public class SecondArraySort {    // 按照order 和Serializable 实现自定义排序的key    // 将要进行二次排序的文件加载进来生成(key, value) 类型的RDD    //使用sortBykey 基于自定义的key 进行排序    //去除掉排序的值,保留排序的结果    public static void main(String[] args) {        SparkConf conf = new SparkConf().setAppName("sort by spark").setMaster("local");        JavaSparkContext sc = new JavaSparkContext(conf);        JavaRDD<String> lines = sc.textFile("D://googledown//datas.txt");        JavaPairRDD<SecondSortByKey, String> paris = lines.mapToPair(new PairFunction<String, SecondSortByKey, String>() {            private static final long serialVersionID = 1L;                        public Tuple2<SecondSortByKey, String> call(String line) throws Exception {                String[] strs = line.split(" ");                SecondSortByKey sortByKey = new SecondSortByKey(Integer.valueOf(strs[0]), Integer.valueOf(strs[1]));                return new Tuple2<SecondSortByKey, String>(sortByKey, line);            }        });        JavaPairRDD<SecondSortByKey, String> sorted = paris.sortByKey();    //完成二次排序        //key是自己构造的,不需要过滤后的key,保留排序结果       JavaRDD<String> secondSorted =  sorted.map(new Function<Tuple2<SecondSortByKey, String>, String>() {           private static final long serialVersionID = 1L;            public String call(Tuple2<SecondSortByKey, String> sortedContext) throws Exception {                return sortedContext._2();  // 返回value            }        });        secondSorted.foreach(new VoidFunction<String>() {                        public void call(String sorted) throws Exception {            
    System.out.println(sorted);            }        });    }}

pom.xml配置文件

<!-- Unit testing (test scope only). -->
<dependency>
  <groupId>junit</groupId>
  <artifactId>junit</artifactId>
  <version>3.8.1</version>
  <scope>test</scope>
</dependency>

<!-- Spark modules, all built for Scala 2.10 at version 1.6.0. -->
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-core_2.10</artifactId>
  <version>1.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sql_2.10</artifactId>
  <version>1.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-hive_2.10</artifactId>
  <version>1.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-streaming_2.10</artifactId>
  <version>1.6.0</version>
</dependency>

<!-- Hadoop client libraries. -->
<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-client</artifactId>
  <version>2.6.0</version>
</dependency>

<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-streaming-kafka_2.10</artifactId>
  <version>1.6.0</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-graphx_2.10</artifactId>
  <version>1.6.0</version>
</dependency>

测试数据:

2 3
4 1
3 2
4 3
8 7
2 1
9 7
9 8
8 3

运行结果(先按第一列升序,第一列相同时再按第二列升序):

2 1
2 3
3 2
4 1
4 3
8 3
8 7
9 7
9 8


0 0