Secondary Sort Solutions in Spark


1. Secondary sort solutions in MapReduce/Hadoop (covered in a separate post)


2. Secondary sort solutions in Spark

Option 1: as with option 1 in part 1, read and buffer all values of a given key into an in-memory List/Array structure, then sort those values. If the values of one key do not fit in memory, this approach cannot be used.

Option 2: as with option 2 in part 1, "append part or all of the value to the natural key to create a composite key, so that the framework's own sort achieves the desired ordering." A short sketch of that key transformation follows below; sections 5 and 6 develop it into full implementations.
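To make option 2 concrete, here is a minimal sketch of the key transformation only, written against the Spark Java API. It assumes an RDD of the raw "name,time,value" lines (called lines here, as in the code in section 3); everything else about the job is left out:

// option 2: pull the sort field (time) into the key, so the shuffle can sort on it
// input record "name,time,value"  ->  ((name, time), value)
JavaPairRDD<Tuple2<String, Integer>, Integer> compositeKeyed = lines.mapToPair(s -> {
    String[] tokens = s.split(",");
    return new Tuple2<Tuple2<String, Integer>, Integer>(
            new Tuple2<String, Integer>(tokens[0], Integer.parseInt(tokens[1])),  // composite key (name, time)
            Integer.parseInt(tokens[2]));                                         // value
});
// a custom Partitioner then partitions by name only, and
// repartitionAndSortWithinPartitions sorts each partition by (name, time)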


3. Code implementation (1): option 1 with combineByKey from the Spark Java API (anonymous-class Java syntax)

package ercipaixu_spark1;

// STEP-0: import required Java/Spark classes.
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;

import scala.Tuple2;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.Function2;

/**
 * SecondarySortUsingCombineByKey implements the secondary sort design pattern
 * by using combineByKey().
 *
 * Input:
 *
 *    name, time, value
 *    x,2,9
 *    y,2,5
 *    x,1,3
 *    y,1,7
 *    y,3,1
 *    x,3,6
 *    z,1,4
 *    z,2,8
 *    z,3,7
 *    z,4,0
 *    p,1,10
 *    p,3,60
 *    p,4,40
 *    p,6,20
 *
 * Output: generate a time-series looking like this:
 *
 *       t1    t2   t3   t4   t5    t6
 *  x => [3,   9,   6]
 *  y => [7,   5,   1]
 *  z => [4,   8,   7,   0]
 *  p => [10, null, 60,  40, null, 20]
 *
 *  x => [(1,3), (2,9), (3,6)]            where 1 < 2 < 3
 *  y => [(1,7), (2,5), (3,1)]            where 1 < 2 < 3
 *  z => [(1,4), (2,8), (3,7), (4,0)]     where 1 < 2 < 3 < 4
 *  p => [(1,10), (3,60), (4,40), (6,20)] where 1 < 3 < 4 < 6
 *
 * @author Mahmoud Parsian
 */
public class SecondarySortUsingCombineByKey {

    public static void main(String[] args) throws Exception {
        // hard-coded local test paths; remove these three lines to pass the paths on the command line
        args = new String[2];
        args[0] = "/media/chenjie/0009418200012FF3/ubuntu/sample_input.txt";
        args[1] = "/media/chenjie/0009418200012FF3/ubuntu/sample_output";

        // STEP-1: read input parameters and validate them
        if (args.length < 2) {
            System.err.println("Usage: SecondarySortUsingCombineByKey <input> <output>");
            System.exit(1);
        }
        String inputPath = args[0];
        System.out.println("inputPath=" + inputPath);
        String outputPath = args[1];
        System.out.println("outputPath=" + outputPath);

        // STEP-2: connect to the Spark master by creating a JavaSparkContext object
        // (the original relies on a SparkUtil helper from the book's code base;
        //  building the context from a plain SparkConf is equivalent)
        final JavaSparkContext ctx = new JavaSparkContext(
                new SparkConf().setMaster("local").setAppName("spark1"));

        // STEP-3: use ctx to create a JavaRDD<String>
        // input record format: <name><,><time><,><value>
        JavaRDD<String> lines = ctx.textFile(inputPath, 1);

        // STEP-4: create (key, value) pairs from JavaRDD<String> where
        // key is the {name} and value is a pair of (time, value).
        // The resulting RDD is a JavaPairRDD<String, Tuple2<Integer, Integer>>.
        // PairFunction<T, K, V>: T => Tuple2(K, V) where K=String and V=Tuple2<Integer, Integer>
        System.out.println("=== DEBUG STEP-4 ===");
        JavaPairRDD<String, Tuple2<Integer, Integer>> pairs =
            lines.mapToPair(new PairFunction<String, String, Tuple2<Integer, Integer>>() {
                @Override
                public Tuple2<String, Tuple2<Integer, Integer>> call(String s) {
                    String[] tokens = s.split(","); // x,2,5
                    System.out.println(tokens[0] + "," + tokens[1] + "," + tokens[2]);
                    Tuple2<Integer, Integer> timevalue = new Tuple2<Integer, Integer>(
                            Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]));
                    return new Tuple2<String, Tuple2<Integer, Integer>>(tokens[0], timevalue);
                }
            });

        // STEP-5: validate STEP-4; collect all values from the JavaPairRDD and print them
        List<Tuple2<String, Tuple2<Integer, Integer>>> output = pairs.collect();
        for (Tuple2 t : output) {
            Tuple2<Integer, Integer> timevalue = (Tuple2<Integer, Integer>) t._2;
            System.out.println(t._1 + "," + timevalue._1 + "," + timevalue._2);
        }

        // How to use combineByKey(): define 3 basic functions f1, f2, f3
        // and then invoke it as combineByKey(f1, f2, f3):
        //    function 1: create a combiner data structure
        //    function 2: merge a value into a combiner data structure
        //    function 3: merge two combiner data structures

        // function 1: create a combiner data structure
        // Here the combiner data structure is a SortedMap<Integer, Integer>
        // (SortedMap<time, value>), which keeps track of (time, value) for a given key.
        Function<Tuple2<Integer, Integer>, SortedMap<Integer, Integer>> createCombiner =
            new Function<Tuple2<Integer, Integer>, SortedMap<Integer, Integer>>() {
                @Override
                public SortedMap<Integer, Integer> call(Tuple2<Integer, Integer> x) {
                    Integer time = x._1;
                    Integer value = x._2;
                    SortedMap<Integer, Integer> map = new TreeMap<>();
                    map.put(time, value);
                    return map;
                }
            };

        // function 2: merge a value into a combiner data structure
        Function2<SortedMap<Integer, Integer>, Tuple2<Integer, Integer>, SortedMap<Integer, Integer>> mergeValue =
            new Function2<SortedMap<Integer, Integer>, Tuple2<Integer, Integer>, SortedMap<Integer, Integer>>() {
                @Override
                public SortedMap<Integer, Integer> call(SortedMap<Integer, Integer> map, Tuple2<Integer, Integer> x) {
                    Integer time = x._1;
                    Integer value = x._2;
                    map.put(time, value);
                    return map;
                }
            };

        // function 3: merge two combiner data structures
        // (the original calls a DataStructures.merge() helper from the book's code base;
        //  merging the smaller map into the larger one does the same job)
        Function2<SortedMap<Integer, Integer>, SortedMap<Integer, Integer>, SortedMap<Integer, Integer>> mergeCombiners =
            new Function2<SortedMap<Integer, Integer>, SortedMap<Integer, Integer>, SortedMap<Integer, Integer>>() {
                @Override
                public SortedMap<Integer, Integer> call(SortedMap<Integer, Integer> map1, SortedMap<Integer, Integer> map2) {
                    if (map1.size() < map2.size()) {
                        map2.putAll(map1);
                        return map2;
                    } else {
                        map1.putAll(map2);
                        return map1;
                    }
                }
            };

        // STEP-6: create a sorted (time, value) map per key
        JavaPairRDD<String, SortedMap<Integer, Integer>> combined = pairs.combineByKey(
                createCombiner,
                mergeValue,
                mergeCombiners);

        // STEP-7: validate STEP-6; collect all values from the JavaPairRDD and print them
        System.out.println("=== DEBUG STEP-6 ===");
        List<Tuple2<String, SortedMap<Integer, Integer>>> output2 = combined.collect();
        for (Tuple2<String, SortedMap<Integer, Integer>> t : output2) {
            String name = t._1;
            SortedMap<Integer, Integer> map = t._2;
            System.out.println(name);
            System.out.println(map);
        }

        // persist output
        combined.saveAsTextFile(outputPath);

        // done!
        ctx.close();

        // exit
        System.exit(0);
    }
}
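If only the time-ordered values (without the time keys) are wanted, as in the time-series shown in the class comment, the combined RDD can be flattened with one more mapValues pass. This is a small hedged addition, not part of the original program, and assumes a java.util.ArrayList import:

// keep only the values; they come out ordered by time because TreeMap iterates in key order
JavaPairRDD<String, List<Integer>> timeSeries =
        combined.mapValues(map -> new ArrayList<>(map.values()));
// e.g. x => [3, 9, 6], z => [4, 8, 7, 0]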

The comparator class below compares (time, value) pairs by their first component, the time; it is used by the groupByKey variant in the next section to sort the buffered values of each key.

package ercipaixu_spark1;

import scala.Tuple2;
import java.util.Comparator;
import java.io.Serializable;

/**
 * The SparkTupleComparator class enables us to compare two
 * Tuple2<Integer, Integer> objects based on the first Tuple2
 * argument.
 *
 * @author Mahmoud Parsian
 */
public class SparkTupleComparator
        implements Comparator<Tuple2<Integer, Integer>>, Serializable {

    public static final SparkTupleComparator INSTANCE = new SparkTupleComparator();

    private SparkTupleComparator() {
    }

    @Override
    public int compare(Tuple2<Integer, Integer> t1, Tuple2<Integer, Integer> t2) {
        return t1._1.compareTo(t2._1);
    }
}

4. Code implementation (2): option 1 with groupByKey from the Spark Java API (plain Java / lambda syntax)
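The code for this section is not included in the post. A minimal sketch of the groupByKey variant, reusing the pairs RDD built in STEP-4 of section 3 and the SparkTupleComparator above (the surrounding job setup, plus java.util.ArrayList and java.util.Collections imports, is assumed), might look like this:

// option 1 with groupByKey: gather all (time, value) pairs of a name into memory, then sort them
JavaPairRDD<String, Iterable<Tuple2<Integer, Integer>>> grouped = pairs.groupByKey();

JavaPairRDD<String, List<Tuple2<Integer, Integer>>> sorted = grouped.mapValues(iterable -> {
    // copy the iterable into a list so it can be sorted;
    // this in-memory copy is exactly the limitation of option 1
    List<Tuple2<Integer, Integer>> copy = new ArrayList<>();
    for (Tuple2<Integer, Integer> timeValue : iterable) {
        copy.add(timeValue);
    }
    Collections.sort(copy, SparkTupleComparator.INSTANCE);  // sort by time
    return copy;
});

sorted.saveAsTextFile(outputPath);

The lambda-free version simply replaces the lambda with an anonymous Function<Iterable<Tuple2<Integer, Integer>>, List<Tuple2<Integer, Integer>>>, in the same style as the anonymous classes used in section 3.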

5. Code implementation (3): option 2 with repartitionAndSortWithinPartitions from the Spark Java API
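This section's code is also missing from the post. Under the same assumptions as the sketch in section 2 (a compositeKeyed RDD keyed by (name, time); org.apache.spark.Partitioner, java.util.Comparator and java.io.Serializable imports; an illustrative partition count), the Java API call might look like this:

// partition by name only, so all (name, time) keys of one name land in the same partition
Partitioner byName = new Partitioner() {
    @Override
    public int numPartitions() {
        return 2;  // illustrative partition count
    }
    @Override
    public int getPartition(Object key) {
        @SuppressWarnings("unchecked")
        Tuple2<String, Integer> k = (Tuple2<String, Integer>) key;
        return Math.abs(k._1.hashCode() % numPartitions());
    }
};

// serializable comparator: order composite keys by name, then by time
Comparator<Tuple2<String, Integer>> byNameThenTime =
        (Comparator<Tuple2<String, Integer>> & Serializable) (a, b) ->
                a._1.equals(b._1) ? a._2.compareTo(b._2) : a._1.compareTo(b._1);

// the sort happens inside the shuffle machinery, so no key's values need to fit in memory
JavaPairRDD<Tuple2<String, Integer>, Integer> sortedByNameAndTime =
        compositeKeyed.repartitionAndSortWithinPartitions(byName, byNameThenTime);

sortedByNameAndTime.saveAsTextFile(outputPath);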

6. Code implementation (4): option 2 in Scala

import org.apache.spark.Partitioner

/**
 * A custom Partitioner for the composite key.
 * @param partitions number of partitions
 */
class CustomPartitioner(partitions: Int) extends Partitioner {

  require(partitions > 0, s"Number of partitions ($partitions) must be positive.")

  def numPartitions: Int = partitions

  def getPartition(key: Any): Int = key match {
    case (k: String, v: Int) => math.abs(k.hashCode % numPartitions)
    case null                => 0
    case _                   => math.abs(key.hashCode % numPartitions)
  }

  // equals/hashCode let Spark treat two CustomPartitioners with the same
  // partition count as equal and avoid an unnecessary shuffle
  override def equals(other: Any): Boolean = other match {
    case h: CustomPartitioner => h.numPartitions == numPartitions
    case _                    => false
  }

  override def hashCode: Int = numPartitions
}


import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object SecondarySort {

  def main(args: Array[String]): Unit = {
    val partitions = 1  // number of partitions
    val inputPath = "file:///media/chenjie/0009418200012FF3/ubuntu/sample_input.txt"  // input file path
    val outputPath = "file:///media/chenjie/0009418200012FF3/ubuntu/sample_output"    // output file path

    val conf = new SparkConf().setAppName("CJResult").setMaster("local")  // run Spark locally
    val sc = new SparkContext(conf)

    // read the input file and map every record to the composite pair
    // ((name-time, value), value); the key itself is a (String, Int) pair
    val input = sc.textFile(inputPath)
    val valueToKey = input.map(x => {
      val line = x.split(",")
      ((line(0) + "-" + line(1), line(2).toInt), line(2).toInt)
    })

    // implicit Ordering picked up by repartitionAndSortWithinPartitions:
    // (String, Int) keys are compared by the first component ("name-time") and,
    // only when those are equal, by the second component (the value), in descending order
    implicit def tupleOrderingDesc: Ordering[Tuple2[String, Int]] = new Ordering[Tuple2[String, Int]] {
      override def compare(x: Tuple2[String, Int], y: Tuple2[String, Int]): Int = {
        if (y._1.compare(x._1) == 0) y._2.compare(x._2)
        else y._1.compare(x._1)
      }
    }

    // repartition with the custom partitioner and sort each partition by the composite key;
    // note: the partitioner hashes the whole "name-time" string, so with partitions > 1
    // records of the same name could land in different partitions - hashing only the
    // name part would keep a name's full time series together
    val sorted = valueToKey.repartitionAndSortWithinPartitions(new CustomPartitioner(partitions))

    // drop the value from the composite key: ((name-time, value), value) -> (name-time, value)
    val result = sorted.map {
      case (k, v) => (k._1, v)
    }

    result.saveAsTextFile(outputPath)  // write the result to the output path
    sc.stop()
  }
}



