Spark的二次排序解决方案
来源:互联网 发布:22周排畸b超数据看男女 编辑:程序博客网 时间:2024/06/06 00:07
一、MapReduce/Hadoop的二次排序解决方案(点击打开)
二、Spark的二次排序解决方案
方案1:同(一)的方案1,将一个给定键的所有值读取并缓存到内存中的一个集合(如 List 或数组 Array)里,然后对这些值进行排序。如果某个键的值太多、内存放不下,则该方案无法使用。
方案2:同(一)的方案2,“会为自然键增加部分或整个值来创建一个组合键以实现排序目标”
三、代码实现(一):使用Spark Java API 中的groupByKey实现方案1(Java原始语法、lambda语法)(注:下面贴出的示例类 SecondarySortUsingCombineByKey 实际调用的是 combineByKey(),而不是 groupByKey())
package ercipaixu_spark1;

// STEP-0: import required Java/Spark classes.
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
//
import scala.Tuple2;
//
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.Function2;
//

/**
 * SecondarySortUsingCombineByKey class implements the secondary sort design pattern
 * by using combineByKey().
 *
 * Input:
 *
 *    name, time, value
 *    x,2,9
 *    y,2,5
 *    x,1,3
 *    y,1,7
 *    y,3,1
 *    x,3,6
 *    z,1,4
 *    z,2,8
 *    z,3,7
 *    z,4,0
 *    p,1,10
 *    p,3,60
 *    p,4,40
 *    p,6,20
 *
 * Output: generate a time-series looking like this:
 *
 *       t1  t2    t3  t4  t5    t6
 *  x => [3,  9,    6]
 *  y => [7,  5,    1]
 *  z => [4,  8,    7,  0]
 *  p => [10, null, 60, 40, null, 20]
 *
 *  x => [(1,3), (2,9), (3,6)]            where 1 < 2 < 3
 *  y => [(1,7), (2,5), (3,1)]            where 1 < 2 < 3
 *  z => [(1,4), (2,8), (3,7), (4,0)]     where 1 < 2 < 3 < 4
 *  p => [(1,10), (3,60), (4,40), (6,20)] where 1 < 3 < 4 < 6
 *
 * @author Mahmoud Parsian
 *
 */
public class SecondarySortUsingCombineByKey {

    /** Sample input used when no paths are given on the command line. */
    private static final String DEFAULT_INPUT_PATH =
            "/media/chenjie/0009418200012FF3/ubuntu/sample_input.txt";

    /** Sample output directory used when no paths are given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH =
            "/media/chenjie/0009418200012FF3/ubuntu/sample_output";

    /**
     * Reads "name,time,value" records, groups them by name with combineByKey() into a
     * SortedMap keyed by time, prints debug output and saves the result as text.
     *
     * @param args args[0] = input path, args[1] = output path; when fewer than two
     *             arguments are supplied the built-in sample paths are used instead
     * @throws Exception if the Spark job fails
     */
    public static void main(String[] args) throws Exception {
        // STEP-1: read input parameters and validate them.
        // Command-line paths win; the sample paths are only a fallback so that the
        // program can still be started without arguments (e.g. from an IDE).
        if (args.length < 2) {
            System.err.println("Usage: SecondarySortUsingCombineByKey <input> <output>");
            System.err.println("No <input>/<output> given, falling back to the sample paths.");
            args = new String[] { DEFAULT_INPUT_PATH, DEFAULT_OUTPUT_PATH };
        }
        String inputPath = args[0];
        System.out.println("inputPath=" + inputPath);
        String outputPath = args[1];
        System.out.println("outputPath=" + outputPath);

        // STEP-2: Connect to the Spark master by creating JavaSparkContext object
        final JavaSparkContext ctx = SparkUtil.createJavaSparkContext("local", "spark1");
        try {
            // STEP-3: Use ctx to create JavaRDD<String>
            // input record format: <name><,><time><,><value>
            JavaRDD<String> lines = ctx.textFile(inputPath, 1);

            // STEP-4: create (key, value) pairs from JavaRDD<String> where
            // key is the {name} and value is a pair of (time, value).
            // The resulting RDD will be JavaPairRDD<String, Tuple2<Integer, Integer>>.
            // PairFunction<T, K, V>: T => Tuple2(K, V)
            // where T = String (input line), K = String, V = Tuple2<Integer, Integer>
            System.out.println("=== DEBUG STEP-4 ===");
            JavaPairRDD<String, Tuple2<Integer, Integer>> pairs =
                    lines.mapToPair(new PairFunction<String, String, Tuple2<Integer, Integer>>() {
                @Override
                public Tuple2<String, Tuple2<Integer, Integer>> call(String s) {
                    String[] tokens = s.split(","); // x,2,5
                    System.out.println(tokens[0] + "," + tokens[1] + "," + tokens[2]);
                    Tuple2<Integer, Integer> timevalue = new Tuple2<Integer, Integer>(
                            Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]));
                    return new Tuple2<String, Tuple2<Integer, Integer>>(tokens[0], timevalue);
                }
            });

            // STEP-5: validate STEP-4, we collect all values from JavaPairRDD<> and print it.
            List<Tuple2<String, Tuple2<Integer, Integer>>> output = pairs.collect();
            for (Tuple2<String, Tuple2<Integer, Integer>> t : output) {
                Tuple2<Integer, Integer> timevalue = t._2;
                // print name,time,value (second field is the time, third the value)
                System.out.println(t._1 + "," + timevalue._1 + "," + timevalue._2);
            }

            // How to use combineByKey(): to use combineByKey(), you
            // need to define 3 basic functions f1, f2, f3:
            // and then you invoke it as: combineByKey(f1, f2, f3)
            //    function 1: create a combiner data structure
            //    function 2: merge a value into a combined data structure
            //    function 3: merge two combiner data structures

            // function 1: create a combiner data structure
            // Here, the combiner data structure is a SortedMap<Integer,Integer>,
            // which keeps track of (time, value) for a given key
            // Tuple2<Integer, Integer> = Tuple2<time, value>
            // SortedMap<Integer, Integer> = SortedMap<time, value>
            Function<Tuple2<Integer, Integer>, SortedMap<Integer, Integer>> createCombiner =
                    new Function<Tuple2<Integer, Integer>, SortedMap<Integer, Integer>>() {
                @Override
                public SortedMap<Integer, Integer> call(Tuple2<Integer, Integer> x) {
                    Integer time = x._1;
                    Integer value = x._2;
                    SortedMap<Integer, Integer> map = new TreeMap<>();
                    map.put(time, value);
                    return map;
                }
            };

            // function 2: merge a value into a combined data structure
            // (a later value for an already present time replaces the earlier one)
            Function2<SortedMap<Integer, Integer>, Tuple2<Integer, Integer>, SortedMap<Integer, Integer>> mergeValue =
                    new Function2<SortedMap<Integer, Integer>, Tuple2<Integer, Integer>, SortedMap<Integer, Integer>>() {
                @Override
                public SortedMap<Integer, Integer> call(SortedMap<Integer, Integer> map,
                                                        Tuple2<Integer, Integer> x) {
                    Integer time = x._1;
                    Integer value = x._2;
                    map.put(time, value);
                    return map;
                }
            };

            // function 3: merge two combiner data structures
            Function2<SortedMap<Integer, Integer>, SortedMap<Integer, Integer>, SortedMap<Integer, Integer>> mergeCombiners =
                    new Function2<SortedMap<Integer, Integer>, SortedMap<Integer, Integer>, SortedMap<Integer, Integer>>() {
                @Override
                public SortedMap<Integer, Integer> call(SortedMap<Integer, Integer> map1,
                                                        SortedMap<Integer, Integer> map2) {
                    // The previous size comparison selected the very same call in both
                    // branches, so it is collapsed into that single call.
                    // NOTE(review): if DataStructures.merge expects (smaller, larger),
                    // the arguments should be ordered by size here - confirm against
                    // DataStructures, which is not part of this listing.
                    return DataStructures.merge(map1, map2);
                }
            };

            // STEP-6: create sorted (time, value)
            JavaPairRDD<String, SortedMap<Integer, Integer>> combined = pairs.combineByKey(
                    createCombiner,
                    mergeValue,
                    mergeCombiners);

            // STEP-7: validate STEP-6, we collect all values from JavaPairRDD<> and print it.
            System.out.println("=== DEBUG STEP-6 ===");
            List<Tuple2<String, SortedMap<Integer, Integer>>> output2 = combined.collect();
            for (Tuple2<String, SortedMap<Integer, Integer>> t : output2) {
                String name = t._1;
                SortedMap<Integer, Integer> map = t._2;
                System.out.println(name);
                System.out.println(map);
            }

            // persist output
            combined.saveAsTextFile(outputPath);
        } finally {
            // done! release the context even when one of the steps above failed
            ctx.close();
        }

        // exit
        System.exit(0);
    }
}
其中用到的比较器类:因为我们是按时间进行比较,所以只比较 (int,int) 二元组(即 (时间, 值))的第一个分量(时间)
package ercipaixu_spark1;

import scala.Tuple2;
import java.util.Comparator;
import java.io.Serializable;

/**
 * Comparator for Tuple2<Integer, Integer> objects that looks at the first
 * tuple component only; the second component never influences the result.
 *
 * The class is Serializable and is exposed through the shared
 * {@link #INSTANCE}; it cannot be instantiated from outside.
 *
 * @author Mahmoud Parsian
 *
 */
public class SparkTupleComparator
        implements Comparator<Tuple2<Integer, Integer>>, Serializable {

    /** The only instance of this comparator. */
    public static final SparkTupleComparator INSTANCE = new SparkTupleComparator();

    /** Private: use {@link #INSTANCE}. */
    private SparkTupleComparator() {
    }

    /**
     * Compares two tuples by their first component.
     *
     * @param t1 the left-hand tuple
     * @param t2 the right-hand tuple
     * @return a negative, zero or positive number when the first component of
     *         t1 is smaller than, equal to or greater than that of t2
     */
    @Override
    public int compare(Tuple2<Integer, Integer> t1, Tuple2<Integer, Integer> t2) {
        final Integer leftFirst = t1._1;
        final Integer rightFirst = t2._1;
        return leftFirst.compareTo(rightFirst);
    }
}
四、代码实现(二):使用Spark Java API 中的combineByKey实现方案1(Java原始语法、lambda语法)
五、代码实现(三):使用Spark Java API 中的repartitionAndSortWithinPartitions实现方案1
六、代码实现(四):使用Scala语言实现方案2
import org.apache.spark.Partitioner

/**
 * Custom Partitioner that places composite `(String, Int)` keys by the hash of
 * their String component only, so the Int component never affects the partition.
 *
 * Any other non-null key is placed by its own `hashCode`; a `null` key goes to
 * partition 0.
 *
 * NOTE(review): when the String component itself embeds more than the natural key
 * (for example "name-time"), records sharing the natural key can end up in
 * different partitions as soon as more than one partition is used - confirm
 * against the caller's key layout.
 *
 * @param partitions number of partitions; must be greater than zero
 * @throws IllegalArgumentException if `partitions` is zero or negative
 */
class CustomPartitioner(partitions: Int) extends Partitioner {
  // The check rejects 0 as well as negative numbers, so the message says "positive".
  require(partitions > 0, s"Number of partitions ($partitions) must be positive.")

  /** The number of partitions this partitioner distributes keys over. */
  override def numPartitions: Int = partitions

  /**
   * Maps a key to a partition index in `[0, numPartitions)`.
   *
   * `math.abs` is applied after the modulo, so the operand is always within
   * `(-numPartitions, numPartitions)` and the result is never negative.
   */
  override def getPartition(key: Any): Int = key match {
    case (k: String, _: Int) => math.abs(k.hashCode % numPartitions)
    case null                => 0
    case _                   => math.abs(key.hashCode % numPartitions)
  }

  /** Two CustomPartitioners are equal when they have the same number of partitions. */
  override def equals(other: Any): Boolean = other match {
    case h: CustomPartitioner => h.numPartitions == numPartitions
    case _                    => false
  }

  /** Consistent with `equals`: derived from the number of partitions only. */
  override def hashCode: Int = numPartitions
}
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Secondary sort with a composite key and `repartitionAndSortWithinPartitions`.
 *
 * Every input record `name,time,value` becomes `(("name-time", value), value)`.
 * The records are then partitioned with [[CustomPartitioner]] and sorted inside each
 * partition in DESCENDING order of the composite key, and finally written out as
 * `("name-time", value)`.
 *
 * The "name-time" part of the key is compared as a String, so times are ordered
 * lexicographically ("10" sorts between "1" and "2"), not numerically.
 */
object SecondarySort {

  /** Sample input used when no input path is passed on the command line. */
  private val DefaultInputPath =
    "file:///media/chenjie/0009418200012FF3/ubuntu/sample_input.txt"

  /** Sample output directory used when no output path is passed on the command line. */
  private val DefaultOutputPath =
    "file:///media/chenjie/0009418200012FF3/ubuntu/sample_output"

  /**
   * Runs the job locally.
   *
   * @param args optional: `args(0)` = input path, `args(1)` = output path; each
   *             missing argument falls back to the corresponding sample path
   */
  def main(args: Array[String]): Unit = {
    val partitions = 1 // a single partition
    val inputPath = args.lift(0).getOrElse(DefaultInputPath)
    val outputPath = args.lift(1).getOrElse(DefaultOutputPath)

    // run Spark locally
    val conf = new SparkConf().setAppName("CJResult").setMaster("local")
    val sc = new SparkContext(conf)

    try {
      // Implicit Ordering (not a conversion) picked up by repartitionAndSortWithinPartitions:
      // descending by the String component, ties broken by the Int component, also descending.
      implicit val tupleOrderingDesc: Ordering[(String, Int)] = new Ordering[(String, Int)] {
        override def compare(x: (String, Int), y: (String, Int)): Int = {
          val byKey = y._1.compare(x._1)
          if (byKey != 0) byKey else y._2.compare(x._2)
        }
      }

      // Build ((name-time, value), value): the value is copied into the key so that
      // it takes part in the sort. Blank lines (e.g. a trailing newline) are skipped
      // because they cannot be split into three fields.
      val valueToKey = sc.textFile(inputPath)
        .filter(_.trim.nonEmpty)
        .map { record =>
          val fields = record.split(",")
          val value = fields(2).toInt
          ((fields(0) + "-" + fields(1), value), value)
        }

      val sorted = valueToKey.repartitionAndSortWithinPartitions(new CustomPartitioner(partitions))

      // drop the value from the composite key: ((name-time, value), value) => (name-time, value)
      val result = sorted.map { case ((nameTime, _), value) => (nameTime, value) }

      result.saveAsTextFile(outputPath)
    } finally {
      // always release the context, also when the job fails
      sc.stop()
    }
  }
}
阅读全文
0 0
- Spark的二次排序解决方案
- Spark的高级排序(二次排序)
- Spark的高级排序(二次排序)
- Spark中的二次排序
- spark 二次排序实现
- spark二次排序
- Spark二次排序
- spark二次排序
- spark简单二次排序
- spark二次排序
- Spark Scala 二次排序
- Spark之二次排序
- Spark分组二次排序
- Spark Scala 二次排序
- Spark Java 二次排序
- Spark二次排序
- MapReduce/Hadoop的二次排序解决方案
- Spark二次排序学习总结
- 技术文章
- TCP/IP数据包结构详解
- Android 线程通信(Handler + Message + Looper) 4
- HttpPost
- CHAPTER 10-Introduction to Artifcial Neural Networks
- Spark的二次排序解决方案
- Date与LocalDateTime、LocalDate、LocalTime互转
- 丘成桐在CNCC会议的演讲全文
- 程序员,30岁,如何趁 AI 浪潮突破年薪天花板?
- 为什么Java开发人员都带眼镜 | 程序员搞笑段子合集
- ofbiz——上传工具类HttpRequestFileUpload修改优化
- 【SpringBoot】RestTemplate调用报错:'org.springframework.web.client.RestTemplate' that could not be found.
- 自定义view画圆跟着鼠标移动
- 圆2