Spark 排序原理
来源:互联网 发布:PHP地址匹配正则 编辑:程序博客网 时间:2024/06/14 12:00
Spark基本排序原理
- 经典wordcount排序原理,单词个数降序
Java版BasicSort
/**
 * Classic word-count sort: counts the words in a text file and prints
 * them in descending order of frequency.
 */
public class BasicSort {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName(BasicSort.class.getSimpleName())
                .setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("E:/test/word.txt");
        // Split every line into individual words.
        JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")));
        // (word, 1) pairs, then sum the counts per word.
        JavaPairRDD<String, Integer> counts = words
                .mapToPair(word -> new Tuple2<String, Integer>(word, 1))
                .reduceByKey((a, b) -> a + b);
        // sortByKey only sorts on the key, so swap to (count, word),
        // sort descending, then swap back before collecting.
        List<Tuple2<String, Integer>> sorted = counts
                .mapToPair(t -> new Tuple2<Integer, String>(t._2, t._1))
                .sortByKey(false)
                .map(t -> new Tuple2<String, Integer>(t._2, t._1))
                .collect();
        for (Tuple2<String, Integer> t : sorted) {
            System.out.println(t._1() + "---->" + t._2());
        }
    }
}
Scala版本BasicSort
/** Word count with a descending sort on the count (Scala version). */
object wordcount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wordcount").setMaster("local")
    val sc = new SparkContext(conf)

    // Count the occurrences of every word in the input file.
    val counts = sc.textFile("E:/test/word.txt")
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)

    // sortByKey orders on the key only, so swap to (count, word),
    // sort descending, then swap back.
    val result = counts
      .map { case (word, n) => (n, word) }
      .sortByKey(false)
      .map { case (n, word) => (word, n) }
      .collect

    result.foreach(println)
  }
}
- Spark 二次排序
Java版本 Spark二次排序
/**
 * Secondary sort: each input line holds two integers "first second";
 * lines are sorted ascending on the first column and descending on the
 * second.
 */
public class SecondSortApp {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName(SecondSortApp.class.getSimpleName())
                .setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("E:/test/sort.txt");
        // Key every line with a composite SecondSort key, sort by it,
        // then print the original lines in sorted order.
        List<Tuple2<SecondSort, String>> collect = lines
                .mapToPair(line -> new Tuple2<SecondSort, String>(
                        new SecondSort(line.split(" ")[0], line.split(" ")[1]), line))
                .sortByKey()
                .collect();
        for (Tuple2<SecondSort, String> t : collect) {
            System.out.println(t._2());
        }
    }
}

/**
 * Composite sort key: ascending on {@code first}, descending on the second
 * column. Must be Serializable because Spark ships the key to executors.
 */
class SecondSort implements Comparable<SecondSort>, Serializable {
    private int first;
    private int sencod; // (sic) second column; name kept so getSencod/setSencod stay stable

    public SecondSort(int first, int sencod) {
        this.first = first;
        this.sencod = sencod;
    }

    public SecondSort(String first, String second) {
        this.first = Integer.valueOf(first.trim());
        this.sencod = Integer.valueOf(second.trim());
    }

    public SecondSort() {
    }

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSencod() {
        return sencod;
    }

    public void setSencod(int sencod) {
        this.sencod = sencod;
    }

    @Override
    public int compareTo(SecondSort o) {
        // Integer.compare avoids the overflow bug of "first - o.first"
        // (e.g. Integer.MIN_VALUE minus a positive value wraps around
        // and yields the wrong sign).
        int ret = Integer.compare(first, o.first);
        if (ret == 0) {
            // Reversed operand order: descending on the second column.
            ret = Integer.compare(o.sencod, sencod);
        }
        return ret;
    }
}
Scala版本二次排序
/** Secondary sort (Scala version): sorts lines by a composite key. */
object SecondSortAPP {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SecondSortAPP").setMaster("local[2]")
    val sc = new SparkContext(conf)

    val linesRDD = sc.textFile("E:/test/sort.txt")
    // Pair each line with its composite key, sort on the key, print the lines.
    val sorted = linesRDD
      .map { line =>
        val cols = line.split(" ")
        (new SecondSort(cols(0), cols(1)), line)
      }
      .sortByKey()
      .collect()
    sorted.foreach(pair => println(pair._2))
  }
}

/**
 * Composite key ordered by `first`, then `second`, both ascending.
 *
 * NOTE(review): unlike the Java version, this compares the columns as
 * strings (so "10" sorts before "2") and sorts the second column
 * ascending rather than descending — confirm whether that difference
 * is intentional.
 */
class SecondSort(val first: String, val second: String)
    extends Ordered[SecondSort] with Serializable {

  def getFirst() = first

  def getSecond() = second

  override def compare(that: SecondSort): Int = {
    val byFirst = first.compareTo(that.first)
    if (byFirst != 0) byFirst else second.compareTo(that.second)
  }
}
- Spark topN
Java版本Spark topN问题
public class TopN {public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName(TopN.class.getName()).setMaster("local"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> linesRDD = sc.textFile("E:/test/topn.txt"); int topn = Integer.valueOf(args[0]); Broadcast<Integer> topN = sc.broadcast(topn); JavaPairRDD<String, Iterable<String>> result = linesRDD.mapToPair(line -> new Tuple2<String, String>(line.split(" ")[0], line.split(" ")[1])) .groupByKey().mapToPair(x -> { TreeSet<String> set = new TreeSet<String>(new Mycomparator() { @Override public int compare(String o1, String o2) { int ret = o1.compareTo(o2); if (ret == 0){ //不去重 ret = 1; } return ret; } }); for (String sorce : x._2()) { set.add(sorce); if (set.size() > topN.value()) { set.pollLast(); } } return new Tuple2<String, Iterable<String>>(x._1, set); }).sortByKey(); result.foreach(x -> System.out.println(x)); }}//因为对象序列化在这里没有用,比较器也需要序列化interface Mycomparator extends Comparator<String>,Serializable{}
Scala版 Spark topN问题
/**
 * Grouped top-N (Scala version): input lines are "key value"; for each key
 * the 3 smallest values are kept (duplicates preserved) and the groups are
 * printed in key order.
 */
object TopN {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TopN").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val linesRDD = sc.textFile("E:/test/topn.txt")
    linesRDD
      .map(line => new Tuple2[String, String](line.split(" ")(0), line.split(" ")(1)))
      .groupByKey()
      .sortByKey()
      .map(x => MyTopN(3, x))
      .foreach(x => println(x))
  }

  // NOTE(review): the published source was truncated at this point; the
  // signature below is reconstructed from the call site `MyTopN(3, x)` and
  // the surviving "...ing]]):Tuple2[String,Iterable[String]]" fragment.
  def MyTopN(topn: Int, tuple: Tuple2[String, Iterable[String]]): Tuple2[String, Iterable[String]] = {
    // Unlike Java, Scala's TreeSet takes its custom Ordering in a SECOND
    // argument list, not as a constructor argument in the first one.
    // Ties deliberately return 1 so equal values are not deduplicated.
    var set = mutable.TreeSet[String]()(new Ordering[String]() {
      override def compare(x: String, y: String): Int = {
        val ret = x.compareTo(y)
        if (ret == 0) 1 else ret
      }
    })
    for (s <- tuple._2) {
      set += s
      if (set.size > topn) {
        set = set.take(topn) // keep only the topn smallest values
      }
    }
    new Tuple2[String, mutable.Iterable[String]](tuple._1, set)
  }
}
0 0
- Spark 排序原理
- spark原理
- spark原理介绍
- spark原理介绍
- spark RDD的原理
- spark streaming原理
- Spark 之Streaming--原理
- spark基本工作原理
- Spark原理介绍
- Spark GraphX原理介绍
- Spark核心编程原理
- spark原理详解
- Spark Streaming原理介绍
- Spark UI界面原理
- spark on hive原理
- Spark计算引擎原理
- Spark UI界面原理
- Spark执行原理概述
- 在eclipse中使用Tomcat8.0时出现Could not publish server ...错误
- UVA 1331 Minimax Triangulation
- salt源码安装软件和yum安装软件
- Android ActionBar完全解析,使用官方推荐的最佳导航栏(下)
- 第一个布局
- Spark 排序原理
- Python合并字典键值并去除重复元素
- jquery easy ui中根据第一个下拉框框选中的值,设置第二个下拉框是否可以编辑
- mount挂载
- 创建第一个node.js应用
- 图片存档
- Fortran 中的五种基本数据类型和派生类型介绍
- C语言-指针操作
- 安卓之recycleview