Simple Spark functions in Java/Scala


1. Spark functions written in Java
package com.css.ideaSpark;

import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class TransformationOperation {
public static void main(String[] args){

    SparkConf conf = new SparkConf()
            .setMaster("local")
            .setAppName("TransformationOperation");
    conf.set("spark.testing.memory", "2147480000");
    /*SparkWordCountFunction(conf);
    ParallellizeFunction(conf);
    FilterFunction(conf);
    FileSizeFunction(conf);
    GroupBykeyFunction(conf);
    reduceByKeyFunction(conf);
    SortBykeyFunction(conf);
    JoinByKeyFunction(conf);
    congroupFunction(conf);
    BroadcastVariable(conf);
    accumulableFunction(conf);
    secondSordFunction(conf);
    TopnFunction(conf);*/
    GroupTopnFunction(conf);
}

public static void SparkWordCountFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> lines = sc.textFile("/opt/modules/hadoop-2.5.0-cdh5.3.6/input.txt");
    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        private static final long serialVersionUID = 1L;
        public Iterable<String> call(String line) throws Exception {
            return Arrays.asList(line.split(" "));
        }
    });
    JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
        private static final long serialVersionUID = 1L;
        public Tuple2<String, Integer> call(String word) throws Exception {
            return new Tuple2<String, Integer>(word, 1);
        }
    });
    JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });
    wordCounts.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        private static final long serialVersionUID = 1L;
        public void call(Tuple2<String, Integer> wordCount) throws Exception {
            System.out.println(wordCount._1 + "------" + wordCount._2 + "times.");
        }
    });
    sc.close();
}

public static void ParallellizeFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5);
    JavaRDD<Integer> numberRDD = sc.parallelize(numbers);
    int sum = numberRDD.reduce(new Function2<Integer, Integer, Integer>() {
        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });
    System.out.println(sum);
    sc.close();
}

public static void FilterFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    JavaRDD<Integer> numRdd = sc.parallelize(list);
    JavaRDD<Integer> selectedNumRdd = numRdd.filter(new Function<Integer, Boolean>() {
        @Override
        public Boolean call(Integer num) throws Exception {
            return num % 2 == 0;
        }
    });
    selectedNumRdd.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer num) throws Exception {
            System.out.println(num);
        }
    });
    sc.close();
}

public static void FileSizeFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> lines = sc.textFile("/opt/modules/hadoop-2.5.0-cdh5.3.6/input.txt");
    JavaRDD<Integer> lineRdd = lines.map(new Function<String, Integer>() {
        @Override
        public Integer call(String line) throws Exception {
            return line.length();
        }
    });
    int filesize = lineRdd.reduce(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });
    System.out.println(filesize);
    sc.close();
}

public static void GroupBykeyFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Tuple2<String, Integer>> scoredata = Arrays.asList(
            new Tuple2<String, Integer>("class1", 70),
            new Tuple2<String, Integer>("class2", 80),
            new Tuple2<String, Integer>("class3", 90),
            new Tuple2<String, Integer>("class2", 60),
            new Tuple2<String, Integer>("class1", 80)
    );
    JavaPairRDD<String, Integer> scoreRdd = sc.parallelizePairs(scoredata);
    JavaPairRDD<String, Iterable<Integer>> scoreGroupRdd = scoreRdd.groupByKey();
    scoreGroupRdd.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
        @Override
        public void call(Tuple2<String, Iterable<Integer>> tuple) throws Exception {
            System.out.println("class:  " + tuple._1);
            Iterator<Integer> it = tuple._2.iterator();
            while (it.hasNext()) {
                System.out.println(it.next());
            }
            System.out.println("==============================");
        }
    });
    sc.close();
}

public static void reduceByKeyFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Tuple2<String, Integer>> scores = Arrays.asList(
            new Tuple2<String, Integer>("class1", 80),
            new Tuple2<String, Integer>("class1", 90),
            new Tuple2<String, Integer>("class2", 80),
            new Tuple2<String, Integer>("class1", 60),
            new Tuple2<String, Integer>("class2", 80)
    );
    JavaPairRDD<String, Integer> scoreRdd = sc.parallelizePairs(scores);
    JavaPairRDD<String, Integer> scoreReduceByKeyRdd = scoreRdd.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer v1, Integer v2) throws Exception {
            return v1 + v2;
        }
    });
    scoreReduceByKeyRdd.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        @Override
        public void call(Tuple2<String, Integer> ScoreTuple) throws Exception {
            System.out.println("class :     " + ScoreTuple._1 + "           scoreSum:     " + ScoreTuple._2);
        }
    });
    sc.close();
}

public static void SortBykeyFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Tuple2<Integer, String>> scores = Arrays.asList(
            new Tuple2<Integer, String>(90, "name1"),
            new Tuple2<Integer, String>(80, "name2"),
            new Tuple2<Integer, String>(88, "name3"),
            new Tuple2<Integer, String>(60, "name4"),
            new Tuple2<Integer, String>(50, "name5")
    );
    JavaPairRDD<Integer, String> scoresRdd = sc.parallelizePairs(scores);
    JavaPairRDD<Integer, String> scoresSortByKeyRdd = scoresRdd.sortByKey(false);
    scoresSortByKeyRdd.foreach(new VoidFunction<Tuple2<Integer, String>>() {
        @Override
        public void call(Tuple2<Integer, String> ScoreTuple) throws Exception {
            System.out.println("score:     " + ScoreTuple._1 + "          name:     " + ScoreTuple._2);
        }
    });
    sc.close();
}

public static void JoinByKeyFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Tuple2<Integer, String>> studensInfo = Arrays.asList(
            new Tuple2<Integer, String>(1, "zhangsan"),
            new Tuple2<Integer, String>(2, "lisi"),
            new Tuple2<Integer, String>(3, "wangwu"),
            new Tuple2<Integer, String>(4, "zhaoliu")
    );
    List<Tuple2<Integer, String>> studentsScore = Arrays.asList(
            new Tuple2<Integer, String>(1, "99"),
            new Tuple2<Integer, String>(2, "88"),
            new Tuple2<Integer, String>(3, "77"),
            new Tuple2<Integer, String>(4, "66")
    );
    JavaPairRDD<Integer, String> studentIfoRdd = sc.parallelizePairs(studensInfo);
    JavaPairRDD<Integer, String> studentsScoreRdd = sc.parallelizePairs(studentsScore);
    JavaPairRDD<Integer, Tuple2<String, String>> joinInfoRdd = studentIfoRdd.join(studentsScoreRdd);
    joinInfoRdd.foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, String>>>() {
        @Override
        public void call(Tuple2<Integer, Tuple2<String, String>> t) throws Exception {
            System.out.println("id:     " + t._1);
            System.out.println("name:    " + t._2._1);
            System.out.println("score:   " + t._2._2);
            System.out.println("======================================");
        }
    });
    sc.close();
}

public static void congroupFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Tuple2<Integer, String>> studensInfo = Arrays.asList(
            new Tuple2<Integer, String>(1, "zhangsan"),
            new Tuple2<Integer, String>(2, "lisi"),
            new Tuple2<Integer, String>(3, "wangwu"),
            new Tuple2<Integer, String>(4, "zhaoliu")
    );
    List<Tuple2<Integer, String>> studentsScore = Arrays.asList(
            new Tuple2<Integer, String>(1, "99"),
            new Tuple2<Integer, String>(2, "88"),
            new Tuple2<Integer, String>(3, "77"),
            new Tuple2<Integer, String>(4, "66"),
            new Tuple2<Integer, String>(1, "78"),
            new Tuple2<Integer, String>(3, "78"),
            new Tuple2<Integer, String>(2, "78")
    );
    JavaPairRDD<Integer, String> studentIfoRdd = sc.parallelizePairs(studensInfo);
    JavaPairRDD<Integer, String> studentsScoreRdd = sc.parallelizePairs(studentsScore);
    JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<String>>> joinInfoRdd = studentIfoRdd.cogroup(studentsScoreRdd);
    joinInfoRdd.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Iterable<String>, Iterable<String>>>>() {
        @Override
        public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<String>>> t) throws Exception {
            System.out.println("id:     " + t._1);
            System.out.println("name:     " + t._2._1);
            System.out.println("score:     " + t._2._2);
            System.out.println("=================================");
        }
    });
    sc.close();
}

public static void BroadcastVariable(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<Integer> numss = Arrays.asList(1, 2, 3, 4, 5);
    JavaRDD<Integer> numsRdd = sc.parallelize(numss);
    final Broadcast broadcastvariable = sc.broadcast(2);
    numsRdd.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer data) throws Exception {
            System.out.println(Integer.valueOf(broadcastvariable.getValue().toString()) * data);
        }
    });
    sc.close();
}

public static void accumulableFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    final Accumulator<Integer> accumulator = sc.accumulator(0);
    List<Integer> numss = Arrays.asList(1, 2, 3, 4, 5);
    JavaRDD<Integer> numsRdd = sc.parallelize(numss);
    numsRdd.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer data) throws Exception {
            accumulator.add(data);
        }
    });
    System.out.println(accumulator.value());
    sc.close();
}

public static void secondSordFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> linesRdd = sc.textFile("/opt/modules/hadoop-2.5.0-cdh5.3.6/data.txt");
    JavaPairRDD<MyKey, String> myKeyPairsRdd = linesRdd.mapToPair(new PairFunction<String, MyKey, String>() {
        @Override
        public Tuple2<MyKey, String> call(String line) throws Exception {
            int firstkey = Integer.valueOf(line.split(" ")[0]);
            int secondkey = Integer.valueOf(line.split(" ")[1]);
            MyKey mykey = new MyKey(firstkey, secondkey);
            return new Tuple2<MyKey, String>(mykey, line);
        }
    });
    JavaPairRDD<MyKey, String> sortedRdd = myKeyPairsRdd.sortByKey();
    JavaRDD<String> resultRDd = sortedRdd.map(new Function<Tuple2<MyKey, String>, String>() {
        @Override
        public String call(Tuple2<MyKey, String> myKeyStringTuple2) throws Exception {
            return myKeyStringTuple2._2;
        }
    });
    resultRDd.foreach(new VoidFunction<String>() {
        @Override
        public void call(String line) throws Exception {
            System.out.println(line);
        }
    });
    sc.close();
}

public static void TopnFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> linesRdd = sc.textFile("/opt/modules/hadoop-2.5.0-cdh5.3.6/data2.txt");
    JavaPairRDD<Integer, String> PairsRdd = linesRdd.mapToPair(new PairFunction<String, Integer, String>() {
        @Override
        public Tuple2<Integer, String> call(String line) throws Exception {
            return new Tuple2<Integer, String>(Integer.valueOf(line), line);
        }
    });
    JavaPairRDD<Integer, String> topnRdd = PairsRdd.sortByKey(false);
    JavaRDD<String> resultRdd = topnRdd.map(new Function<Tuple2<Integer, String>, String>() {
        @Override
        public String call(Tuple2<Integer, String> tuple) throws Exception {
            return tuple._2;
        }
    });
    Iterator<String> It = resultRdd.take(3).iterator();
    while (It.hasNext()) {
        System.out.println(It.next());
    }
    sc.close();
}

public static void GroupTopnFunction(SparkConf conf) {
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> linesRdd = sc.textFile("/opt/modules/hadoop-2.5.0-cdh5.3.6/data3.txt");
    JavaPairRDD<String, Integer> pairRDD = linesRdd.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String line) throws Exception {
            return new Tuple2<String, Integer>(line.split(" ")[0], Integer.valueOf(line.split(" ")[1]));
        }
    });
    JavaPairRDD<String, Iterable<Integer>> groupRdd = pairRDD.groupByKey();
    JavaPairRDD<String, Iterable<Integer>> groupTop3Rdd = groupRdd.mapToPair(new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {
        @Override
        public Tuple2<String, Iterable<Integer>> call(Tuple2<String, Iterable<Integer>> TuplePara) throws Exception {
            String className = TuplePara._1;
            Iterator<Integer> It = TuplePara._2.iterator();
            Integer[] top3 = new Integer[3];
            while (It.hasNext()) {
                Integer score = It.next();
                for (int i = 0; i < 3; i++) {
                    if (top3[i] == null) {
                        top3[i] = score;
                        break;
                    } else if (score > top3[i]) {
                        for (int j = 2; j > i; j--) {
                            top3[j] = top3[j - 1];
                        }
                        top3[i] = score;
                        break;
                    }
                }
            }
            return new Tuple2<String, Iterable<Integer>>(className, Arrays.asList(top3));
        }
    });
    groupTop3Rdd.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
        @Override
        public void call(Tuple2<String, Iterable<Integer>> TuplePara) throws Exception {
            System.out.println("className:    " + TuplePara._1);
            Iterator<Integer> iterator = TuplePara._2.iterator();
            while (iterator.hasNext()) {
                System.out.println("score:    " + iterator.next());
            }
        }
    });
    sc.close();
}

}

// Custom key class used for the secondary sort
package com.css.ideaSpark;

import scala.math.Ordered;

import java.io.Serializable;

public class MyKey implements Ordered<MyKey>, Serializable {

private int firstKey;
private int secondKey;

public MyKey(int firstKey, int secondKey) {
    this.firstKey = firstKey;
    this.secondKey = secondKey;
}

public int getFirstKey() {
    return firstKey;
}

public int getSecondKey() {
    return secondKey;
}

public void setFirstKey(int firstKey) {
    this.firstKey = firstKey;
}

public void setSecondKey(int secondKey) {
    this.secondKey = secondKey;
}

@Override
public boolean $less(MyKey that) {
    if (this.getFirstKey() < that.getFirstKey()) {
        return true;
    } else if ((this.getFirstKey() == that.getFirstKey()) && (this.getSecondKey() < that.getSecondKey())) {
        return true;
    } else {
        return false;
    }
}

@Override
public boolean $greater(MyKey that) {
    if (this.getFirstKey() > that.getFirstKey()) {
        return true;
    } else if ((this.getFirstKey() == that.getFirstKey()) && (this.getSecondKey() > that.getSecondKey())) {
        return true;
    } else {
        return false;
    }
}

@Override
public boolean $less$eq(MyKey that) {
    if ($less(that) || this.getFirstKey() == that.getFirstKey() && this.getSecondKey() == that.getSecondKey()) {
        return true;
    } else {
        return false;
    }
}

@Override
public boolean $greater$eq(MyKey that) {
    if ($greater(that) || this.getFirstKey() == that.getFirstKey() && this.getSecondKey() == that.getSecondKey()) {
        return true;
    } else {
        return false;
    }
}

@Override
public int compare(MyKey that) {
    if (this.getFirstKey() != that.getFirstKey()) {
        return this.getFirstKey() - that.getFirstKey();
    } else {
        return this.getSecondKey() - that.getSecondKey();
    }
}

@Override
public int compareTo(MyKey that) {
    if (this.getFirstKey() != that.getFirstKey()) {
        return this.getFirstKey() - that.getFirstKey();
    } else {
        return this.getSecondKey() - that.getSecondKey();
    }
}

@Override
public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;
    MyKey myKey = (MyKey) o;
    if (firstKey != myKey.firstKey) return false;
    return secondKey == myKey.secondKey;
}

@Override
public int hashCode() {
    final int prime = 31;
    int result = 1;
    result = prime * result + firstKey;
    result = prime * result + secondKey;
    return result;
}

}

2. Spark functions written in Scala
package com.css.scala

import org.apache.spark.{SparkConf, SparkContext}

import scala.util.control.Breaks

object TransformationOperation {

def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setMaster("local")
.setAppName("TransformationOperation")
/* ScalaWordCountFunction(conf)
ParallelizeFunction(conf)
FilterFunction(conf)
FileSizeFunction(conf)
GroupBykeyFunction(conf)
ReduceByKeyFunction(conf)
SortByKeyFunction(conf)
JoinBykeyFunction(conf)
CoGroupFunction(conf)
broadcastVariable(conf)
accmulableFunction(conf)
secondSordFunction(conf)
TopnFunction(conf)*/
GroupTopnFunction(conf)

}

def ScalaWordCountFunction(conf: SparkConf):Unit={

val sc = new SparkContext(conf)
val lines = sc.textFile("/opt/modules/hadoop-2.5.0-cdh5.3.6/input.txt")
val words = lines.flatMap(line => line.split(" "))
val pairs = words.map(word => (word, 1))
val wordCounts = pairs.reduceByKey(_ + _)
wordCounts.foreach(wordCount => println(wordCount._1 + " appeared " + wordCount._2 + " times"))
// lines.flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect().foreach(println)
sc.stop()

}

def ParallelizeFunction(conf:SparkConf): Unit ={

val sc = new SparkContext(conf)
val numbers = Array(1, 2, 3, 4, 5)
val numberRdd = sc.parallelize(numbers)
val sum = numberRdd.reduce(_ + _)
println(sum)
sc.stop()

}

def FilterFunction(conf:SparkConf): Unit ={

val sc = new SparkContext(conf)
val nums = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
val numRdd = sc.parallelize(nums)
// numRdd.filter(num => num % 2 == 0)
numRdd.filter(_ % 2 == 0).foreach(println(_))
sc.stop()

}

def FileSizeFunction(conf :SparkConf): Unit ={

val sc = new SparkContext(conf)
val linesRdd = sc.textFile("/opt/modules/hadoop-2.5.0-cdh5.3.6/input.txt")
val fileSZ = linesRdd.map(line => line.length).reduce(_ + _)
println(fileSZ)
sc.stop()

}

def GroupBykeyFunction(conf :SparkConf): Unit ={

val sc = new SparkContext(conf)
val scores = Array(("class1", 77), ("class2", 80), ("class3", 90), ("class1", 87), ("class3", 78))
val scoresRdd = sc.parallelize(scores)
scoresRdd.groupByKey().foreach(score => {
  println("class:  " + score._1)
  score._2.foreach(scoredata => println(scoredata))
  println("=============================")
})
sc.stop()

}

def ReduceByKeyFunction(conf:SparkConf): Unit ={
val sc= new SparkContext(conf)
val scores = Array(new Tuple2("class1", 80),
  new Tuple2("class1", 80),
  new Tuple2("class2", 80),
  new Tuple2("class2", 80),
  new Tuple2("class1", 80))

val scoresRdd = sc.parallelize(scores)
scoresRdd.reduceByKey(_ + _).foreach(classScore => {
  println("class:   " + classScore._1 + "   scoreSum    " + classScore._2)
})

}

def SortByKeyFunction(conf:SparkConf): Unit ={

val sc = new SparkContext(conf)
val scores = Array(new Tuple2(90, "name1"),
  new Tuple2(80, "name2"),
  new Tuple2(88, "name3"),
  new Tuple2(89, "name4"),
  new Tuple2(80, "name5"),
  new Tuple2(90, "name6"),
  new Tuple2(99, "name7"))
val scoresRdd = sc.parallelize(scores)
scoresRdd.sortBy(_._1, false).foreach(Score => {
  println("score:  " + Score._1 + "   name    " + Score._2)
})

}

def JoinBykeyFunction(conf :SparkConf): Unit ={
val sc= new SparkContext(conf)
val studentsInfo = Array(new Tuple2(1, "name1"),
  new Tuple2(2, "name2"),
  new Tuple2(3, "name3"),
  new Tuple2(4, "name4"),
  new Tuple2(5, "name5"))

val studentsScore = Array(new Tuple2(1, "99"),
  new Tuple2(2, "88"),
  new Tuple2(3, "77"),
  new Tuple2(4, "66"),
  new Tuple2(2, "97"))
val studentsInfoRdd = sc.parallelize(studentsInfo)
val studentsScoreRdd = sc.parallelize(studentsScore)
val joinRdd = studentsInfoRdd.join(studentsScoreRdd)
joinRdd.foreach(joinPara => {
  println("ID:  " + joinPara._1)
  println("name:  " + joinPara._2._1)
  println("score:  " + joinPara._2._2)
  println("============================")
})
sc.stop()

}

def CoGroupFunction(conf:SparkConf): Unit ={

val sc = new SparkContext(conf)
val studentInfo = Array(new Tuple2(1, "name1"),
  new Tuple2(2, "name2"),
  new Tuple2(3, "name3"),
  new Tuple2(4, "name4"),
  new Tuple2(4, "name5"))
val studentScore = Array(new Tuple2(1, "77"),
  new Tuple2(2, "88"),
  new Tuple2(3, "99"),
  new Tuple2(4, "66"),
  new Tuple2(1, "77"),
  new Tuple2(2, "88"),
  new Tuple2(3, "99"),
  new Tuple2(4, "66"))
val studentInfoRdd = sc.parallelize(studentInfo)
val studentScoreRdd = sc.parallelize(studentScore)
val cogroupRdd = studentInfoRdd.cogroup(studentScoreRdd)
cogroupRdd.foreach(congroupValue => {
  println("id:    " + congroupValue._1)
  println("name:  " + congroupValue._2._1)
  println("score:  " + congroupValue._2._2)
})
sc.stop()

}

def broadcastVariable(conf: SparkConf): Unit = {
  val sc = new SparkContext(conf)
  val broadcastvaliale = sc.broadcast(2)
  val arrNum = Array(1, 2, 3, 4, 5)
  val arrNumRdd = sc.parallelize(arrNum)
  arrNumRdd.foreach(data => println(broadcastvaliale.value * data))
  sc.stop()
}

def accmulableFunction(conf: SparkConf): Unit = {
  val sc = new SparkContext(conf)
  val accmulableVariable = sc.accumulator(0)
  val arrNum = Array(1, 2, 3, 4, 5)
  val arrNumRdd = sc.parallelize(arrNum)
  arrNumRdd.foreach(data => {
    accmulableVariable += data
    if (data == 5) println(accmulableVariable)
  })
  sc.stop()
}

def secondSordFunction(conf :SparkConf): Unit ={

val sc = new SparkContext(conf)
val linesRdd = sc.textFile("/opt/modules/hadoop-2.5.0-cdh5.3.6/data.txt")
val pairSortKey = linesRdd.map(line => (
  new SecondSortKey(line.split(" ")(0).toInt, line.split(" ")(1).toInt),
  line))
val sortPairRdd = pairSortKey.sortByKey()
val sortResultRdd = sortPairRdd.map(line => line._2)
sortResultRdd.foreach(x => println(x))
sc.stop()

}

def TopnFunction(conf:SparkConf): Unit ={
val sc= new SparkContext(conf)
val linesRdd = sc.textFile("/opt/modules/hadoop-2.5.0-cdh5.3.6/data2.txt")
val pairRdd = linesRdd.map(line => (line.toInt, line))

val sortPairRdd = pairRdd.sortByKey(false)
val sortResultRdd = sortPairRdd.map(line => line._2)
val top3 = sortResultRdd.take(3)
top3.foreach(x => println(x))
sc.stop()

}

def GroupTopnFunction(conf:SparkConf): Unit ={

val sc = new SparkContext(conf)
val linesRdd = sc.textFile("/opt/modules/hadoop-2.5.0-cdh5.3.6/data3.txt")
val pairRdd = linesRdd.map(line => (line.split(" ")(0), line.split(" ")(1))).groupByKey()
  .map(score => {
    val iterator = score._2.iterator
    val top3 = new Array[Integer](3)
    val loop = new Breaks
    while (iterator.hasNext) {
      val temp = iterator.next()
      loop.breakable(
        for (i <- 0 until 3) {
          if (top3(i) == null) {
            top3(i) = temp.toInt
            loop.break()
          } else if (temp.toInt > top3(i)) {
            for (j <- Range(2, 0, -1) if (j > i)) {
              top3(j) = top3(j - 1)
            }
            top3(i) = temp.toInt
            loop.break()
          }
        }
      )
    }
    println(top3(0))
    println(top3(1))
    println(top3(2))
    new Tuple2(score._1, top3)
  })
pairRdd.foreach(classScore => {
  println("class:   " + classScore._1)
  val iterator = classScore._2.iterator
  while (iterator.hasNext) {
    println("score: " + iterator.next())
  }
})
sc.stop()

}

def test(): Unit ={
var i = 0
for (j <- Range(2, 0, -1) if (j > i)) {
  println("j=" + j + " i=" + i)
}

}
}
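The GroupTopnFunction above builds each class's top three scores with a hand-written insertion loop over an Array[Integer]. For comparison, here is a minimal sketch of the same group-top-3 idea using plain Scala collection operations. It is not part of the original program; the object name is made up, and it assumes the same data3.txt layout the function above implies (one "className score" pair per line).

package com.css.scala

import org.apache.spark.{SparkConf, SparkContext}

// Alternative sketch: group-top-3 via sortWith/take instead of a manual insertion loop.
object GroupTopnAlternative {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("GroupTopnAlternative")
    val sc = new SparkContext(conf)
    // Assumes each line looks like "class1 87", as in GroupTopnFunction above.
    val linesRdd = sc.textFile("/opt/modules/hadoop-2.5.0-cdh5.3.6/data3.txt")
    linesRdd
      .map(line => (line.split(" ")(0), line.split(" ")(1).toInt))
      .groupByKey()
      // Sort each class's scores in descending order and keep the first three.
      .mapValues(scores => scores.toList.sortWith(_ > _).take(3))
      .foreach { case (className, top3) =>
        println("class:   " + className)
        top3.foreach(score => println("score: " + score))
      }
    sc.stop()
  }
}

Like the original groupByKey-based version, this sketch materializes each class's full score list in memory before taking the top three, so it trades efficiency for readability.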

// Custom key class used for the secondary sort
package com.css.scala

class SecondSortKey(val firstKey: Int, val secondKey: Int) extends Ordered[SecondSortKey] with Serializable {
  override def compare(that: SecondSortKey): Int = {
    if (this.firstKey != that.firstKey) {
      this.firstKey - that.firstKey
    } else {
      this.secondKey - that.secondKey
    }
  }
}
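To see what the compare method implies, here is a small standalone sketch (no Spark involved) that sorts a few records with SecondSortKey. The object name and sample values are made up for illustration only: the first key decides the order, and the second key breaks ties.

package com.css.scala

// Hypothetical standalone check of SecondSortKey's ordering (not part of the original program).
object SecondSortKeyDemo {
  def main(args: Array[String]): Unit = {
    val records = List(
      (new SecondSortKey(2, 1), "2 1"),
      (new SecondSortKey(1, 9), "1 9"),
      (new SecondSortKey(1, 3), "1 3")
    )
    // sortWith uses the '<' that Ordered[SecondSortKey] derives from compare,
    // so the expected output order is: "1 3", "1 9", "2 1".
    records.sortWith(_._1 < _._1).map(_._2).foreach(println)
  }
}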
