求学生最喜欢的老师topN两道题

来源:互联网 发布:linux vi 编辑模式 编辑:程序博客网 时间:2024/05/01 19:48

表:

http://bigdata.edu360.cn/laozhang

http://bigdata.edu360.cn/laozhang

 

一、求学生最喜欢的老师的topN

package

import org.apache.spark.rdd.RDD

import org.apache.spark.{SparkConf,SparkContext}

/**
 * Counts how many times each teacher occurs in the input and prints the two
 * teachers with the highest counts.
 *
 * Every input line is treated as a URL; the teacher name is the text after the
 * last "/" of the line.
 */
object FavTeacher {

  /**
   * Runs the job.
   *
   * @param args `args(0)` is the path handed to `SparkContext.textFile`
   * @throws IllegalArgumentException if no input path is supplied
   */
  def main(args: Array[String]): Unit = {
    require(args.nonEmpty, "usage: FavTeacher <input-path>")

    // setMaster("local[*]") selects Spark's local mode, which is convenient for
    // debugging; [*] makes it run with multiple worker threads.
    val conf = new SparkConf().setAppName("FavTeacher").setMaster("local[*]")
    val sc = new SparkContext(conf)

    try {
      // Where the data is read from.
      val lines: RDD[String] = sc.textFile(args(0))

      // Split the URL, take the teacher name and pair it with a count of 1.
      // A line without any "/" yields index -1, so the whole line is the key.
      val teacherAndOne: RDD[(String, Int)] = lines.map { line =>
        val index = line.lastIndexOf("/")
        val teacher = line.substring(index + 1)
        (teacher, 1)
      }

      // Aggregate: sum the ones per teacher.
      val reduced: RDD[(String, Int)] = teacherAndOne.reduceByKey(_ + _)

      // Sort by count, highest first.
      val sorted: RDD[(String, Int)] = reduced.sortBy(_._2, ascending = false)

      // Take the first two entries.
      val top2: Array[(String, Int)] = sorted.take(2)

      println(top2.toBuffer)
    } finally {
      // Release the context even when the job above fails.
      sc.stop()
    }
  }

}

二、求每门功课中学生最喜欢的老师的topN

package

import org.apache.spark.rdd.RDD

import org.apache.spark.{SparkConf,SparkContext}

/**
 * Counts occurrences of every (subject, teacher) pair in the input and prints,
 * for each subject, the pair with the highest count.
 *
 * Every input line is treated as a URL: the teacher is the text after the last
 * "/", and the subject is the text after the last "/" of what precedes it.
 */
object FavTeacher1 {

  /**
   * Runs the job.
   *
   * @param args `args(0)` is the path handed to `SparkContext.textFile`
   * @throws IllegalArgumentException if no input path is supplied
   */
  def main(args: Array[String]): Unit = {
    require(args.nonEmpty, "usage: FavTeacher1 <input-path>")

    val conf = new SparkConf().setAppName("FavTeacher").setMaster("local[*]")
    val sc = new SparkContext(conf)

    try {
      val lines: RDD[String] = sc.textFile(args(0))

      // Split the URL into subject and teacher and pair them with a count of 1.
      // Lines without any "/" (e.g. blank lines) are skipped: for them
      // lastIndexOf returns -1 and substring(0, -1) would throw, failing the job.
      val subjectTeacherAndOne: RDD[((String, String), Int)] = lines
        .filter(_.contains("/"))
        .map { line =>
          val index = line.lastIndexOf("/")
          val subUrl = line.substring(0, index)
          val subIndex = subUrl.lastIndexOf("/")
          val subject = subUrl.substring(subIndex + 1)
          val teacher = line.substring(index + 1)
          ((subject, teacher), 1)
        }

      // Sum the ones per (subject, teacher) pair.
      val reduced: RDD[((String, String), Int)] = subjectTeacherAndOne.reduceByKey(_ + _)

      // Group the counted pairs by subject.
      val grouped: RDD[(String, Iterable[((String, String), Int)])] = reduced.groupBy(_._1._1)

      // Alternative (secondary sort keeping the top 2 per subject, with the key):
      //   grouped.mapValues(it => it.toList.sortBy(_._2).reverse.take(2))

      // Each Iterable holds the counted teachers of exactly one subject.
      val values: RDD[Iterable[((String, String), Int)]] = grouped.values

      // One map call handles one subject: sort ascending by count, reverse, and
      // keep the first entry. groupBy only produces groups for keys that occur,
      // so the list is never empty and head is safe.
      val result: RDD[((String, String), Int)] = values.map { it =>
        it.toList.sortBy(_._2).reverse.head
      }

      val arr = result.collect()

      println(arr.toBuffer)
    } finally {
      // Release the context even when the job above fails.
      sc.stop()
    }
  }

}

原创粉丝点击