A Simple Application of Accumulators and Broadcast Variables in Spark Streaming

package spark

/**
 * Monitor data arriving over the network, filter the words in the stream
 * against a blacklist held in a broadcast variable, and use an accumulator
 * to count how many records were filtered out.
 */
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.Accumulator
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Singleton object WordBlacklist, used to lazily register the broadcast variable.
 */
object WordBlacklist {
  @volatile private var instance: Broadcast[Seq[String]] = null

  def getInstance(sc: SparkContext): Broadcast[Seq[String]] = {
    if (instance == null) {
      synchronized { // double-checked locking: create the broadcast at most once
        if (instance == null) {
          val wordBlacklist = Seq("hello", "world") // the blacklisted words
          instance = sc.broadcast(wordBlacklist) // once created, shipped to all executors
        }
      }
    }
    instance
  }
}

/**
 * Singleton object DroppedWordsCounter, used to lazily register the accumulator.
 */
object DroppedWordsCounter {
  @volatile private var instance: Accumulator[Int] = null

  def getInstance(sc: SparkContext): Accumulator[Int] = {
    if (instance == null) { // create the accumulator only if it does not exist yet
      synchronized {
        if (instance == null) {
          // Spark 1.x accumulator API (sc.longAccumulator on Spark 2.x+);
          // records how many word occurrences were dropped
          instance = sc.accumulator(0)
        }
      }
    }
    instance
  }
}

object Streaming_AccumulatorAndBroadcast {

  // Encapsulates the business logic and builds the StreamingContext
  def createContext(ip: String, port: Int, outputPath: String,
                    checkpointDirectory: String): StreamingContext = {
    // If "Creating new context" is not printed, the StreamingContext was
    // recovered from the checkpoint directory rather than created anew
    println("Creating new context")

    // Create the StreamingContext with a 5-second batch interval
    val sparkConf = new SparkConf()
      .setAppName("Streaming_AccumulatorAndBroadcast")
      .setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    ssc.checkpoint(checkpointDirectory)

    // Create a socket stream on ip:port; each line carries comma-separated words
    val lines = ssc.socketTextStream(ip, port)
    val words = lines.flatMap(_.split(","))
    val wordCounts = words.map((_, 1)).reduceByKey(_ + _) // per-batch word counts
    // e.g. wordCounts = { RDD1 = {(hello,2),(spark,1)}, RDD2 = {(world,1),(spark,3)}, ... }

    wordCounts.foreachRDD { (rdd, time) =>
      // Get or register the blacklist broadcast variable (Seq("hello", "world"));
      // rdd.sparkContext returns the SparkContext the RDD was created with
      val blacklist = WordBlacklist.getInstance(rdd.sparkContext)
      // Get or register the dropped-words accumulator (initial value 0)
      val droppedWordsCounter = DroppedWordsCounter.getInstance(rdd.sparkContext)

      // Filter words against the blacklist; count the dropped occurrences
      val filteredRdd = rdd.filter { case (word, count) =>
        if (blacklist.value.contains(word)) { // is it "hello" or "world"?
          droppedWordsCounter.add(count)
          println("the word: " + word + " is deleted " + count + " times")
          false // drop the record
        } else {
          true // keep the record
        }
      }

      // Write each batch to its own directory; reusing one fixed path would fail
      // on the second batch because saveAsTextFile refuses to overwrite
      filteredRdd.saveAsTextFile(outputPath + "-" + time.milliseconds)
      filteredRdd.foreach(println) // print the surviving (word, count) pairs
      println("the accumulator is " + droppedWordsCounter.value + "!!!!!!!!!!!!!")
    }
    ssc // return the new StreamingContext
  }

  def main(args: Array[String]) {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    if (args.length != 4) {
      System.err.println("Your arguments were " + args.mkString("[", ", ", "]"))
      System.exit(1)
    }
    // The output and checkpoint parent directories must exist beforehand
    val Array(ip, port, outputPath, checkpointDirectory) = args

    // Recover the context from the checkpoint directory, or create a new one
    val ssc = StreamingContext.getOrCreate(checkpointDirectory,
      () => createContext(ip, port.toInt, outputPath, checkpointDirectory))
    ssc.start()
    ssc.awaitTermination()
  }
}
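A note on the two singletons: broadcast variables and accumulators cannot be restored from a Spark Streaming checkpoint, which is why they sit behind lazily initializing getInstance methods. When the driver restarts and the context is rebuilt from the checkpoint directory, the recovered foreachRDD code simply calls getInstance again and fresh instances are created on first use; the double-checked locking ensures this happens at most once per JVM.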

When running the program, four run arguments must be configured manually in the IDE: the ip, the port, the output path, and the checkpoint path, separated by spaces.
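For example, an argument line might look like this (the host, port, and paths below are placeholders; adjust them to your environment):

localhost 9999 /tmp/streaming/output /tmp/streaming/checkpoint

Since the version above appends the batch timestamp to the output path, each batch is written to its own directory; only the parent directory needs to exist.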

As before, the simulator and source file from an earlier post are used to send data on port 9999.
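That simulator is not reproduced here; a minimal stand-in sketch, assuming all it needs to do is accept one connection on port 9999 and emit comma-separated words line by line, could look like this:

import java.io.PrintWriter
import java.net.ServerSocket
import scala.util.Random

// Hypothetical stand-in for the simulator from the earlier post: serves
// comma-separated words on port 9999, one line per second, until killed.
object DataSimulator {
  def main(args: Array[String]): Unit = {
    val words = Array("hello", "world", "spark", "streaming")
    val server = new ServerSocket(9999)
    println("Waiting for the streaming job to connect on port 9999 ...")
    val socket = server.accept() // blocks until socketTextStream connects
    val out = new PrintWriter(socket.getOutputStream, true) // autoflush each line
    val rnd = new Random
    while (true) {
      // Three random words per line, comma-separated to match split(",")
      out.println(Array.fill(3)(words(rnd.nextInt(words.length))).mkString(","))
      Thread.sleep(1000)
    }
  }
}

Alternatively, nc -lk 9999 lets you type comma-separated lines by hand.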


When the main program runs, the simulator connects and begins sending data; the main program listens, processes each batch, and prints the filtered results and the accumulator value.
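As a rough illustration (a sketch, assuming the first batch contains the single line hello,spark,hello,world): reduceByKey yields (hello,2), (spark,1), (world,1); hello and world hit the blacklist, so in local mode the console would print something like:

the word: hello is deleted 2 times
the word: world is deleted 1 times
(spark,1)
the accumulator is 3!!!!!!!!!!!!!

The first two lines come from inside the filter tasks, so their ordering relative to the driver-side lines can vary.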
