sparkStreaming
来源:互联网 发布:mes软件供应商 编辑:程序博客网 时间:2024/04/28 21:52
package kafka

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.util.LongAccumulator

// not necessary since Spark 1.3

/**
 * Spark Streaming word-count example (adapted from the official programming guide).
 *
 * Reads lines from a socket on localhost:9999, counts words per 1-second batch,
 * shows a DataFrame-based count via Spark SQL, and additionally filters out a
 * broadcast "blacklist" of words while counting how many occurrences were
 * dropped in a LongAccumulator.
 *
 * The broadcast and accumulator are created lazily as singletons so they can be
 * re-created after a driver restart from checkpoint (the pattern recommended by
 * the Spark Streaming guide for fault-tolerant accumulators/broadcasts).
 */
object sparkStreaming {

  // Lazily-initialized broadcast of blacklisted words.
  // @volatile + double-checked locking makes the singleton safe to read from
  // multiple threads without taking the lock on every access.
  @volatile private var blacklistBroadcast: Broadcast[Seq[String]] = null

  /**
   * Gets (or creates on first use) the singleton broadcast of blacklisted words.
   *
   * @param sc the SparkContext used to create the broadcast on first call
   * @return the shared Broadcast[Seq[String]] blacklist
   */
  def getWordBlacklist(sc: SparkContext): Broadcast[Seq[String]] = {
    if (blacklistBroadcast == null) {
      synchronized {
        if (blacklistBroadcast == null) {
          blacklistBroadcast = sc.broadcast(Seq("a", "b", "c"))
        }
      }
    }
    blacklistBroadcast
  }

  // Lazily-initialized accumulator counting dropped (blacklisted) word occurrences.
  @volatile private var droppedWordsCounter: LongAccumulator = null

  /**
   * Gets (or creates on first use) the singleton accumulator that counts how
   * many word occurrences were dropped by the blacklist filter.
   *
   * @param sc the SparkContext used to register the accumulator on first call
   * @return the shared LongAccumulator named "WordsInBlacklistCounter"
   */
  def getDroppedWordsCounter(sc: SparkContext): LongAccumulator = {
    if (droppedWordsCounter == null) {
      synchronized {
        if (droppedWordsCounter == null) {
          droppedWordsCounter = sc.longAccumulator("WordsInBlacklistCounter")
        }
      }
    }
    droppedWordsCounter
  }

  def main(args: Array[String]): Unit = {
    // Local two-thread master: one thread receives from the socket,
    // one processes batches. 1-second batch interval.
    val conf = new SparkConf().setMaster("local[2]").setAppName("sparkStreaming")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Stream of text lines from a netcat-style server (`nc -lk 9999`).
    val lines = ssc.socketTextStream("localhost", 9999)

    // Split each line into words.
    val words = lines.flatMap(_.split(" "))

    // For each micro-batch, run a SQL word count over a temporary view.
    words.foreachRDD { rdd =>
      // Get the singleton instance of SparkSession (reuses the streaming conf).
      val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._

      // Convert RDD[String] to a single-column DataFrame.
      val wordsDataFrame = rdd.toDF("word")

      // Create a temporary view and do the word count in SQL.
      wordsDataFrame.createOrReplaceTempView("words")
      val wordCountsDataFrame =
        spark.sql("select word, count(*) as total from words group by word")
      wordCountsDataFrame.show()
    }

    // Count each word in each batch via the classic pair-RDD path.
    val pairs = words.map(word => (word, 1))
    val wordCounts = pairs.reduceByKey(_ + _)

    // Print the first ten elements of each RDD generated in this DStream.
    wordCounts.print()

    wordCounts.foreachRDD { (rdd: RDD[(String, Int)], time: Time) =>
      // Get or register the blacklist broadcast and the dropped-words accumulator
      // (singletons, so they survive across batches and checkpoint recovery).
      val blacklist = sparkStreaming.getWordBlacklist(rdd.sparkContext)
      val dropped = sparkStreaming.getDroppedWordsCounter(rdd.sparkContext)

      // Drop blacklisted words, accumulating how many occurrences were removed.
      val counts = rdd.filter { case (word, count) =>
        if (blacklist.value.contains(word)) {
          dropped.add(count)
          false
        } else {
          true
        }
      }.collect().mkString("[", ", ", "]")

      // FIX: the original built this string and discarded it; print it as the
      // official example does.
      val output = "Counts at time " + time + " " + counts
      println(output)
    }

    ssc.start()            // Start the computation
    ssc.awaitTermination() // Wait for the computation to terminate
  }
}
0 0
- sparkStreaming
- sparkStreaming
- sparkstreaming
- SparkStreaming
- sparkCookbook5--SparkStreaming
- Kafka->SparkStreaming
- SparkStreaming实战
- SparkStreaming笔记
- sparkStreaming+flume
- sparkstreaming架构
- sparkstreaming+kafka
- SparkStreaming socketTextStream
- SparkStreaming例子
- SparkStreaming实战
- sparkStreaming Window
- sparkstreaming优化
- sparkStreaming总结
- D32 SparkStreaming
- Android支付宝支付
- NDK-builder方式
- UserClickCountAnalytics kafka
- Set集合
- 最短路练习4/poj/3268 /Silver Cow Party
- sparkStreaming
- 80端口被系统占用,关闭后,iis启动不了解决方法
- Python更换pip源
- 系统的简单注解
- win10 解决 WMI Provider Host 占用CPU过高问题
- 记录自已学习之单链表(创建)
- KafkaSparkDemoMain
- C++(笔记)虚方法
- Android平台下线程池管理工具-ThreadPoolHelp