Spark Streaming demos


From a 炼数成金 (Dataguru) course.

1. Monitoring files in a local directory

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._

object HdfsWordCount {
  def main(args: Array[String]) {
    // Run locally with 2 threads: one to listen for data, one to process it
    val sparkConf = new SparkConf().setAppName("HdfsWordCount").setMaster("local[2]")
    // Create the context with a 20-second batch interval
    val ssc = new StreamingContext(sparkConf, Seconds(20))
    // Monitor the directory for newly created text files
    val lines = ssc.textFileStream("/home/mmicky/temp/")
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
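textFileStream only picks up files created in the watched directory after the stream starts, so to see output you need to drop new files into /home/mmicky/temp/ while the job is running. Below is a minimal test driver for that, a sketch of my own (the object name, file names, and contents are made up, not part of the course code). It writes each file under a dot-prefixed temporary name first and then renames it, so the stream should only see completed files.

import java.io.PrintWriter
import java.nio.file.{Files, Paths}

// Hypothetical test driver: creates one small file per batch interval in the
// directory watched by HdfsWordCount, so textFileStream has new files to pick up.
object DropFiles {
  def main(args: Array[String]): Unit = {
    val dir = "/home/mmicky/temp/"   // the directory HdfsWordCount monitors
    var i = 0
    while (true) {
      // Write under a dot-prefixed name, then rename, so the file only
      // becomes visible to the stream once it is complete.
      val tmp = Paths.get(dir, s".words-$i.tmp")
      val dst = Paths.get(dir, s"words-$i.txt")
      val out = new PrintWriter(tmp.toFile)
      out.println("hello spark streaming hello")
      out.close()
      Files.move(tmp, dst)
      i += 1
      Thread.sleep(20000)            // matches the 20-second batch interval above
    }
  }
}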

2. Monitoring a network socket

 

1) Build a socket server that periodically sends simulated data

   

import java.io.PrintWriter
import java.net.ServerSocket
import scala.io.Source

// Sales simulator. Args: 1) input file to read, 2) port to listen on, 3) send interval in ms
object SaleSimulation {
  // Pick a random line index within the file
  def index(length: Int) = {
    import java.util.Random
    val rdm = new Random
    rdm.nextInt(length)
  }

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println("Usage: <filename> <port> <millisecond>")
      System.exit(1)
    }
    val filename = args(0)
    val lines = Source.fromFile(filename).getLines.toList
    val filerow = lines.length

    val listener = new ServerSocket(args(1).toInt)
    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run = {
          println("Got client connected from: " + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream(), true)
          while (true) {
            Thread.sleep(args(2).toLong)
            // Pick a random line from the file and push it to the client
            val content = lines(index(filerow))
            println(content)
            out.write(content + '\n')
            out.flush()
          }
          socket.close()
        }
      }.start()
    }
  }
}

   

Run: java -cp week5.jar week5.SaleSimulation /home/mmicky/data/spark/people.txt 9999 1000    // reads random lines from people.txt, sends them on port 9999 at 1-second intervals

2) The Spark Streaming receiver side

   

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.storage.StorageLevel

object NetworkWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("NetworkWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // 5-second batch interval
    val ssc = new StreamingContext(sc, Seconds(5))
    // Server address, port, and storage level (serialized in memory, spilling to disk)
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(","))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
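Run (a sketch, assuming this class is also packaged into week5.jar like the simulator above; start SaleSimulation first so something is listening on the port):

spark-submit --class week5.NetworkWordCount week5.jar localhost 9999    // host and port where SaleSimulation is listening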

3. Stateful monitoring (updateStateByKey)

     

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._

object StatefulWordCount {
  def main(args: Array[String]) {
    // Update function required for stateful processing: the first parameter holds the values
    // arriving in the current batch, the second holds the state saved from previous batches
    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.foldLeft(0)(_ + _)  // sum of this batch's values
      val previousCount = state.getOrElse(0)        // 0 if there is no previous state
      Some(currentCount + previousCount)            // new state: previous total plus this batch
    }

    val conf = new SparkConf().setAppName("StatefulWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Create the StreamingContext
    val ssc = new StreamingContext(sc, Seconds(5))
    // Stateful streams must checkpoint so the accumulated state is not lost on failure.
    // Since no checkpoint interval is set here, every incoming batch produces a checkpoint file.
    ssc.checkpoint(".")

    // Receive the data
    val lines = ssc.socketTextStream(args(0), args(1).toInt)
    val words = lines.flatMap(_.split(","))
    val wordCounts = words.map(x => (x, 1))
    // Use updateStateByKey with updateFunc to maintain the running counts
    val stateDstream = wordCounts.updateStateByKey[Int](updateFunc)
    stateDstream.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
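To see what the update function computes in isolation, here is a small standalone sketch of my own (not part of the course code, no Spark required). It replays the function by hand for one key across two batches.

// Standalone sketch of the updateFunc semantics (hypothetical demo object):
object UpdateFuncDemo {
  val updateFunc = (values: Seq[Int], state: Option[Int]) =>
    Some(values.foldLeft(0)(_ + _) + state.getOrElse(0))

  def main(args: Array[String]): Unit = {
    // batch 1: the key appears twice, no previous state -> Some(2)
    val afterBatch1 = updateFunc(Seq(1, 1), None)
    // batch 2: the key appears once, previous state Some(2) -> Some(3)
    val afterBatch2 = updateFunc(Seq(1), afterBatch1)
    println(s"$afterBatch1 $afterBatch2")   // prints: Some(2) Some(3)
  }
}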

4. Window operations

    

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._

object WindowWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("WindowWordCount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Create the StreamingContext
    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint(".")

    // Receive the data
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_ONLY_SER)
    val words = lines.flatMap(_.split(","))

    // Window operation.
    // The second-to-last parameter is the window length; it must be a multiple of the batch
    // interval (5 seconds here), e.g. 30. The last parameter is the slide interval, also a
    // multiple of the batch interval, e.g. 10. With those values, every 10 seconds the last
    // 30 seconds of data are processed - here, a word count.
    val wordCounts = words.map(x => (x, 1)).reduceByKeyAndWindow((a: Int, b: Int) => a + b,
      Seconds(args(2).toInt), Seconds(args(3).toInt))
    // Optimized variant: add the batches that entered the window and subtract the batches
    // that slid out of it, instead of recomputing the whole window each time:
    // val wordCounts = words.map(x => (x, 1)).reduceByKeyAndWindow(_ + _, _ - _,
    //   Seconds(args(2).toInt), Seconds(args(3).toInt))
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
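With the incremental (inverse-function) variant, keys whose count has fallen to zero stay in the state unless they are pruned. A sketch of that form, as a drop-in replacement for the wordCounts line above (it reuses the same words and args; the filterFunc parameter of reduceByKeyAndWindow does the pruning, and checkpointing must stay enabled as it already is here):

// Incremental window count that drops keys whose count fell to 0 (sketch):
val incrementalCounts = words.map(x => (x, 1)).reduceByKeyAndWindow(
  (a: Int, b: Int) => a + b,                       // add counts entering the window
  (a: Int, b: Int) => a - b,                       // subtract counts leaving the window
  Seconds(args(2).toInt),                          // window length
  Seconds(args(3).toInt),                          // slide interval
  filterFunc = (kv: (String, Int)) => kv._2 > 0    // keep only keys still present in the window
)
incrementalCounts.print()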

   http://blog.csdn.net/escaflone/article/details/43341275

   http://www.aboutyun.com/thread-8900-1-1.html
