spark streaming 使用socket数据来源
来源:互联网 发布:netflix ribbon源码 编辑:程序博客网 时间:2024/06/06 18:28
1.编写监听socket的模拟socket程序
2.编写SocketWordCount
3.基于状态的单词累计出现次数
4.基于窗口的单词累计出现次数
1.编写监听socket的模拟socket程序
import java.io.PrintWriter
import java.net.ServerSocket
import java.util.Random

import scala.io.Source

/**
 * Simulates a streaming data source for Spark Streaming tests: listens on a
 * TCP port and, for each connected client, sends one randomly chosen line of
 * the given file every `millisecond` interval.
 *
 * Usage: DataFlowSimulator <filename> <port> <millisecond>
 */
object DataFlowSimulator {

  // Single shared RNG; the original allocated a new Random on every call.
  private val rdm = new Random()

  /** Returns a uniformly random index in [0, length). */
  def index(length: Int): Int = rdm.nextInt(length)

  def main(args: Array[String]): Unit = {
    // Three arguments required: file path, port number, send interval (ms).
    if (args.length != 3) {
      System.err.println("Usage <filename> <port> <millisecond>")
      System.exit(-1)
    }

    // Read the file once up front; every client is served lines from this list.
    // fix: the original never closed the Source (resource leak).
    val source = Source.fromFile(args(0))
    val lines =
      try source.getLines().toList
      finally source.close()
    val fileRow = lines.length

    // Accept clients forever; each connection is handled on its own thread.
    val listener = new ServerSocket(args(1).toInt)
    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println("Got client connection from:" + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream, true)
          try {
            // fix: the original looped unconditionally, so a disconnected
            // client left the thread spinning forever. PrintWriter does not
            // throw on a broken pipe; it sets an internal error flag, which
            // we poll via checkError() to end the loop.
            var connected = true
            while (connected) {
              Thread.sleep(args(2).toLong)
              // Pick a random line each tick.
              val content = lines(index(fileRow))
              println(content)
              out.write(content + '\n')
              out.flush()
              if (out.checkError()) connected = false
            }
          } finally {
            // fix: these closes were unreachable dead code after while(true).
            out.close()
            socket.close()
          }
        }
      }.start()
    }
  }
}
2.编写SocketWordCount
package streaming

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Per-batch word count over a socket text stream: every 5 seconds, splits the
 * comma-separated text received in that batch into words, counts them, and
 * prints the result.
 *
 * Usage: SocketSparkStreaming [host] [port]
 * (defaults preserve the original hard-coded values: spark02 8089)
 */
object SocketSparkStreaming {
  def main(args: Array[String]): Unit = {
    // fix: host/port were hard-coded; allow command-line overrides while
    // keeping the original values as defaults for backward compatibility.
    val host = if (args.length >= 1) args(0) else "spark02"
    val port = if (args.length >= 2) args(1).toInt else 8089

    val conf = new SparkConf().setAppName("socketSparkStreaming").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // 5-second micro-batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))

    // Silence noisy framework logging so the printed counts are readable.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val lines = ssc.socketTextStream(host, port, StorageLevel.MEMORY_ONLY)
    // fix: renamed worlds/worldCounts -> words/wordCounts (typo).
    val words = lines.flatMap(_.split(","))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    //wordCounts.saveAsTextFiles("file:///home/spark/test/sparktest")

    ssc.start()
    ssc.awaitTermination()
  }
}
3.基于状态的单词累计出现次数
package streaming

// fix: removed copy-pasted, unused import of
// org.apache.hadoop.hdfs.server.common.Storage (StorageLevel below is the
// class actually used).
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Stateful word count over a socket text stream: accumulates each word's
 * total frequency across all batches since the job started, via
 * updateStateByKey (which requires checkpointing to be enabled).
 *
 * Usage: StateWorldCount <host> <port>
 */
object StateWorldCount {
  def main(args: Array[String]): Unit = {
    // fix: the original indexed args(0)/args(1) without validation and
    // crashed with ArrayIndexOutOfBoundsException when arguments were missing.
    if (args.length < 2) {
      System.err.println("Usage: StateWorldCount <host> <port>")
      System.exit(-1)
    }

    // Silence noisy framework logging so the printed counts are readable.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Update function: `values` holds the word's counts in the current batch,
    // `state` holds the accumulated count from all previous batches.
    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }

    val conf = new SparkConf().setAppName("stateSparkStreaming").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // 5-second micro-batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))
    // updateStateByKey requires a checkpoint directory to persist state.
    ssc.checkpoint("file:///d:/checkpoint")

    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_ONLY)
    val wordCounts = lines.flatMap(_.split(",")).map((_, 1))
    // Fold each batch's counts into the running per-word totals.
    val stateDStream = wordCounts.updateStateByKey(updateFunc)
    stateDStream.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
4.基于窗口的单词累计出现次数
package streaming

import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Windowed word count over a socket text stream: every 10 seconds, counts the
 * comma-separated words received in the last 10-second window.
 *
 * Usage: WindowWordCount [host] [port]
 * (defaults preserve the original hard-coded values: spark02 8089)
 */
object WindowWordCount {
  def main(args: Array[String]): Unit = {
    // fix: the original commented out its argument check yet still hard-coded
    // the host and port; accept optional overrides with the original values
    // as backward-compatible defaults.
    val host = if (args.length >= 1) args(0) else "spark02"
    val port = if (args.length >= 2) args(1).toInt else 8089

    // Silence noisy framework logging so the printed counts are readable.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val conf = new SparkConf().setAppName("window").setMaster("local[2]")
    // 5-second micro-batch interval; the window below is a multiple of it.
    val ssc = new StreamingContext(conf, Seconds(5))
    // The incremental reduceByKeyAndWindow form (with an inverse function)
    // requires checkpointing to be enabled.
    ssc.checkpoint("file:///H:/checkpoint")

    val lines = ssc.socketTextStream(host, port, StorageLevel.MEMORY_ONLY)
    val words = lines.flatMap(_.split(",")).map((_, 1))
    // Window operations: the first (commented) form recomputes the whole
    // window each slide; the second form below updates incrementally by
    // adding the entering batch and subtracting (_-_) the leaving one.
    //val wordCounts = words.reduceByKeyAndWindow((a:Int,b:Int)=>(a+b),Seconds(10),Seconds(10))
    val wordCounts = words.reduceByKeyAndWindow(_ + _, _ - _, Seconds(10), Seconds(10))
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
阅读全文
0 0
- spark streaming 使用socket数据来源
- Spark streaming不同数据来源(socket套接字、hdfs目录)和存储位置(hdfs、本地)的java代码
- spark streaming检查点使用
- Spark Streaming使用Kafka保证数据零丢失
- Spark Streaming使用Kafka保证数据零丢失
- Spark Streaming使用Kafka保证数据零丢失
- Spark Streaming使用Kafka保证数据零丢失
- Spark Streaming使用Kafka保证数据零丢失
- spark streaming 读取网络数据
- Spark Streaming 数据接收过程
- Spark Streaming 数据接收优化
- 大数据技术--Spark Streaming
- Spark Streaming 数据清理机制
- spark streaming 获取数据方式
- Spark Streaming通过Socket检测空气质量
- 使用 Spark Streaming 检测关键词
- Spark Streaming基本使用介绍
- spark流数据处理:Spark Streaming的使用
- Android7.0 init.rc流程分析
- 局部变量、 全局变量、 堆、 堆栈、 静态和全局
- log4j2 个性化日志名
- 02_ACS550变频器RS485Modbus通信-通信命令
- JavaWeb学习心得之JSP原理
- spark streaming 使用socket数据来源
- linux学习---进程控制(fork,vfork,popen,exec,system)
- MindManager2018中文版发布,新版新发现!
- 图解phpstorm常用快捷键
- char * 与 string 类型相互转换方法--C/C++
- matlab函数bsxfun、crossvalind、ismember,kmean
- Java并发编程:volatile关键字解析
- Java 内存溢出(java.lang.OutOfMemoryError)的常见情况和处理方式总结
- mysql时间的转换