A Compilation of Spark Examples
Spark Streaming is a near-real-time (micro-batch) stream processing framework: it processes live data with latency on the order of seconds. Storm, by contrast, is a true real-time framework with millisecond-level response times, so choosing between the two comes down to the concrete business scenario. One point worth clarifying: many people claim that Spark Streaming is unstable, loses data, or has poor transactional support, but that usually reflects not knowing how to drive Spark Streaming and Spark itself rather than a flaw in the framework. On the latency front, customized Spark builds can push Spark Streaming's delay from the second level down to 100 ms or even less.

Advantages of Spark Streaming:
1. A rich API that lets enterprises implement complex business logic quickly.
2. Data flowing into Spark Streaming can be combined with machine learning algorithms to perform machine simulation and graph computation.
3. Spark Streaming builds on Spark's excellent lineage model.

Can Spark Streaming process data one record at a time, the way Storm does? Storm handles data record by record, while Spark Streaming processes data in time-based micro-batches. The answer is that it can, at least approximately: shrinking the batch interval makes each micro-batch small enough to behave almost like per-record processing, as the sketch below shows.
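Here is a minimal sketch of that idea, assuming Spark 2.x running locally; the socket source on localhost:9999 and the object name are placeholders for illustration only:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

object TinyBatchExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("tiny-batch").setMaster("local[2]")
    // A 100 ms batch interval: each micro-batch carries very few records,
    // which approximates Storm-style per-record handling. Whether a job
    // can keep up at this interval depends on the workload and cluster.
    val ssc = new StreamingContext(conf, Milliseconds(100))
    val lines = ssc.socketTextStream("localhost", 9999)
    // Runs once per 100 ms micro-batch; prints on the executors
    lines.foreachRDD { rdd => rdd.foreach(record => println(record)) }
    ssc.start()
    ssc.awaitTermination()
  }
}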
Below is a demo that reads data from Kafka, iterates over the RDDs of the DStream with foreachRDD, and then uses Spark SQL to register each RDD as a table for analysis.
package com.sprakStream.demo

import java.util.Properties
import java.util.regex.Matcher

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

import com.sprakStream.bean.IpMapper
import com.sprakStream.util.{AppConstant, CommUtil}

object KafkaExcamle3 {

  def main(args: Array[String]): Unit = {
    println("success to Init...")

    // PostgreSQL connection settings (declared here but not used further in this demo)
    val url = "jdbc:postgresql://172.16.12.190:5432/dataex_tmp"
    val prop = new Properties()
    prop.put("user", "postgres")
    prop.put("password", "issing")

    val conf = new SparkConf().setAppName("wordcount").setMaster("local")
    val ssc = new StreamingContext(conf, Seconds(1))
    val sparkSession = SparkSession.builder().config(conf).getOrCreate()

    // Utilities is a project-local helper (assumed to be in scope): it sets up
    // logging and builds a regex that extracts fields from raw Apache log lines
    val util = Utilities
    util.setupLogging()
    val pattern = util.apacheLogPattern()

    // hostname:port of the Kafka brokers, not ZooKeeper
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> AppConstant.KAFKA_HOST,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "example",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean))

    // Kafka topics to subscribe to
    val topics = List(AppConstant.KAFKA_TOPIC).toSet
    val lines = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams)).map(_.value())

    // Keep only the lines that match the Apache log pattern
    // (the original had no else branch, which produced Unit values for non-matches)
    val spiltWorks = lines.map { x =>
      val matcher: Matcher = pattern.matcher(x)
      if (matcher.matches()) matcher.group(0) else ""
    }.filter(_.nonEmpty)

    // A sliding window over the stream: 30 seconds of data, re-evaluated every 2 seconds
    val spiltDesc = spiltWorks.window(Seconds(30), Seconds(2))

    // Iterate over each RDD of the windowed DStream with foreachRDD
    spiltDesc.foreachRDD { rdd =>
      println()
      println("================================ showtime 1: windowed stream ================================")
      println()
      // Get the singleton instance of SQLContext
      val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
      import sqlContext.implicits._
      val wordsDataFrame = rdd.map(_.split(" ")).map(x =>
        IpMapper(CommUtil.uuid(),
          x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8))).toDF()
      wordsDataFrame.registerTempTable("wordsDataFrame")
      val wordCountsDataFrame = sqlContext.sql("select * from wordsDataFrame")
      wordCountsDataFrame.show()
    }

    // Do the same for the un-windowed stream
    spiltWorks.foreachRDD { rdd =>
      println()
      println("================================ showtime 2: raw stream ================================")
      println()
      val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
      import sqlContext.implicits._
      val wordsDataFrame = rdd.map(_.split(" ")).map(x =>
        IpMapper(CommUtil.uuid(),
          x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8))).toDF()
      wordsDataFrame.registerTempTable("wordsDataFrame")
      val wordCountsDataFrame = sqlContext.sql("select * from wordsDataFrame")
      wordCountsDataFrame.show()
    }

    // Kick it off
    ssc.checkpoint("/user/root/spark/checkpoint")
    ssc.start()
    ssc.awaitTermination()
    println("KafkaExample finished.................................")
  }
}

// Lazily instantiated singleton SQLContext, shared across micro-batches
object SQLContextSingleton {

  @transient private var instance: SQLContext = _

  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
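A side note on API level: registerTempTable and SQLContext are Spark 1.x style and are deprecated as of Spark 2.0. Since the demo already creates a SparkSession, the per-RDD SQL step could be written in the 2.x idiom instead. A hedged sketch of just that step, assuming the same IpMapper case class and CommUtil helper from the project above:

spiltDesc.foreachRDD { rdd =>
  // Reuse (or lazily create) the process-wide SparkSession instead of a SQLContext singleton
  val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()
  import spark.implicits._
  val df = rdd.map(_.split(" ")).map(x =>
    IpMapper(CommUtil.uuid(), x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8))).toDF()
  df.createOrReplaceTempView("wordsDataFrame") // 2.x replacement for registerTempTable
  spark.sql("select * from wordsDataFrame").show()
}

Either way, running the job requires the Kafka 0.10 integration on the classpath, e.g. spark-submit --packages org.apache.spark:spark-streaming-kafka-0-10_2.11:2.0.0 (the version numbers are an assumption; match them to your cluster).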