spark实践——处理表
来源:互联网 发布:mmd恋爱循环数据 编辑:程序博客网 时间:2024/05/01 01:30
import org.apache.spark.HashPartitioner
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.{Column, DataFrame, Row, columnar}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.mllib.linalg.Matrix
import scala.math.Ordering
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.storage.StorageLevel

// Input dumps use \u0001 as the record delimiter and \u0002 as the field
// delimiter; the first record of each file is the header row.
@transient val conf = new Configuration
conf.set("textinputformat.record.delimiter", "\1")

val sqlContext = new HiveContext(sc)

// Regex fragments embedded into HiveQL regexp_extract(). The doubled
// backslashes survive Scala's raw triple-quoted literal and are unescaped
// once by Hive's SQL string parser, yielding \d, \. etc. in the final
// pattern — do not "fix" them to single backslashes.
val ipadrs_r = """.*((25[0-5]|2[0-4]\\d|1\\d{2}|[1-9]?\\d)($|(?!\\.$)\\.)){4}$"""
val date_r   = """.*(\\d{1,2}($|(?!:$):)){3}"""

/** Load a \u0001-delimited dump, build a all-string DataFrame from its
  * \u0002-separated header, register it as `tempTable`, and return only
  * rows with a plausible IPADDRESS and well-formed event timestamps.
  *
  * @param path      HDFS path of the raw dump
  * @param tempTable temp-table name to register (default keeps the
  *                  original behavior; pass a distinct name per file)
  */
def dataToDF(path: String, tempTable: String = "TMPTBL"): DataFrame = {
  val rawRDD = sc.newAPIHadoopFile(path, classOf[TextInputFormat],
    classOf[LongWritable], classOf[Text], conf).map(_._2.toString)
  // First record is the header: column names separated by \u0002.
  val header = sc.broadcast(rawRDD.first().split("\2"))
  // Drop the header by comparing first characters only (cheap; assumes no
  // data row starts with the header's first character — TODO confirm).
  val rowRDD = rawRDD
    .filter(_(0) != header.value(0)(0))
    .map(line => Row(line.split("\2", header.value.size): _*))
  val schema = StructType(header.value.map(StructField(_, StringType, true)))
  sqlContext.createDataFrame(rowRDD, schema).registerTempTable(tempTable)
  // Filter dirty rows: keep records whose IP and timestamps match the
  // regexes above (length thresholds reject empty/partial extracts).
  sqlContext.sql(
    s"SELECT * FROM $tempTable WHERE length(regexp_extract(IPADDRESS,'$ipadrs_r',0)) > 6 AND " +
    s"length(regexp_extract(ORIGINALEVENTTIME,'$date_r',0)) > 4 AND " +
    s"length(regexp_extract(EVENTTIME,'$date_r',0)) > 4")
}

/** Same as [[dataToDF]] but registers under TMPTBL1 so two files can be
  * loaded without clobbering each other's temp table. */
def dataToDF1(path: String): DataFrame = dataToDF(path, "TMPTBL1")

/** Derive the four DataFrames used downstream from the alarm-object and
  * similar-alarm tables:
  *   1. hourly power-down (DG) alarm counts per device,
  *   2. hourly total alarm counts per device,
  *   3. per-device count of hours with >=1 DG alarm,
  *   4. the distinct device (IP) list.
  */
def tableToDF(alarmdf1: DataFrame, similardf2: DataFrame): List[DataFrame] = {
  // BUG FIX: the original body registered the outer vals alarmdf/similardf
  // instead of its own parameters, so the arguments were silently ignored.
  alarmdf1.registerTempTable("ALARMOBJECTTBL")
  similardf2.registerTempTable("SIMILARALARMTBL")

  // ALARMINFO '2.1.5' rows are power-down alarms, flagged IS_DG = 1.
  val ponalarmTOfObjPart1 = sqlContext.sql(
    "SELECT ALARMOBJECTID,OCNAME,ORIGINALEVENTTIME as EVENTTIME,substr(EVENTTIME,0,13) as EVENTDATEHOUR," +
    "EQUIPMENTNAME,SATOTAL,IPADDRESS,1 as IS_DG " +
    "FROM ALARMOBJECTTBL WHERE ALARMINFO = '2.1.5'")
  ponalarmTOfObjPart1.registerTempTable("PON_ALARM_TBL1")

  val getPlSim = sqlContext.sql(
    "SELECT ALARMOBJECTID, OCNAME FROM PON_ALARM_TBL1 " +
    "WHERE cast(SATOTAL as int) > 0 AND cast(IS_DG as int) = 1")
  getPlSim.registerTempTable("GET_PL_SIM")

  // Similar alarms whose parent is a DG alarm inherit IS_DG = 1;
  // SATOTAL = -1 marks them as derived rather than original records.
  val ponalarmTOfObjPart2 = sqlContext.sql(
    "SELECT s.ALARMOBJECTID,s.OCNAME,s.ORIGINALEVENTTIME as EVENTTIME,substr(s.EVENTTIME,0,13) as EVENTDATEHOUR," +
    "s.EQUIPMENTNAME,-1 as SATOTAL,s.IPADDRESS,1 as IS_DG " +
    "FROM SIMILARALARMTBL s, GET_PL_SIM g " +
    "WHERE s.PARENTALARMOBJECT = g.ALARMOBJECTID AND s.OCNAME = g.OCNAME")

  // ALARMINFO '2.1.6' rows are non-DG alarms, flagged IS_DG = 0.
  val ponalarmTOfObjPart3 = sqlContext.sql(
    "SELECT ALARMOBJECTID,OCNAME,ORIGINALEVENTTIME as EVENTTIME,substr(EVENTTIME,0,13) as EVENTDATEHOUR," +
    "EQUIPMENTNAME,SATOTAL,IPADDRESS,0 as IS_DG " +
    "FROM ALARMOBJECTTBL WHERE ALARMINFO = '2.1.6'")
  ponalarmTOfObjPart3.registerTempTable("PON_ALARM_TBL2")

  val getOlSim = sqlContext.sql(
    "SELECT ALARMOBJECTID, OCNAME FROM PON_ALARM_TBL2 " +
    "WHERE cast(SATOTAL as int) > 0 AND cast(IS_DG as int) = 0")
  getOlSim.registerTempTable("GET_OL_SIM")

  // BUG FIX: this branch mirrors part2 for the non-DG case, so it must
  // join GET_OL_SIM; the original joined GET_PL_SIM (copy-paste slip).
  val ponalarmTOfObjPart4 = sqlContext.sql(
    "SELECT s.ALARMOBJECTID,s.OCNAME,s.ORIGINALEVENTTIME as EVENTTIME,substr(s.EVENTTIME,0,13) as EVENTDATEHOUR," +
    "s.EQUIPMENTNAME,-1 as SATOTAL,s.IPADDRESS,0 as IS_DG " +
    "FROM SIMILARALARMTBL s, GET_OL_SIM g " +
    "WHERE s.PARENTALARMOBJECT = g.ALARMOBJECTID AND s.OCNAME = g.OCNAME")

  // Union the four partitions into the single working alarm table.
  val pon_alarm_tbl = ponalarmTOfObjPart1
    .unionAll(ponalarmTOfObjPart2)
    .unionAll(ponalarmTOfObjPart3)
    .unionAll(ponalarmTOfObjPart4)
  pon_alarm_tbl.registerTempTable("PON_ALARM_TBL")

  val ponAllDeviceLable = sqlContext.sql(
    "SELECT DISTINCT IPADDRESS as IPADDRESS,trim(EQUIPMENTNAME) as NAME FROM PON_ALARM_TBL")
  ponAllDeviceLable.registerTempTable("PON_ALL_DEVICE_LABLE_TBL")

  val ponAllDevice = sqlContext.sql(
    "SELECT DISTINCT IPADDRESS as IPADDRESS FROM PON_ALL_DEVICE_LABLE_TBL")
  ponAllDevice.registerTempTable("PON_ALL_DEVICE_TBL")

  // Per (hour, device) counts: DG alarms only, then all alarms.
  val ponDgAlarmCount = sqlContext.sql(
    "SELECT p.EVENTDATEHOUR as EVENTDATEHOUR,p.IPADDRESS as IPADDRESS,COUNT(1) AS ALARMCOUNT " +
    "FROM PON_ALARM_TBL p WHERE IS_DG = 1 GROUP BY p.EVENTDATEHOUR,p.IPADDRESS")
  ponDgAlarmCount.registerTempTable("PON_DG_ALARM_COUNT_TBL")

  val ponAllAlaramCount = sqlContext.sql(
    "SELECT p.EVENTDATEHOUR as EVENTDATEHOUR,p.IPADDRESS as IPADDRESS,COUNT(1) AS ALARMCOUNT " +
    "FROM PON_ALARM_TBL p GROUP BY p.EVENTDATEHOUR,p.IPADDRESS")
  ponAllAlaramCount.registerTempTable("PON_ALL_ALARM_COUNT_TBL")

  // Number of distinct hours in which each device had at least one DG alarm.
  val ponDgAlarmTime = sqlContext.sql(
    "SELECT p.IPADDRESS as IPADDRESS," +
    "sum(case when cast(p.ALARMCOUNT as int) > 0 then 1 else 0 end) as ALARMTIMECOUNT " +
    "FROM PON_DG_ALARM_COUNT_TBL p GROUP BY p.IPADDRESS")
  ponDgAlarmTime.registerTempTable("PON_DG_ALALRM_TIME_TBL")

  val curDgAlarmCount  = sqlContext.sql("SELECT EVENTDATEHOUR,IPADDRESS,ALARMCOUNT FROM PON_DG_ALARM_COUNT_TBL")
  val curAllAlarmCount = sqlContext.sql("SELECT EVENTDATEHOUR,IPADDRESS,ALARMCOUNT FROM PON_ALL_ALARM_COUNT_TBL")
  val curDgAlarmTime   = sqlContext.sql("SELECT IPADDRESS,ALARMTIMECOUNT FROM PON_DG_ALALRM_TIME_TBL")
  val curAllDevice     = sqlContext.sql("SELECT IPADDRESS FROM PON_ALL_DEVICE_TBL")
  List(curDgAlarmCount, curAllAlarmCount, curDgAlarmTime, curAllDevice)
}

/** Drop rows with empty key columns and project each DataFrame down to
  * plain Scala tuples/strings for the RDD stage.
  *
  * Accepts any Seq (the original declared Array, which made the in-file
  * call `filterDfs(tableToDF(...))` — a List — fail to type-check).
  */
def filterDfs(fourDFs: Seq[DataFrame]) = {
  val a_dg_alarm_count = fourDFs(0)
    .filter("length(IPADDRESS) > 0 AND length(EVENTDATEHOUR) > 0")
    .rdd.map(e => (e(0).toString, e(1).toString, e(2).toString.toInt))
  val a_all_alarm_count = fourDFs(1)
    .filter("length(IPADDRESS) > 0 AND length(EVENTDATEHOUR) > 0")
    .rdd.map(e => (e(0).toString, e(1).toString, e(2).toString.toInt))
  val a_dg_alarm_time = fourDFs(2)
    .filter("length(IPADDRESS) > 0")
    .rdd.map(e => (e(0).toString, e(1).toString.toInt))
  val a_all_device = fourDFs(3)
    .filter("length(IPADDRESS) > 0")
    .rdd.map(e => e(0).toString)
  (a_dg_alarm_count, a_all_alarm_count, a_dg_alarm_time, a_all_device)
}

// ---------------------------------------------------------------- driver --
// Load each dump into its own temp table (the original loaded both through
// dataToDF, registering TMPTBL twice). The original also called
// sc.broadcast(...) on the DataFrame handles and discarded the result —
// that is not how broadcast joins work, so those calls were removed.
val alarmdf   = dataToDF("hdfs://10.0.0.1:9000/zh_test/alarmobjecttbl.txt")
val similardf = dataToDF1("hdfs://10.0.0.1:9000/zh_test/similaralarmtbl.txt")

val fourDFs1 = tableToDF(alarmdf, similardf)
val (a_dg_alarm_count, a_all_alarm_count, a_dg_alarm_time, a_all_device) = filterDfs(fourDFs1)

// (hour, ip) pairs keyed by hour; serialized caching since these are joined.
val a_dg_alarm_count_kv = a_dg_alarm_count.map(e => (e._1, e._2)) // hour -> DG ip
a_dg_alarm_count_kv.persist(StorageLevel.MEMORY_ONLY_SER)
val a_all_alarm_count_kv = a_all_alarm_count.map(e => (e._1, e._2)) // hour -> any ip
a_all_alarm_count_kv.persist(StorageLevel.MEMORY_ONLY_SER)
val a_dg_alarm_time_kv = a_dg_alarm_time.repartition(10) // ip -> #DG-hours
a_dg_alarm_time_kv.persist(StorageLevel.MEMORY_ONLY_SER)
val a_all_device_kv = a_all_device.repartition(10) // all device ips
a_all_device_kv.persist(StorageLevel.MEMORY_ONLY_SER)

// Join DG and all alarms on the hour, then count co-occurring hours per
// (DG ip, any ip) pair.
val dg_all_count_kv = a_dg_alarm_count_kv
  .join(a_all_alarm_count_kv, 50)
  .map(e => ((e._2._1, e._2._2), 1))
  .reduceByKey(_ + _)
dg_all_count_kv.persist(StorageLevel.MEMORY_ONLY_SER)
a_dg_alarm_count_kv.unpersist()
a_all_alarm_count_kv.unpersist()

// P(other-device alarms | this device lost power) per ip pair. If the DG
// device never lost power (denominator missing or <= 0.8) the conditional
// probability is defined as zero.
val ip_value_kv_notzero = dg_all_count_kv
  .map(e => e._1._1 -> (e._1._2, e._2))
  .leftOuterJoin(a_dg_alarm_time_kv)
  .map { e =>
    val prob = e._2._2 match {
      case Some(total) if total.toDouble > 0.8 => e._2._1._2.toDouble / total.toDouble
      case _                                   => 0.0
    }
    ((e._1, e._2._1._1), prob)
  }

def addi(i: Double, j: Double): Double = i + j

// Start from an all-zero value for every device pair (full cartesian), then
// overlay the nonzero probabilities; aggregateByKey sums 0.0 + prob.
val ip_value_kv = a_all_device_kv
  .cartesian(a_all_device_kv)
  .map(pair => (pair, 0.0))
  .union(ip_value_kv_notzero)
  .aggregateByKey(0.0)(addi, addi)
ip_value_kv.persist(StorageLevel.MEMORY_ONLY_SER)

// Emit "ip1,ip2,value" lines, gzip-compressed, for downstream Hive loading.
// (saveAsTextFile returns Unit; the original pointlessly bound the result.)
val ip_value_kv_hive = ip_value_kv.map(e => Array(e._1._1, e._1._2, e._2).mkString(","))
ip_value_kv_hive.saveAsTextFile("hdfs://10.0.0.1:9000/zh_test/result", classOf[GzipCodec])
0 0
- spark实践——处理表
- spark实践
- Spark修炼之道(高级篇)——Spark源码阅读:第十二节 Spark SQL 处理流程分析
- Spark修炼之道(高级篇)——Spark源码阅读:第十二节 Spark SQL 处理流程分析
- 可扩展机器学习——Spark分布式处理
- spark点点滴滴 —— 运行scala任务异常处理
- 基于hadoop与spark的大数据分析实战——第一章 Hadoop部署与实践
- Spark实践-日志查询
- Spark实践-日志查询
- Spark-Streaming处理Kafka数据——封装成对象处理
- Flex实践——事件监听与处理
- 第十四周上机实践项目——函数处理
- Flex实践——事件监听与处理
- 基于hadoop搜索引擎实践——在线处理(六)
- 《C语言及程序设计》实践参考——成绩处理
- C++实践参考——处理C++源代码的程序
- C++实践参考——用二进制文件处理学生成绩
- C#图像处理实践——图片不同提取
- CocoaPods的安装与使用说明
- HTML5视频播放器VideoJS使用附【源码及示例】- 兼容IE
- vs2013快捷键大全
- ceshi
- 关于androidpn消息推送客户端锁屏状态,无法发送心跳包解决方案
- spark实践——处理表
- Linux系统基础命令(转)
- 在 NetBeans IDE 中使用 Git 支持
- 文件搜索find
- logback
- ios8 share extension 分享扩展 问题小结
- Anroid API Guides 翻译学习——Introduction to Android(android 引入介绍)
- learn route
- 备忘录模式