Spark log parsing and formatting
The IP database can be downloaded here: http://www.ipip.net/download.html
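The code below imports an IPInfo.IP helper that is not included in the post; it wraps the downloaded 17monipdb.dat database. The following is a hypothetical stub showing only the contract the parser relies on: the method name mains, the static load, and the bracketed return format are all inferred from how logprovincecity and main use them, not taken from the original.

// Hypothetical stub for the IPInfo.IP helper used below (not part of the original post).
// Inferred contract: IP.load() reads 17monipdb.dat once, and mains(ip) returns a
// bracketed, comma-separated region string such as "[province,city,country]".
package IPInfo

class IP {
  def mains(ip: String): String = {
    // The real implementation would look the address up in the loaded database;
    // this stub only illustrates the expected return shape.
    "[province,city,country]"
  }
}

object IP {
  // Loads the ipip.net database file into memory (no-op in this stub).
  def load(path: String): Unit = ()
}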
182.146.100.97 - 3 [03/Jan/2017:23:30:01 +0800] "GET http://7xna64.com2.z0.glb.qiniucdn.com/Fq9M_Gn0RRWy9eprb0T0CAdrybv3.jpg?imageView2/2/w/1080/h/1920&e=1483592399&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:HJPKZifauy-LOmjJgA5F1uG9ibs= HTTP/1.1" 200 219736 "-" "Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+NX549J+Build/MMB29M)"
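For reference, here is a minimal sketch (not from the original post) of how such a line splits into the fields that the parses() method in the code below reads; the URL is shortened here only for readability, and the field positions match the real sample line above.

// Minimal sketch: which space-separated fields the parser reads from one access-log line.
object LogLineFieldsDemo {
  def main(args: Array[String]): Unit = {
    val line = "182.146.100.97 - 3 [03/Jan/2017:23:30:01 +0800] " +
      "\"GET http://example.com/a.jpg?x=1 HTTP/1.1\" 200 219736 \"-\" " +
      "\"Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+NX549J+Build/MMB29M)\""
    val f = line.split(" ")
    println(f(0)) // client IP              -> 182.146.100.97
    println(f(2)) // "restime" in Record    -> 3
    println(f(3)) // timestamp (open)       -> [03/Jan/2017:23:30:01
    println(f(6)) // requested URL          -> http://example.com/a.jpg?x=1
    println(f(8)) // HTTP status code       -> 200
    println(f(9)) // response body size     -> 219736
    // The user agent is the last quoted field; parses() extracts it with lastIndexOf:
    val ua = line.substring(line.lastIndexOf(" \"") + 2, line.lastIndexOf("\""))
    println(ua)   // -> Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+NX549J+Build/MMB29M)
  }
}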
Code example
import java.security.MessageDigest
import java.text.SimpleDateFormat
import java.util.{Locale, Properties}

import IPInfo.IP
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}

/**
 * Created by sicong on 2017/4/19.
 * Sample log line:
 * 182.146.100.97 - 3 [03/Jan/2017:23:30:01 +0800] "GET http://7xna64.com2.z0.glb.qiniucdn.com/Fq9M_Gn0RRWy9eprb0T0CAdrybv3.jpg?imageView2/2/w/1080/h/1920&e=1483592399&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:HJPKZifauy-LOmjJgA5F1uG9ibs= HTTP/1.1" 200 219736 "-" "Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+NX549J+Build/MMB29M)"
 */
object paseLogData {

  val prop = new Properties()
  prop.put("user", "root")
  prop.put("password", "")

  case class Record(user: String, ip: String, country: String, province: String, city: String,
                    restime: Int, time: Long, code: Int, size: Long,
                    firm: String, device: String, rom: String, ke: String)
  case class Devices(city: String, num: Int, device: String)
  case class Ipmap(ip: String, provinceCode: Int, cityCode: Int, province: String, city: String)
  case class CityFlow(city: String, flow: Long)
  case class StatusCode(code: Int, num: Int)
  case class ThreadCache(dateParser: SimpleDateFormat, sha1Digester: MessageDigest)

  val threadSafeCache = new ThreadLocal[ThreadCache]()
  val Iphelpk = new IP()

  def getIpInfohga(Str: String): String = {
    Iphelpk.mains(Str)
  }

  // Main entry point
  def logbegin(): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Spark SQL Example").master("local[4]")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()
    // readsp(spark)
    parseLog("/Users/sicong/Downloads/yitianyike.txt", spark)
  }

  def getNeedParseLog(): Array[String] = {
    // TODO
    // 1. current time minus 7 hours; 2. current time minus 12 hours;
    // use 1 and 2 as the time window and list the log files that fall inside it;
    // compare that list with recent processing records, parse any log that has not
    // been processed yet, and mark it as processed on success
    Array("/Users/Simon/Downloads/7xna64.com2.z0.glb.qiniucdn.com_2017-01-03-23_0602")
  }

  def logdevicesMysql(kk: Dataset[Devices], s: String): Unit = {
    val prop = new Properties()
    prop.put("user", "root")
    prop.put("password", "")
    kk.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", s, prop)
  }

  // Aggregate traffic (in KB) per city and append it to MySQL
  def CizeFlowStatus(logrdd: RDD[Record], sparkSession: SparkSession) = {
    import sparkSession.implicits._
    val dataOfFlow = logrdd.map(x => (x.city, x.size)).groupByKey().map(x => (x._1, (x._2.sum.toDouble / 1024).round))
    val logMysqldata = dataOfFlow.flatMap(x => Seq(CityFlow(x._1, x._2))).toDS()
    logMysqldata.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", "test1.CityFlow", prop)
  }

  // Count the occurrences of each HTTP status code and append them to MySQL
  def HttpStatusCode(logrdd: RDD[Record], sparkSession: SparkSession): Unit = {
    import sparkSession.implicits._
    val logMysqldata = logrdd.map(x => (x.code, x)).groupByKey().flatMap(x => {
      Seq(StatusCode(x._1, x._2.size))
    }).toDS()
    logMysqldata.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", "test1.StatusCode", prop)
  }

  // Rank URL keys per city by request count and print the result
  def cityTopURL(logrdd: RDD[Record], sparkSession: SparkSession): Unit = {
    import sparkSession.implicits._
    logrdd.map(x => (x.ke + x.city, x)).groupByKey().map(x => (x._2.size, x._1)).sortBy(x => x, false, 1).foreach(x => println(x))
  }

  // Spark parsing entry point
  def parseLog(url: String, spark: SparkSession): Unit = {
    import spark.implicits._
    val peopleDF = spark.sparkContext
      .textFile(url)
    val logrdd = peopleDF.flatMap(line => {
      val record = parses(line)
      if (record != null) {
        Seq(record)
      } else {
        Seq()
      }
    })
    // Cache logrdd in memory: without this, every subsequent action would recompute the RDD from scratch
    logrdd.cache()
    // Traffic per city/province
    CizeFlowStatus(logrdd, spark)
    // Share of each HTTP status code
    HttpStatusCode(logrdd, spark)
    cityTopURL(logrdd, spark)
  }

  // Look up the region (province/city/country) of an IP address
  def logprovincecity(str: String): Array[String] = {
    val Iphelp = new IP()
    val data = Iphelp.mains(str)
    data.substring(1, data.length - 1).split(",")
  }

  // Parse one raw access-log line into a Record
  def parses(line: String): Record = {
    setThreadCache()
    val as = line.split(" ")
    val ip = as(0)
    val restime = as(2).toInt
    val time = parseVisitTime(as(3))
    val code = as(8).toInt
    val size = as(9).toLong
    // Extract the UA, e.g. Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+vivo+Y55A+Build/MMB29M)
    val ua = line.substring(line.lastIndexOf(" \"") + 2, line.lastIndexOf("\""))
    val region = logprovincecity(ip)
    val province = changeEncodeing(region(0))
    val city = changeEncodeing(region(1))
    val country = changeEncodeing(region(2))
    val driver = parseUa(ua)
    val firm = driver._1
    val device = driver._2
    val rom = driver._3
    val user = mixtureUser(ip, ua)
    val ke = parseToKey(as(6))
    val obj = Record(user, ip, country, province, city, restime, time, code, size, firm, device, rom, ke)
    obj
  }

  def changeEncodeing(string: String): String = {
    string
  }

  // Parse the visit time, e.g. "[03/Jan/2017:23:30:01", into a Unix timestamp in seconds
  def parseVisitTime(string: String): Long = {
    println(string)
    val timeData = string.substring(1, string.length)
    println(timeData)
    val loc = new Locale("en")
    val fm = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", loc)
    val dt2 = fm.parse(timeData)
    val dates = dt2.getTime()
    dates.toString.substring(0, dates.toString.length - 3).toLong
  }

  // Lazily initialise the per-thread date parser and SHA-1 digester
  def setThreadCache(): Unit = {
    val cache = threadSafeCache.get()
    if (cache == null) {
      val dateParser = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss ZZZZ")
      val sha1 = MessageDigest.getInstance("SHA1")
      threadSafeCache.set(ThreadCache(dateParser, sha1))
    }
  }

  private val Iphelp = new IP()

  def getIpInfo(Str: String): String = {
    Iphelp.mains(Str)
  }

  // Sample UA strings:
  // AndroidDownloadManager/5.1.1+(Linux;+U;+Android+5.1.1;+OPPO+R9+Plusm+A+Build/LMY47V)
  // Dalvik/2.1.0+(Linux;+U;+Android+5.1.1;+NX529J+Build/LMY47V)
  // Dalvik/2.1.0+(Linux;+U;+Android+5.1.1;+NX523J_V1+Build/LMY47V)
  // Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+vivo+Y55A+Build/MMB29M)
  // AndroidDownloadManager/5.1+(Linux;+U;+Android+5.1;+OPPO+R9m+Build/LMY47I)
  // The UA may also contain other values, e.g.:
  // -
  // Java/1.7.0_09
  // Go-http-client/1.1
  // VAYXXLWZIKRFDGFHPOXDNHJTDLTNBTV
  // Expected result, e.g. ("Android 6.0.1", "vivo Y55A", "Build/MMB29M")
  def parseUa(ua: String): (String, String, String) = {
    try {
      val t1 = ua.split(";").reverse
      val t2 = t1(0).split("\\+")
      (t1(1).replaceAll("\\+", " ").trim, t2.slice(0, t2.length - 1).mkString(" ").trim, t2(t2.length - 1))
    } catch {
      case e: Exception =>
        ("Error", "Error", "Error")
    }
  }

  // Build a pseudo user id from ip + ua
  def mixtureUser(ip: String, ua: String) = {
    hash(ip + ":" + ua)
  }

  def hash(s: String): String = {
    threadSafeCache.get().sha1Digester.digest(s.getBytes).map("%02x".format(_)).mkString
  }

  // Strip the scheme/host and query string from the URL to get a key
  def parseToKey(url: String) = {
    // "https://a" has at least 9 characters
    val l = url.indexOf("?", 9)
    val end = if (l > 0) l else url.length()
    url.substring(url.indexOf("/", 9) + 1, end)
  }

  def getIpInfos(Str: String): Array[String] = {
    // val hell = new hello()
    // hell.getipdata(Str).split(";")
    Array()
  }

  // Read a table back from MySQL (used for debugging)
  def readsp(spark: SparkSession): Unit = {
    import spark.implicits._
    val prop = new Properties()
    prop.put("user", "root")
    prop.put("password", "")
    val jdbcDF2 = spark.read
      .jdbc("jdbc:mysql://localhost:3306", "test1.tutorials_tbl", prop)
    jdbcDF2.foreach(x => println(x))
  }

  def main(args: Array[String]): Unit = {
    // Load the IP database
    IP.load("/Users/sicong/scalas/17monipdb.dat")
    logbegin()
  }
}
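To build and run this as a standalone job, an sbt definition roughly like the following should work; the Scala, Spark, and MySQL connector versions below are assumptions matching the era of the post, and IPInfo.IP must be added to the project sources since it is not a published artifact.

// build.sbt (sketch, versions are assumptions; adjust to your environment)
name := "log-parse"
version := "0.1"
scalaVersion := "2.11.8"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.1.0",
  "org.apache.spark" %% "spark-sql"  % "2.1.0",
  "mysql"            %  "mysql-connector-java" % "5.1.40" // JDBC driver for the MySQL writes
)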