SQLAccessLogAnalyzer in Spark SQL
1. SQLAccessLogAnalyzerSpark
package com.ibeifeng.bigdata.spark.sql

import com.ibeifeng.bigdata.spark.core.ApacheAccessLog
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkContext, SparkConf}

/**
 * Created by XuanYu on 2017/3/5.
 */
object SQLAccessLogAnalyzerSpark {

  def main(args: Array[String]) {
    // Read the configuration
    val sparkConf = new SparkConf()
    sparkConf.setAppName("SQLAccessLogAnalyzer Application").setMaster("local[2]")

    // Create the SparkContext instance
    val sc = SparkContext.getOrCreate(sparkConf)

    // Create the Spark SQL entry point: SQLContext
    val sqlContext = new SQLContext(sc)

    // This is used to implicitly convert an RDD to a DataFrame.
    import sqlContext.implicits._

    // ================================================================================
    // User-defined functions

    // Case 1: a one-to-one UDF
    sqlContext.udf.register(
      "toUpper",                            // function name
      (word: String) => word.toUpperCase()  // function body
    )

    // Case 2: round a Double value to a given number of decimal places
    sqlContext.udf.register(
      "double_scala",
      (numValue: Double, scale: Int) => {
        import java.math.BigDecimal
        val db = new BigDecimal(numValue)
        db.setScale(scale, BigDecimal.ROUND_HALF_UP).doubleValue()
      }
    )

    // Case 3: register a UDAF
    sqlContext.udf.register("sal_avg", AvgSalUDAF)
    // ================================================================================

    val logFile = "/test/access_log"
    val accessLogsRdd: RDD[ApacheAccessLog] = sc
      .textFile(logFile)                              // read file from HDFS
      .filter(ApacheAccessLog.isValidateLogLine)      // filter out invalid lines
      .map(log => ApacheAccessLog.parseLogLine(log))  // parse

    /**
     * RDD -> DataFrame
     * One way to do this conversion: build an RDD of a case class.
     */
    // Create the DataFrame
    val accessLogsDF: DataFrame = accessLogsRdd.toDF()

    // Print the schema:
    /**
     * root
     *  |-- ipAddress: string (nullable = true)
     *  |-- clientIdented: string (nullable = true)
     *  |-- userId: string (nullable = true)
     *  |-- dateTime: string (nullable = true)
     *  |-- method: string (nullable = true)
     *  |-- endpoint: string (nullable = true)
     *  |-- protocol: string (nullable = true)
     *  |-- responseCode: integer (nullable = false)
     *  |-- contentSize: long (nullable = false)
     */
    accessLogsDF.printSchema()

    /**
     * SQL statements operate on tables, so register the DataFrame as a
     * temporary table before analyzing it with SQL.
     */
    // Register the DataFrame as a table
    accessLogsDF.registerTempTable("access_log")
    // Cache the table's data in memory
    sqlContext.cacheTable("access_log")

    /**
     * Requirement 1: the average, min, and max content size of responses
     * returned from the server.
     */
    val contentSizesRow = sqlContext.sql(
      """
        |SELECT
        |  SUM(contentSize), COUNT(*), MIN(contentSize), MAX(contentSize)
        |FROM
        |  access_log
      """.stripMargin).first()
    println("Content Size Avg: %s, Min: %s, Max: %s".format(
      contentSizesRow.getLong(0) / contentSizesRow.getLong(1),
      contentSizesRow(2),
      contentSizesRow(3)
    ))
    // Content Size Avg: 7777, Min: 0, Max: 138789

    /**
     * Requirement 2: a count of each response code returned.
     */
    val responseCodeToCount: Array[(Int, Long)] = sqlContext.sql(
      """
        |SELECT
        |  responseCode, COUNT(*)
        |FROM
        |  access_log
        |GROUP BY
        |  responseCode
        |LIMIT
        |  10
      """.stripMargin).map(row => (row.getInt(0), row.getLong(1))).collect()
    println(s"""Response Code Count: ${responseCodeToCount.mkString("[", ", ", "]")}""")

    /**
     * Requirement 3: all IP addresses that have accessed this server more than N times.
     */
    val ipAddresses: Array[String] = sqlContext.sql(
      """
        |SELECT
        |  ipAddress, COUNT(*) AS total
        |FROM
        |  access_log
        |GROUP BY
        |  ipAddress
        |HAVING
        |  total > 10
        |LIMIT
        |  10
      """.stripMargin).map(row => row.getString(0)).collect()
    println(s"""IP Addresses: ${ipAddresses.mkString("[", ", ", "]")}""")

    /**
     * Requirement 4: the top endpoints requested, by count.
     */
    val topEndpoints: Array[(String, Long)] = sqlContext.sql(
      """
        |SELECT
        |  endpoint, COUNT(*) AS total
        |FROM
        |  access_log
        |GROUP BY
        |  endpoint
        |ORDER BY
        |  total DESC
        |LIMIT
        |  10
      """.stripMargin).map(row => (row.getString(0), row.getLong(1))).collect()
    println(s"""Top Endpoints: ${topEndpoints.mkString("[", ", ", "]")}""")

    // Uncache the table
    sqlContext.uncacheTable("access_log")

    // Keep the application alive so the web UI can be inspected for debugging
    Thread.sleep(10000000)

    // Stop the SparkContext
    sc.stop()
  }
}
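The UDAF AvgSalUDAF registered in case 3 above is referenced but never defined in the post. Below is a minimal sketch of what such an average aggregator could look like under Spark 1.x's UserDefinedAggregateFunction API; the buffer layout and null handling are assumptions, since the original class is not shown.

package com.ibeifeng.bigdata.spark.sql

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

// Hypothetical implementation of the AvgSalUDAF referenced above:
// computes an average by tracking a running sum and count.
object AvgSalUDAF extends UserDefinedAggregateFunction {

  // Input: a single Double column
  override def inputSchema: StructType =
    StructType(StructField("sal", DoubleType) :: Nil)

  // Aggregation buffer: running sum and row count
  override def bufferSchema: StructType = StructType(
    StructField("sum", DoubleType) :: StructField("count", LongType) :: Nil)

  // Output: the average as a Double
  override def dataType: DataType = DoubleType

  // Same input always yields the same output
  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0.0
    buffer(1) = 0L
  }

  // Fold one input row into the buffer, skipping nulls
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) {
      buffer(0) = buffer.getDouble(0) + input.getDouble(0)
      buffer(1) = buffer.getLong(1) + 1L
    }
  }

  // Combine two partial buffers (e.g. from different partitions)
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getDouble(0) + buffer2.getDouble(0)
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
  }

  // Produce the final average, guarding against division by zero
  override def evaluate(buffer: Row): Any =
    if (buffer.getLong(1) == 0L) 0.0 else buffer.getDouble(0) / buffer.getLong(1)
}

Once registered, the three functions can be combined directly in SQL against the access_log table registered above. A hypothetical query, illustrative only and not part of the original post:

    sqlContext.sql(
      """
        |SELECT
        |  toUpper(method) AS method_uc,
        |  double_scala(sal_avg(CAST(contentSize AS DOUBLE)), 2) AS avg_size
        |FROM
        |  access_log
        |GROUP BY
        |  method
      """.stripMargin).show()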
2. ApacheAccessLog
package com.ibeifeng.bigdata.spark.core

import scala.util.matching.Regex
import scala.util.matching.Regex.Match

/**
 * Sample data:
 * 1.1.1.1 - - [21/Jul/2014:10:00:00 -0800] "GET /chapter1/java/src/main/java/com/databricks/apps/logs/LogAnalyzer.java HTTP/1.1" 200 1234
 */
case class ApacheAccessLog(
    ipAddress: String,
    clientIdented: String,
    userId: String,
    dateTime: String,
    method: String,
    endpoint: String,
    protocol: String,
    responseCode: Int,
    contentSize: Long)

object ApacheAccessLog {

  // Regex for one line of an Apache access log
  val PATTERN: Regex =
    """^(\S+) (-|\S+) (-|\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r

  /**
   * Checks whether a log line matches the expected format.
   *
   * @param log a raw log line
   * @return true if the line can be parsed
   */
  def isValidateLogLine(log: String): Boolean = {
    // Try to match the line against the pattern
    val res: Option[Match] = PATTERN.findFirstMatchIn(log)
    res.isDefined
  }

  /**
   * Parses a log line into the corresponding case class.
   *
   * @param log a raw log line
   * @return the parsed ApacheAccessLog
   */
  def parseLogLine(log: String): ApacheAccessLog = {
    // Match the line against the regular expression
    val res: Option[Match] = PATTERN.findFirstMatchIn(log)
    if (res.isEmpty) {
      throw new RuntimeException("Cannot parse log line: " + log)
    }
    // Extract the matched groups
    val m: Match = res.get
    ApacheAccessLog(
      m.group(1),
      m.group(2),
      m.group(3),
      m.group(4),
      m.group(5),
      m.group(6),
      m.group(7),
      m.group(8).toInt,
      m.group(9).toLong
    )
  }
}
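For reference, a quick sanity check of the parser against the sample line quoted in the comment above. This snippet is illustrative and not part of the original post:

// Hypothetical sanity check using the sample line documented above
val sample =
  """1.1.1.1 - - [21/Jul/2014:10:00:00 -0800] "GET /chapter1/java/src/main/java/com/databricks/apps/logs/LogAnalyzer.java HTTP/1.1" 200 1234"""
assert(ApacheAccessLog.isValidateLogLine(sample))
val entry = ApacheAccessLog.parseLogLine(sample)
println(s"${entry.ipAddress} ${entry.method} ${entry.endpoint} ${entry.responseCode} ${entry.contentSize}")
// Expected: 1.1.1.1 GET /chapter1/java/src/main/java/com/databricks/apps/logs/LogAnalyzer.java 200 1234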