SQLAccessLogAnalyzer in Spark SQL

1. SQLAccessLogAnalyzerSpark
package com.ibeifeng.bigdata.spark.sql

import com.ibeifeng.bigdata.spark.core.ApacheAccessLog
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by XuanYu on 2017/3/5.
 */
object SQLAccessLogAnalyzerSpark {

  def main(args: Array[String]): Unit = {
    // Read configuration
    val sparkConf = new SparkConf()
    sparkConf.setAppName("SQLAccessLogAnalyzer Application").setMaster("local[2]")

    // SparkContext
    val sc = SparkContext.getOrCreate(sparkConf)

    /**
     * SQLContext is the entry point of Spark SQL
     */
    val sqlContext = new SQLContext(sc)

    // this is used to implicitly convert an RDD to a DataFrame
    import sqlContext.implicits._

    // ================================================================================
    // User-defined functions

    // Example 1: define and register a UDF
    sqlContext.udf.register(
      "toUpper",                                // function name
      (word: String) => word.toUpperCase()
    )

    // Example 2: round a Double to the given number of decimal places
    sqlContext.udf.register(
      "double_scala",
      (numValue: Double, scale: Int) => {
        import java.math.BigDecimal
        val db = new BigDecimal(numValue)
        db.setScale(scale, BigDecimal.ROUND_HALF_UP).doubleValue()
      }
    )

    // Example 3: register a user-defined aggregate function (UDAF)
    sqlContext.udf.register(
      "sal_avg",
      AvgSalUDAF
    )
    // ================================================================================

    val logFile = "/test/access_log"

    val accessLogsRdd: RDD[ApacheAccessLog] = sc
      .textFile(logFile)                              // read file from HDFS
      .filter(ApacheAccessLog.isValidateLogLine)      // keep only well-formed lines
      .map(log => ApacheAccessLog.parseLogLine(log))  // parse each line into a case class

    /**
     * RDD -> DataFrame
     *   one way: call toDF() on an RDD[case class]
     */
    // Create DataFrame
    val accessLogsDF: DataFrame = accessLogsRdd.toDF()

    /**
     * Print the schema:
     * root
     *  |-- ipAddress: string (nullable = true)
     *  |-- clientIdented: string (nullable = true)
     *  |-- userId: string (nullable = true)
     *  |-- dateTime: string (nullable = true)
     *  |-- method: string (nullable = true)
     *  |-- endpoint: string (nullable = true)
     *  |-- protocol: string (nullable = true)
     *  |-- responseCode: integer (nullable = false)
     *  |-- contentSize: long (nullable = false)
     */
    accessLogsDF.printSchema()

    /**
     * SQL runs against tables, so register the DataFrame as a temporary table
     * and then analyze it with SQL.
     */
    accessLogsDF.registerTempTable("access_log")

    // Cache the table in memory
    sqlContext.cacheTable("access_log")

    /**
     * Requirement 1:
     *   The average, min, and max content size of responses returned from the server.
     */
    val contentSizesRow = sqlContext.sql(
      """
        |SELECT
        |  SUM(contentSize), COUNT(*), MIN(contentSize), MAX(contentSize)
        |FROM
        |  access_log
      """.stripMargin).first()

    println("Content Size Avg: %s, Min: %s, Max: %s".format(
      contentSizesRow.getLong(0) / contentSizesRow.getLong(1),
      contentSizesRow(2),
      contentSizesRow(3)
    ))
    // Content Size Avg: 7777, Min: 0, Max: 138789

    /**
     * Requirement 2:
     *   A count of the response codes returned.
     */
    val responseCodeToCount: Array[(Int, Long)] = sqlContext.sql(
      """
        |SELECT
        |  responseCode, COUNT(*)
        |FROM
        |  access_log
        |GROUP BY
        |  responseCode
        |LIMIT
        |  10
      """.stripMargin).map(row => (row.getInt(0), row.getLong(1))).collect()

    println(
      s"""Response Code Count: ${responseCodeToCount.mkString("[", ", ", "]")}"""
    )

    /**
     * Requirement 3:
     *   All IP addresses that have accessed this server more than N times.
     */
    val ipAddresses: Array[String] = sqlContext.sql(
      """
        |SELECT
        |  ipAddress, COUNT(*) AS total
        |FROM
        |  access_log
        |GROUP BY
        |  ipAddress
        |HAVING
        |  total > 10
        |LIMIT
        |  10
      """.stripMargin).map(row => row.getString(0)).collect()

    println(
      s"""IP Addresses: ${ipAddresses.mkString("[", ", ", "]")}"""
    )

    /**
     * Requirement 4:
     *   The top endpoints requested by count.
     */
    val topEndpoints: Array[(String, Long)] = sqlContext.sql(
      """
        |SELECT
        |  endpoint, COUNT(*) AS total
        |FROM
        |  access_log
        |GROUP BY
        |  endpoint
        |ORDER BY
        |  total DESC
        |LIMIT
        |  10
      """.stripMargin).map(row => (row.getString(0), row.getLong(1))).collect()

    println(
      s"""Top Endpoints: ${topEndpoints.mkString("[", ", ", "]")}"""
    )

    // Uncache the table
    sqlContext.uncacheTable("access_log")

    /**
     * Keep the application alive so the web UI can be inspected for debugging/monitoring
     */
    Thread.sleep(10000000)

    // Stop the SparkContext
    sc.stop()
  }
}
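The code above registers a UDAF under the name sal_avg, but the AvgSalUDAF object itself is not shown in this post. Below is a minimal sketch of what such a UDAF could look like with the Spark 1.x UserDefinedAggregateFunction API, assuming it averages a single Double column by keeping a running sum and count; the field names and the sum/count logic are my assumptions, and the original implementation may differ.

package com.ibeifeng.bigdata.spark.sql

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

/**
 * Possible shape of the "sal_avg" UDAF registered above (sketch only;
 * the real AvgSalUDAF is not included in this post).
 */
object AvgSalUDAF extends UserDefinedAggregateFunction {
  // Input: a single Double column (e.g. a salary)
  override def inputSchema: StructType = StructType(
    StructField("salValue", DoubleType) :: Nil
  )

  // Aggregation buffer: running sum and count
  override def bufferSchema: StructType = StructType(
    StructField("salSum", DoubleType) ::
    StructField("salCount", LongType) :: Nil
  )

  // Result type of the aggregate
  override def dataType: DataType = DoubleType

  override def deterministic: Boolean = true

  // Initialize the buffer
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer.update(0, 0.0)
    buffer.update(1, 0L)
  }

  // Fold one input row into the buffer
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) {
      buffer.update(0, buffer.getDouble(0) + input.getDouble(0))
      buffer.update(1, buffer.getLong(1) + 1L)
    }
  }

  // Merge partial buffers from different partitions
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1.update(0, buffer1.getDouble(0) + buffer2.getDouble(0))
    buffer1.update(1, buffer1.getLong(1) + buffer2.getLong(1))
  }

  // Final result: sum / count
  override def evaluate(buffer: Row): Any = {
    if (buffer.getLong(1) == 0L) 0.0 else buffer.getDouble(0) / buffer.getLong(1)
  }
}

Once registered, these functions can be called from SQL by name like any built-in function, for example sqlContext.sql("SELECT toUpper(endpoint) FROM access_log LIMIT 5") (a hypothetical query, not part of the original post).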

2. ApacheAccessLog

package com.ibeifeng.bigdata.spark.core

import scala.util.matching.Regex
import scala.util.matching.Regex.Match

/**
 * Sample data:
 * 1.1.1.1 - - [21/Jul/2014:10:00:00 -0800] "GET /chapter1/java/src/main/java/com/databricks/apps/logs/LogAnalyzer.java HTTP/1.1" 200 1234
 */
case class ApacheAccessLog(
  ipAddress: String,
  clientIdented: String,
  userId: String,
  dateTime: String,
  method: String,
  endpoint: String,
  protocol: String,
  responseCode: Int,
  contentSize: Long
)

object ApacheAccessLog {

  // regex describing one line of the Apache access log
  val PARTTERN: Regex = """^(\S+) (-|\S+) (-|\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r

  /**
   * Check whether a line matches the expected log format
   * @param log one line of the log file
   * @return true if the line can be parsed
   */
  def isValidateLogLine(log: String): Boolean = {
    // try to match the line against the pattern
    val res: Option[Match] = PARTTERN.findFirstMatchIn(log)
    // valid if a match was found
    !res.isEmpty
  }

  /**
   * Parse one line of the log file into the corresponding case class
   *
   * @param log one line of the log file
   * @return the parsed ApacheAccessLog
   */
  def parseLogLine(log: String): ApacheAccessLog = {
    // match the line with the regular expression
    val res: Option[Match] = PARTTERN.findFirstMatchIn(log)
    // fail fast on malformed lines
    if (res.isEmpty) {
      throw new RuntimeException("Cannot parse log line: " + log)
    }
    // extract the captured groups
    val m: Match = res.get
    // map the nine groups onto the case class fields
    ApacheAccessLog(
      m.group(1),
      m.group(2),
      m.group(3),
      m.group(4),
      m.group(5),
      m.group(6),
      m.group(7),
      m.group(8).toInt,
      m.group(9).toLong
    )
  }
}
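To sanity-check the regex, the sample line quoted in the comment above can be fed through isValidateLogLine and parseLogLine directly, without Spark. A minimal sketch follows; the ApacheAccessLogTest object name is mine, and the expected values in the comments are read off the sample line.

import com.ibeifeng.bigdata.spark.core.ApacheAccessLog

object ApacheAccessLogTest {
  def main(args: Array[String]): Unit = {
    // the sample line from the comment in ApacheAccessLog
    val line = "1.1.1.1 - - [21/Jul/2014:10:00:00 -0800] " +
      "\"GET /chapter1/java/src/main/java/com/databricks/apps/logs/LogAnalyzer.java HTTP/1.1\" 200 1234"

    // the line matches the pattern
    println(ApacheAccessLog.isValidateLogLine(line))   // true

    // the captured groups are mapped onto the case class fields
    val log = ApacheAccessLog.parseLogLine(line)
    println(log.ipAddress)      // 1.1.1.1
    println(log.method)         // GET
    println(log.responseCode)   // 200
    println(log.contentSize)    // 1234
  }
}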
