动态正则匹配

来源:互联网 发布:美国警察知乎 编辑:程序博客网 时间:2024/06/16 07:11

需求:

1、写一个动态正则;

2、只要写出日志的 Schema,就可以自动生成匹配该日志的正则表达式。

package com.donews.util

import java.util.regex.Pattern

import scala.collection.mutable.ArrayBuffer

/**
  * Created by yuhui on 2016/8/5.
  */

/**
  * Derives a log-parsing regular expression from a space-separated log schema.
  *
  * Example log line for schema version 1:
  * {{{
  * www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" China 22 Beijing
  * }}}
  * Schema version 1:
  * {{{
  * "$domain $ip - $remote_user [$timestamp] \"$http_url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" $country $region $city"
  * }}}
  *
  * Example log line for schema version 2:
  * {{{
  * www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"
  * }}}
  * Schema version 2:
  * {{{
  * "$domain $ip - $remote_user [$timestamp] \"$http_url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$e_ip\" \"$country\" \"$region\" \"$city\""
  * }}}
  *
  * Example log line for schema version 3:
  * {{{
  * www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" "http://www.donews.com/media/201408/2834414.shtm" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"
  * }}}
  * Schema version 3:
  * {{{
  * $domain $http_x_forwarded_for - $remote_user [$timestamp] "$http_url" "$url" $status $body_bytes_sent "$http_referer" "$http_user_agent" "$e_ip" "$country" "$region" "$city"
  * }}}
  */
object DynamicRegex {

  /** Schema last assigned by `main`; left public and mutable for existing callers. */
  var cmd = ""

  /** Pattern text that `lineToGroup` compiles on every call; `main` sets it from `tran`. */
  var regex = ""

  // A token made only of non-word characters, e.g. "-".
  private val PunctuationOnly = Pattern.compile("^(\\W+)$")
  // Two placeholders joined by a non-word separator inside one token, e.g. "$a:$b".
  private val MultiPlaceholder = Pattern.compile("(\\$\\w+)(\\W+)(\\$\\w+)(.*)")
  // A placeholder wrapped in punctuation, e.g. "[$timestamp]" or "\"$url\"".
  private val WrappedPlaceholder = Pattern.compile("(\\W+)(\\$\\w+)(\\W+)")

  /**
    * Translates a schema into an anchored regex with one capturing group per token.
    *
    * The schema is split on single spaces and the per-token fragments are joined
    * with `\s`. An empty token (produced by consecutive or leading spaces)
    * contributes an empty fragment, so the number of `\s` separators mirrors the
    * number of spaces in the schema. An empty schema yields `^$`.
    *
    * @param cmd the schema, e.g. `$ip - [$timestamp] "$url"`
    * @return the regex source text, starting with `^` and ending with `$`
    */
  def tran(cmd: String): String =
    cmd.split(" ").map(tokenToGroup).mkString("^", "\\s", "$")

  /** Maps one schema token to its regex fragment (empty for an empty token). */
  private def tokenToGroup(key: String): String = {
    val dollar = key.indexOf("$")
    if (key.isEmpty) ""
    else if (PunctuationOnly.matcher(key).find()) "(\\W+)"
    // No placeholder at all: match the token text literally instead of crashing.
    else if (dollar < 0) "(" + Pattern.quote(key) + ")"
    else if (dollar == 0) leadingPlaceholderGroup(key)
    else wrappedPlaceholderGroup(key)
  }

  /** Fragment for a token that starts with `$`; several placeholders share one group. */
  private def leadingPlaceholderGroup(key: String): String = {
    val segments = key.split("\\$").length - 1
    if (segments < 2) "([\\S]+)"
    else {
      val m = MultiPlaceholder.matcher(key)
      // The trailing (.*) consumes the rest of the token, so the first match decides.
      val separator = if (m.find()) m.group(2) else ""
      // Any single character of the separator may sit between two segments.
      val joiner = if (separator.isEmpty) "" else "[" + escapeInCharClass(separator) + "]"
      Seq.fill(segments)("[\\S]+").mkString("(", joiner, ")")
    }
  }

  /** Fragment for a token whose placeholder is preceded by other characters. */
  private def wrappedPlaceholderGroup(key: String): String = {
    val m = WrappedPlaceholder.matcher(key)
    if (m.find()) "(" + escape(m.group(1)) + ".+" + escape(m.group(3)) + ")"
    // Shape not recognised: still emit a group so groups stay aligned with tokens.
    else "([\\S]+)"
  }

  /** Backslash-escapes the characters that are special inside a `[...]` class. */
  private def escapeInCharClass(chars: String): String =
    chars.map(c => if ("\\[]^-&".indexOf(c.toInt) >= 0) "\\" + c else c.toString).mkString

  /**
    * Backslash-escapes a string for literal use in a regex.
    *
    * Double quotes, ASCII letters and ASCII digits are copied unchanged: a
    * backslash before a letter or digit would form an escape construct or a
    * back-reference instead of a literal. Every other character gets a leading
    * backslash.
    *
    * @param original the literal text
    * @return the escaped text
    */
  def escape(original: String): String = {
    val out = new StringBuilder(original.length * 2)
    original.foreach { c =>
      val asciiAlphanumeric = c < 128 && Character.isLetterOrDigit(c)
      if (c != '"' && !asciiAlphanumeric) out.append('\\')
      out.append(c)
    }
    out.toString
  }

  /**
    * Applies the current `regex` to a log line and collects the captured groups.
    *
    * @param line one log line
    * @return groups 1..groupCount of every match, in order; empty when nothing matches
    */
  def lineToGroup(line: String): ArrayBuffer[String] = {
    val groups = ArrayBuffer[String]()
    val m = Pattern.compile(regex).matcher(line)
    while (m.find()) {
      for (i <- 1 to m.groupCount()) {
        groups += m.group(i)
      }
    }
    groups
  }

  /** Demo: builds the regex for schema version 3, prints it, then prints each group of a sample line. */
  def main(args: Array[String]): Unit = {
    cmd = "$domain $http_x_forwarded_for - $remote_user [$timestamp] \"$http_url\" \"$url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$e_ip\" \"$country\" \"$region\" \"$city\""
    regex = tran(cmd)
    println(regex)
    val log = "www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] \"GET /media/201408/2834414.shtm HTTP/1.1\" \"http://www.donews.com/media/201408/2834414.shtm\" 200 11296 \"-\" \"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)\" \"-\" \"China\" \"22\" \"Beijing\""
    lineToGroup(log).foreach(x => println(x))
  }
}

输出结果:

^([\S]+)\s([\S]+)\s(\W+)\s([\S]+)\s(\[.+\])\s(".+")\s(".+")\s([\S]+)\s([\S]+)\s(".+")\s(".+")\s(".+")\s(".+")\s(".+")\s(".+")$
www.donews.com
123.125.71.72
-
-
[28/Nov/2016:11:08:50 +0800]
"GET /media/201408/2834414.shtm HTTP/1.1"
"http://www.donews.com/media/201408/2834414.shtm"
200
11296
"-"
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
"-"
"China"
"22"
"Beijing"
0 0