垃圾数据的整理

来源:互联网 发布:安卓mac一键修改软件 编辑:程序博客网 时间:2024/05/16 11:43

要处理的部分文本内容格式如下:

Jul 12 09:28:17 mx1 spamd[2808]: spamd: identified spam (29.8/7.5) for (unknown):65534 in 2.4 seconds, 6104 bytes.
Jul 12 09:28:26 mx1 spamd[2808]: spamd: clean message (5.6/7.5) for (unknown):65534 in 3.8 seconds, 6112 bytes.
Jul 12 09:28:26 mx1 spamd[3230]: spamd: clean message (5.6/7.5) for (unknown):65534 in 3.8 seconds, 6108 bytes.
Jul 12 09:28:26 mx1 spamd[2808]: spamd: clean message (-96.4/7.5) for (unknown):65534 in 0.5 seconds, 6035 bytes.
Jul 12 09:28:27 mx1 spamd[2808]: spamd: clean message (-96.4/7.5) for (unknown):65534 in 0.5 seconds


将要被调用的脚本文件 statistic_spam.awk 内容如下
BEGIN {
    # Date stamped on the summary row: the day the script runs, formatted as
    # YYYY-MM-DD. It is not parsed from the log lines. With no timestamp
    # argument, gawk's strftime() formats the current time.
    today = strftime("%Y-%m-%d")
}

# Legitimate (non-spam) mail: count the line in its 5-minute slot.
# In the sample log format $3 is the HH:MM:SS timestamp; the slot key is
# "HH:MM" with the minute rounded down to a multiple of 5.
/clean message/ {
    hour = substr($3, 1, 2)
    slot_minute = int(substr($3, 4, 2) / 5) * 5
    count_minute_clean[sprintf("%s:%02d", hour, slot_minute)]++
}

# Spam: count the line in its 5-minute slot.
# The slot key is "HH:MM" taken from $3 (HH:MM:SS in the sample log format),
# with the minute rounded down to a multiple of 5.
/identified spam/ {
    spam_slot = sprintf("%s:%02d", substr($3, 1, 2), int(substr($3, 4, 2) / 5) * 5)
    count_minute_spam[spam_slot]++
}

# Lines that report a scan duration ("in <digits>.<digits> seconds").
# In the sample log format $13 is that duration in seconds.
/in [0-9]*\.[0-9]* seconds/ {
    # Per-slot accumulators: summed duration and number of timed lines.
    slot = sprintf("%s:%02d", substr($3,1,2), int(substr($3,4,2)/5)*5)
    total[slot] += $13
    count[slot]++

    # Duration histogram:
    #   count_time[0]: at most 0.5 s
    #   count_time[1]: above 0.5 s, at most 4 s
    #   count_time[2]: everything else
    if ($13 <= 0.5)
        count_time[0]++
    else if ($13 <= 4)
        count_time[1]++
    else
        count_time[2]++
}

END {
    # Fold the per-slot counters into overall totals. Only slots that appear
    # in total[] (slots with at least one timed line) are visited.
    for (slot in total)
    {
        # Per-slot report, kept disabled:
        # print slot,
        #       count_minute_spam[slot], sprintf("%2.2f", count_minute_spam[slot] / count[slot] * 100),
        #       count_minute_clean[slot],
        #       count[slot],
        #       total[slot] / count[slot]

        total_count_spam  += count_minute_spam[slot]
        total_count_clean += count_minute_clean[slot]
    }
    total_count = total_count_clean + total_count_spam

    # Columns of the single summary row printed below:
    # | date | mails scanned | clean mails | spam mails | time <= 0.5 s | time <= 4 s | time > 4 s |
    printf "| %10s | %12d | %12d | %12d | %17d | %15d | %15d |\n",
           today,
           total_count, total_count_clean, total_count_spam,
           count_time[0], count_time[1], count_time[2]
}


最后执行的命令如下:

## Runs statistic_spam.awk over /var/log/maillog and appends (>>) the row it
## prints to the report file. The leading "# " below is the root shell prompt.
# /usr/local/bin/gawk -f statistic_spam.awk /var/log/maillog >> /tmp/KevinShell/statistic/output/statistic_spam.log.201107


看一下输出结果(以上面 5 行示例日志作为输入、在 2011-07-12 运行时,脚本输出如下一行):

| 2011-07-12 |            5 |            4 |            1 |                 2 |               3 |               0 |




将以上输出结果插入到数据库 Statistic 的 Spam 表中:

## Inserts the summary row printed by statistic_spam.awk into Statistic.Spam.
## The leading "# " on the command line below is the root shell prompt.
## The sed script turns "| date | n | ... |" into "'date',n,...": it strips
## the bars, trims both ends, joins the fields with commas and single-quotes
## the first field. The quotes matter: an unquoted 2011-07-12 is evaluated by
## MySQL as the subtraction 2011 - 7 - 12 instead of being read as a date.
# /usr/local/mysql/bin/mysql -uroot -pPASSWD -hlocalhost -DStatistic -e "insert into Spam(Date,Total,Ham,Spam,Lower1s,Lower4s,Greater4s) values($(gawk -f statistic_spam.awk /var/log/maillog | sed -e 's/|//g' -e 's/^ *//' -e 's/ *$//' -e 's/  */,/g' -e "s/^[^,]*/'&'/"))"








原创粉丝点击