进程监控脚本

来源:互联网 发布:云上贵州大数据公司 编辑:程序博客网 时间:2024/06/11 16:37
#!/bin/bash
#######################################
# Print the absolute path of the directory that contains this script.
# If $0 is a symbolic link, the chain of links is followed first so the
# result is the directory of the real script file, not of the link.
# Globals:   none modified (in particular the HOME environment variable is
#            left untouched)
# Arguments: none (reads $0)
# Outputs:   the directory path on stdout
# Returns:   0 on success, non-zero if the directory cannot be entered
#######################################
GetHome() {
  local prg=$0
  local listing target prg_dir

  # Follow symlinks one hop at a time. `ls -ld` prints "... name -> target";
  # this form is kept (instead of readlink) because it works on every Unix.
  while [ -h "$prg" ]; do
    listing=$(ls -ld -- "$prg")
    # Everything after the last "-> " is the link target.
    target=${listing##*-> }
    case "$target" in
      /*) prg=$target ;;
      # A relative target is relative to the directory holding the link.
      *) prg=$(dirname -- "$prg")/$target ;;
    esac
  done

  prg_dir=$(dirname -- "$prg")
  # Subshell keeps the caller's working directory unchanged; the path is
  # printed by pwd directly, so embedded whitespace is preserved verbatim.
  (cd "$prg_dir" > /dev/null && pwd)
}

# Resolve the application home (the directory this script lives in) and run
# from there; every log path and the hosts file are located relative to it.
APP_HOME=$(GetHome)
if [[ -z "$APP_HOME" ]] || ! cd "$APP_HOME"; then
  printf 'cannot change to script directory: %s\n' "$APP_HOME" >&2
  exit 1
fi
# All log files live under logs/; create it up front so that appending to
# them does not fail on a fresh installation.
mkdir -p "$APP_HOME/logs" || exit 1
# Log of successful checks.
FILE_LOG="$APP_HOME/logs/log.log"
# Error log.
ERROR_LOG="$APP_HOME/logs/error.log"
# Body of the alert e-mail that is waiting to be sent.
EMAIL_LOG="$APP_HOME/logs/email.log"
# Archive of alert e-mails that have already been sent.
EMAIL_LOG_BAK="$APP_HOME/logs/email.bak.log"
# Directory in which the hosts file is looked up (same as APP_HOME after cd).
path=$(pwd)
#######################################
# Print the first `ps -ef` line that belongs to a monitored process.
# Lines containing "grep" or "stop" are ignored, as in the original filter.
# Arguments: $1 - owning user; must equal the first ps column exactly
#            $2 - process name; matched as a literal substring of the line
# Inputs:    `ps -ef` output on stdin
# Outputs:   the matching line with whitespace runs collapsed to single
#            spaces, or nothing when no line matches
#######################################
FindProcessLine() {
  awk -v user="$1" -v proc="$2" '
    $1 == user && index($0, proc) && !index($0, "grep") && !index($0, "stop") {
      $1 = $1   # rebuild the record so fields are joined by single spaces
      print
      exit
    }'
}

# The hosts file lives in the same directory as this script; see the notes
# below the script for its content. Each non-blank line has the form
#   host-user-processName-expectedPid
# Fields are split on '-', so the first three must not contain a hyphen.
hosts_file="$path/hosts"
if [[ ! -r "$hosts_file" ]]; then
  printf 'hosts file not readable: %s\n' "$hosts_file" >&2
else
  # The list is read on fd 3 rather than stdin: ssh reads from stdin and
  # would otherwise swallow the remaining entries after the first host.
  while read -r line <&3 || [[ -n "$line" ]]; do
    # Skip blank lines and '#' comment lines.
    [[ -z "$line" || "$line" == \#* ]] && continue
    IFS=- read -r hostName userName processName ppid _ <<< "$line"
    rptime=$(date "+%Y-%m-%d %H:%M:%S")

    # Fetch the remote process table once; both the PID and the process
    # details reported in the alert come from the same matching line.
    psOutput=$(ssh "$hostName" ps -ef)
    sshStatus=$?
    if (( sshStatus != 0 )); then
      # The "stopped" alert is still raised for this entry; this record lets
      # an operator tell an unreachable host from a dead process.
      printf '%s - %s - ssh/ps failed with status %d\n' \
        "$rptime" "$hostName" "$sshStatus" >> "$ERROR_LOG"
    fi
    processInfo=$(FindProcessLine "$userName" "$processName" <<< "$psOutput")
    # Second column of a ps -ef line is the PID; empty when nothing matched.
    read -r _ pid _ <<< "$processInfo"

    if [[ -z "$pid" ]]; then
      # No matching process: report it as stopped.
      content="大家好:${hostName}上面的${processName}进程,在${rptime}停止运行了,请相关人员查看!"
      printf '%s\n' "$content" >> "$EMAIL_LOG"
    elif [[ "$pid" != "$ppid" ]]; then
      # Running under a PID other than the recorded one: it was restarted.
      content="大家好:${hostName}上面的${processName}进程,在${rptime}之前重启过,原来进程号:${ppid},现在进程号:${pid},请相关人员查看,并将监控脚本中的${hostName}-${userName}-${processName}-${ppid}修改为${hostName}-${userName}-${processName}-${pid}!进程信息:${processInfo}"
      printf '%s\n' "$content" >> "$EMAIL_LOG"
    else
      printf '%s - %s - %s - %s-pid:%s Check Ok.\n' \
        "$rptime" "$hostName" "$userName" "$processName" "$pid" >> "$FILE_LOG"
    fi
  done 3< "$hosts_file"
fi

# Space-separated list of alert recipients.
RECEIVERS="515256@qq.com cyxinda@163.com"

#######################################
# Mail the pending alerts, then archive them and empty the pending file.
# Does nothing when the pending file is missing or empty. When mailx fails
# the pending file is left untouched so the alerts go out on the next run.
# Globals:   EMAIL_LOG (read, truncated), EMAIL_LOG_BAK (appended),
#            RECEIVERS (read), LANG (exported)
# Arguments: none
# Returns:   0 when nothing was pending or the mail was handed to mailx,
#            otherwise the exit status of mailx
#######################################
SendPendingAlerts() {
  local -a recipients
  local status

  [[ -s "$EMAIL_LOG" ]] || return 0

  # Split the recipient list into one argument per address.
  read -r -a recipients <<< "$RECEIVERS"
  # The alert text is Chinese; mailx is run under a UTF-8 locale.
  export LANG=zh_CN.UTF-8
  mailx -s "some process of hadoop is down..." "${recipients[@]}" \
    -- -f mon.tom-ora@fone.net.cn < "$EMAIL_LOG"
  status=$?
  if (( status != 0 )); then
    printf 'mailx failed with status %d; alerts kept in %s\n' \
      "$status" "$EMAIL_LOG" >&2
    return "$status"
  fi

  cat -- "$EMAIL_LOG" >> "$EMAIL_LOG_BAK"
  : > "$EMAIL_LOG"
}

SendPendingAlerts









备注1:hosts 文件内容(与脚本放在同一目录下):
格式:主机名-进程所属用户-进程名称-进程号
namenode1-hadoop-JobHistoryServer-2337
namenode1-hadoop-ResourceManager-2581
namenode1-hadoop-JournalNode-1842
namenode1-hadoop-NameNode-1121
namenode2-hadoop-NameNode-20404
namenode2-hadoop-JournalNode-20622
datanode1-hadoop-JournalNode-46540
datanode1-hadoop-NodeManager-46736
datanode1-hadoop-DataNode-46220
datanode4-hadoop-DataNode-42854
datanode4-hadoop-NodeManager-9680
datanode4-hadoop-JournalNode-42976
datanode5-hadoop-DataNode-24849
datanode5-hadoop-NodeManager-2413
datanode5-hadoop-JournalNode-24529
datanode6-hadoop-DataNode-34364
datanode6-hadoop-NodeManager-20315
datanode7-hadoop-DataNode-24874
datanode7-hadoop-NodeManager-21257


邮件格式一(进程停止时):
大家好:datanode4上面的NodeManager进程,在2014-07-24 10:15:05停止运行了,请相关人员查看!
大家好:datanode5上面的NodeManager进程,在2014-07-24 10:15:06停止运行了,请相关人员查看!
大家好:datanode6上面的NodeManager进程,在2014-07-24 10:15:07停止运行了,请相关人员查看!

邮件格式二(进程重启后):
大家好:datanode4上面的NodeManager进程,在2014-07-24 10:20:05之前重启过,原来进程号:33711,现在进程号:9650,请相关人员查看,并将监控脚本中的datanode4-hadoop-NodeManager-33711修改为datanode4-hadoop-NodeManager-9650!进程信息:hadoop 9650 1 22 10:16 pts/0 00:00:43 /usr/local/jdk1.7.0_51/bin/java -Dproc_nodemanager -Xmx1000m -Dhadoop.log.dir=/opt/hadoop.....
大家好:datanode5上面的NodeManager进程,在2014-07-24 10:20:08之前重启过,原来进程号:32960,现在进程号:2313,请相关人员查看,并将监控脚本中的datanode5-hadoop-NodeManager-32960修改为datanode5-hadoop-NodeManager-2313!进程信息:hadoop 2313 1 29 10:15 pts/0 00:01:24 /usr/local/jdk1.7.0_51/bin/java -Dproc_nodemanager -Xmx1000m -Dhadoop.log.dir=/opt/hadoop.....
.
.
.
原创粉丝点击