一、
i.被监控机上监控内存使用情况,在被监控机上操作
cd /usr/local/nagios/libexec
vi check_mem.sh
#!/bin/bash
#
# check_mem.sh - Nagios/NRPE plugin that reports physical memory usage.
#
# Usage:
#   check_mem.sh -w <warnlevel> -c <critlevel>
#
# <warnlevel> and <critlevel> are positive integer percentages (no "%").
# "Used" memory is MemTotal - MemFree - Buffers - Cached.  The figures are
# read by field name from /proc/meminfo in a single pass instead of by
# column position from `free`, because the column layout of `free` differs
# between procps versions.
#
# Output: one status line followed by Nagios performance data
# (TOTAL, USED, CACHE, BUFFER in bytes).
#
# Exit status (Nagios plugin convention):
#   0 OK, 1 WARNING, 2 CRITICAL,
#   3 UNKNOWN (bad arguments or unreadable /proc/meminfo).

# Print the usage text and exit UNKNOWN, so that a misconfigured check is
# never reported as OK.
usage() {
  echo "check_mem v1.1"
  echo ""
  echo "Usage:"
  echo "check_mem.sh -w <warnlevel> -c <critlevel>"
  echo ""
  echo "warnlevel and critlevel is percentage value without %"
  echo ""
  echo "Copyright (C) 2012 Lukasz Gogolin (lukasz.gogolin@gmail.com)"
  exit 3
}

# The arguments must be exactly: -w <integer> -c <integer>.
if [[ "$1" != "-w" || "$3" != "-c" ]] \
  || ! [[ "$2" =~ ^[0-9]+$ && "$4" =~ ^[0-9]+$ ]]; then
  usage
fi

# 10# forces base 10 so that a value such as "080" is not parsed as octal.
warn_level=$((10#$2))
crit_level=$((10#$4))
if ((warn_level <= 0 || crit_level <= 0)); then
  usage
fi

if [[ ! -r /proc/meminfo ]]; then
  echo "Memory: UNKNOWN - cannot read /proc/meminfo"
  exit 3
fi

# /proc/meminfo lines look like "MemTotal:   16384256 kB"; splitting on
# ':' and blanks yields the field name and its value in kB.
mem_total_kb=0
mem_free_kb=0
mem_buffer_kb=0
mem_cache_kb=0
while IFS=' :' read -r key value _; do
  case "$key" in
    MemTotal) mem_total_kb=$value ;;
    MemFree) mem_free_kb=$value ;;
    Buffers) mem_buffer_kb=$value ;;
    Cached) mem_cache_kb=$value ;;
  esac
done </proc/meminfo

# Guard against a missing/garbled MemTotal (would divide by zero below).
if ! [[ "$mem_total_kb" =~ ^[0-9]+$ ]] || ((mem_total_kb <= 0)); then
  echo "Memory: UNKNOWN - MemTotal not found in /proc/meminfo"
  exit 3
fi

memTotal_b=$((mem_total_kb * 1024))
memBuffer_b=$((mem_buffer_kb * 1024))
memCache_b=$((mem_cache_kb * 1024))
memUsed_b=$(((mem_total_kb - mem_free_kb - mem_buffer_kb - mem_cache_kb) * 1024))
memTotal_m=$((mem_total_kb / 1024))
memUsed_m=$((memUsed_b / 1048576))
memUsedPrc=$((memUsed_b * 100 / memTotal_b))

perfdata="TOTAL=$memTotal_b;;;; USED=$memUsed_b;;;; CACHE=$memCache_b;;;; BUFFER=$memBuffer_b;;;;"

# The critical threshold is tested first so it wins when both are exceeded.
if ((memUsedPrc >= crit_level)); then
  echo "Memory: CRITICAL Total: $memTotal_m MB - Used: $memUsed_m MB - $memUsedPrc% used!|$perfdata"
  exit 2
elif ((memUsedPrc >= warn_level)); then
  echo "Memory: WARNING Total: $memTotal_m MB - Used: $memUsed_m MB - $memUsedPrc% used!|$perfdata"
  exit 1
else
  echo "Memory: OK Total: $memTotal_m MB - Used: $memUsed_m MB - $memUsedPrc% used|$perfdata"
  exit 0
fi
ii.chmod +x check_mem.sh
测试check_mem脚本是否能正常使用
./check_mem.sh -w 80 -c 90
iii.修改nrpe.cfg
vi /usr/local/nagios/etc/nrpe.cfg
增加一行:command[check_mem]=/usr/local/nagios/libexec/check_mem.sh -w 80 -c 90
iv.重启电脑
v.执行命令启动nrpe:/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d
二、在监控机上添加commands.cfg配置
vi /etc/nagios/objects/commands.cfg
# Command that runs the check_mem.sh plugin; $ARG1$ is the warning
# percentage and $ARG2$ the critical percentage.
define command{
        command_name    check_mem
        command_line    $USER1$/check_mem.sh -w $ARG1$ -c $ARG2$
        }
在监控机上,编辑被监控机对应的配置文件并添加以下服务定义:vi /usr/local/nagios/etc/objects/linux37.cfg
# Service that invokes the NRPE command "check_mem" for this host.
define service{
        use                     generic-service
        host_name               Nagios_Centos_Client1
        service_description     Memery Monitoring
        check_command           check_nrpe!check_mem
        }
重启nagios即可:systemctl restart nagios
三、检查CPU,步骤同上:
把check_cpu.sh文件放在libexec目录下,内容如下。
#!/bin/bash
# Filename: check_cpu.sh
#
# Nagios/NRPE plugin that reports the percentage of CPU time in use.
#
# Usage:
#   check_cpu.sh -w <warning%> -c <critical%>
#
# CPU usage is derived from two readings of the aggregate "cpu" line of
# /proc/stat taken one second apart: "idle" is the share of the elapsed
# jiffies spent in the idle column, and "used" is 100 minus that share.
#
# Exit status (Nagios plugin convention):
#   0 OK, 1 WARNING, 2 CRITICAL,
#   3 UNKNOWN (bad arguments or /proc/stat unavailable).

# Print the help text and exit with the UNKNOWN status.
help() {
  echo -e "\n\tThis plugin shows the % of used CPU, measured from /proc/stat over a one-second interval\n\n\t$0:\n\t\t-c <integer>\tIf the % of used CPU is above <integer>, returns CRITICAL state\n\t\t-w <integer>\tIf the % of used CPU is below CRITICAL and above <integer>, returns WARNING state\n"
  exit 3
}

# Print "<total_jiffies> <idle_jiffies>" taken from the first line of
# /proc/stat.  Columns that are absent are counted as 0.  The guest columns
# are not added: per proc(5) guest time is already included in user/nice.
read_cpu_sample() {
  local label user nice system idle iowait irq softirq steal rest
  read -r label user nice system idle iowait irq softirq steal rest </proc/stat || return 1
  [[ "$label" == "cpu" ]] || return 1
  echo "$((user + nice + system + idle + ${iowait:-0} + ${irq:-0} + ${softirq:-0} + ${steal:-0})) $idle"
}

# Getting parameters:
warning=""
critical=""
while getopts "w:c:h" OPT; do
  case "$OPT" in
    w) warning=$OPTARG ;;
    c) critical=$OPTARG ;;
    h) help ;;
    *) help ;; # unknown option or missing option argument
  esac
done

# Checking parameters:
if [[ -z "$warning" || -z "$critical" ]]; then
  echo "ERROR: You must specify warning and critical levels"
  help
fi
if ! [[ "$warning" =~ ^[0-9]+$ && "$critical" =~ ^[0-9]+$ ]]; then
  echo "ERROR: warning and critical levels must be integers"
  help
fi
# 10# forces base 10 so that a value such as "080" is not parsed as octal.
warning=$((10#$warning))
critical=$((10#$critical))
if ((warning >= critical)); then
  echo "ERROR: critical level must be highter than warning level"
  help
fi

# Doing the actual check: two samples, one second apart.
if ! read -r total_before idle_before < <(read_cpu_sample 2>/dev/null); then
  echo "UNKNOWN - cannot read CPU counters from /proc/stat"
  exit 3
fi
sleep 1
if ! read -r total_after idle_after < <(read_cpu_sample 2>/dev/null); then
  echo "UNKNOWN - cannot read CPU counters from /proc/stat"
  exit 3
fi

elapsed=$((total_after - total_before))
if ((elapsed <= 0)); then
  # No jiffies elapsed between the samples; a percentage cannot be computed.
  echo "UNKNOWN - CPU counters did not advance between samples"
  exit 3
fi
idle=$(((idle_after - idle_before) * 100 / elapsed))
used=$((100 - idle))

# Comparing the result and setting the correct level:
if ((used >= critical)); then
  msg="CRITICAL"
  status=2
elif ((used >= warning)); then
  msg="WARNING"
  status=1
else
  msg="OK"
  status=0
fi

# Printing the results:
echo "$msg - CPU used=$used% idle=$idle% | 'CPU Usage'=$used%;$warning;$critical;"

# Bye!
exit $status
四、Nagios服务端check_nt检查命令介绍(用于监控Windows系统)
# Check the uptime since the last system boot
check_nt -H 192.168.1.121 -p 12489 -s 12345 -v UPTIME
# Check memory usage
check_nt -H 192.168.1.121 -p 12489 -s 12345 -v MEMUSE -w 80 -c 90
# Check the client version information
check_nt -H 192.168.1.121 -p 12489 -s 12345 -v CLIENTVERSION
# Check CPU usage over the last 5 minutes
check_nt -H 192.168.1.121 -p 12489 -s 12345 -v CPULOAD -w 80 -c 90 -l 5,80,90
# Check usage of disk C
check_nt -H 192.168.1.121 -p 12489 -s 12345 -v USEDDISKSPACE -d SHOWALL -l C
# Check the state of a service
check_nt -H 192.168.1.121 -p 12489 -s 12345 -v SERVICESTATE -l Spooler -d SHOWALL
# Check the state of a process
check_nt -H 192.168.1.121 -p 12489 -s 12345 -v PROCSTATE -l spark.exe -d SHOWALL
# List all process instances
check_nt -H 192.168.1.121 -p 12489 -s 12345 -v INSTANCES -l process
五、一个完整的windows配置文件
[root@nagios objects]# cd winserver
[root@nagios winserver]# vi winhost_172.cfg
# Define the monitored host: host name (must be unique), alias and IP address.
define host{
use windows-server
host_name winhost_172
alias ywzhou_pc
address 10.188.1.172
}
# Define the host group. Only one file under the winserver directory should
# define this host group; the other files must not define it again.
define hostgroup{
hostgroup_name windows-servers
alias Windows Servers
}
# Part 1: services based on the check_nt command.
# service_description values must not repeat within one configuration file.
# Monitor the NSClient++ client software version.
define service{
use generic-service
host_name winhost_172
service_description NSClient++ Version
check_command check_nt!CLIENTVERSION
}
# Monitor the uptime.
define service{
use generic-service
host_name winhost_172
service_description Uptime
check_command check_nt!UPTIME
}
# Monitor CPU load: WARNING above 80%, CRITICAL above 90%.
define service{
use generic-service
host_name winhost_172
service_description CPU Load
check_command check_nt!CPULOAD!-l 5,80,90
}
# Monitor memory usage: WARNING above 80%, CRITICAL above 90%.
define service{
use generic-service
host_name winhost_172
service_description Memory Usage
check_command check_nt!MEMUSE!-w 80 -c 90
}
# Monitor usage of drive C; copy this service block to monitor other drives.
define service{
use generic-service
host_name winhost_172
service_description C:\ Drive Space
check_command check_nt!USEDDISKSPACE!-l c -w 80 -c 90
}
# Monitor the state of a system service (whether it is started). W3SVC, used
# here by default, is the IIS service; copy this block to monitor other
# system services.
# Use the Services tab of Task Manager to decide which services are important
# enough to monitor, e.g. IIS or SQL Server.
define service{
use generic-service
host_name winhost_172
service_description W3SVC
check_command check_nt!SERVICESTATE!-d SHOWALL -l W3SVC
}
# Monitor the state of a program (whether it is running). Explorer.exe, used
# here by default, is the desktop shell process; copy this block to monitor
# other processes.
# Use the Processes tab of Task Manager to decide which processes are
# important enough to monitor.
define service{
use generic-service
host_name winhost_172
service_description Explorer
check_command check_nt!PROCSTATE!-d SHOWALL -l Explorer.exe
}
# Part 2: services based on the check plugins.
# The "Enable common check plugins" option was enabled when NSCP was installed.
# The check plugins are the monitoring plugins shipped with Nagios, located
# under /usr/local/nagios/libexec.
# Monitor the FTP service.
define service{
use generic-service
host_name winhost_172
service_description FTP
check_command check_ftp
}
# Monitor the HTTP service.
define service{
use generic-service
host_name winhost_172
service_description HTTP
check_command check_http
}
# Monitor the SSH service.
define service{
use generic-service
host_name winhost_172
service_description SSH
check_command check_ssh
}
# Monitor the DHCP service.
define service{
use generic-service
host_name winhost_172
service_description DHCP
check_command check_dhcp
}
# Monitor the POP3 service.
define service{
use generic-service
host_name winhost_172
service_description POP
check_command check_pop
}
# Monitor the IMAP service.
define service{
use generic-service
host_name winhost_172
service_description IMAP
check_command check_imap
}
# Monitor the SMTP service.
define service{
use generic-service
host_name winhost_172
service_description SMTP
check_command check_smtp
}
# Monitor a TCP port; commonly used to watch port state when several web
# sites are served on different ports.
define service{
use generic-service
host_name winhost_172
service_description TCP
check_command check_tcp!80
}