monit是一款功能强大的监控软件,可监控系统状态、进程、文件、目录和设备,运行于*nix平台。它可以自动重启那些已经挂掉的程序,非常适合监控系统关键的进程和资源,如:nginx、apache、mysql和CPU占用率等。而监控管理Python进程,常用的是supervisor,后续会另外撰文介绍。
下面分别介绍monit的安装、配置和启动。
安装
Step 1 创建 M/Monit 数据库
/usr/local/mysql/bin/mysqladmin create mmonit -uUSER -pPASSWD
/usr/local/mysql/bin/mysql -f -h localhost -uUSER -pPASSWD -e "use mysql; grant all privileges on mmonit.* to mmonit@localhost identified by 'mmonit';"
/usr/local/mysql/bin/mysql -f -h localhost -uUSER -pPASSWD -e "use mysql; flush privileges;"
查看数据库
mysql> show databases;
+--------------------+
| Database |
+--------------------+
| mmonit |
| mysql |
| test |
+--------------------+
Step 2 安装tar包
下载monit源码包 http://mmonit.com/download/
tar zxf monit-5.1.1.tar.gz
cd monit-5.1.1
/monit-5.1.1]# ./configure
/monit-5.1.1]# gmake
/monit-5.1.1]# gmake install
/usr/bin/install -c -m 755 -d /usr/local/bin || exit 1
/usr/bin/install -c -m 755 -d /usr/local/share/man/man1 || exit 1
/usr/bin/install -c -m 555 -s monit /usr/local/bin || exit 1
/usr/bin/install -c -m 444 monit.1 /usr/local/share/man/man1/monit.1 || exit 1
Step 3 复制配置文件
/monit-5.1.1]# cp monitrc /etc/
/mmonit-2.3.4/conf]# vi server.xml
<Realm url="mysql://mmonit:mmonit@127.0.0.1:3306/mmonit"
minConnections="5"
maxConnections="30"
reapConnections="300" />
配置
vi /etc/monitrc
# 检查周期,默认为2分钟,在这设置为90秒
set daemon 90
# with start delay 240 #这是可选项,首次检测延迟4分钟(默认情况下monit启动之后立即进行检测)
# 日志文件
set logfile /var/log/monit.log
# monit唯一标识(ID)文件存放的位置(注意:不是进程号,进程号文件由 set pidfile 指定)
set idfile /var/.monit.id
#monit状态文件
set statefile /var/.monit.state
# 邮件通知服务器
set mailserver 119.254.72.233
# 通知邮件的格式设置
set mail-format { from: monit@gby.dns.com.cn }
set mail-format { Subject: alert $HOST $SERVICE $DESCRIPTION }
# 设置邮件通知接收者。建议发到gmail,方便邮件过滤
set alert zhangzhn@dns.com.cn # receive all alerts
set alert zhaoyj@dns.com.cn # receive all alerts
# set alert aniya.zhao@gmail.com { timeout } #只有监控的服务超时的时候才发送邮件通知到该邮件地址
set httpd port 2812 and #设置http监控页面的端口
use address 119.254.72.248 #http监控页面的IP或域名
allow localhost #允许本地进行访问
allow 203.86.46.224/29 #允许此IP段访问
allow 203.86.63.133 #允许此IP访问
allow USER:PASSWD #允许访问的用户名密码
#############################################################################
## Services
###############################################################################
#
# 系统整体运行状况监控,默认的就可以,可以自己去微调
# 系统名称,可以是IP或域名
check system 119.254.72.248
if loadavg (1min) > 4 then alert
if loadavg (5min) > 2 then alert
if memory usage > 75% then alert
if cpu usage (user) > 70% then alert
if cpu usage (system) > 30% then alert
if cpu usage (wait) > 20% then alert
#-------------------->Http<----------------------#
check process apache with pidfile /usr/local/apache/logs/httpd.pid
start program = "/usr/local/apache/bin/apachectl start" with timeout 60 seconds
stop program = "/usr/local/apache/bin/apachectl stop"
if failed host 119.254.72.248 port 80 protocol http then restart
if cpu > 60% for 2 cycles then alert
if cpu > 80% for 5 cycles then restart
if children > 140 then restart
#-------------------->Http_post<----------------------#
check process httpd with pidfile /usr/local/http_post/logs/httpd.pid
start program = "/usr/local/http_post/bin/apachectl start" with timeout 60 seconds
stop program = "/usr/local/http_post/bin/apachectl stop"
if failed host 119.254.72.248 port 8080 protocol http then restart
#-------------------->Nginx<----------------------#
check process nginx with pidfile /usr/local/nginx/logs/nginx.pid
start program = "/usr/local/nginx/sbin/nginx start" with timeout 60 seconds
stop program = "/usr/local/nginx/sbin/nginx stop"
if failed host 119.254.72.248 port 81 protocol http then restart
#-------------------->Mysql<----------------------#
check process mysqld with pidfile /usr/local/mysql/var/m-g3.dns.com.cn.pid
start program = "/usr/local/mysql/bin/mysqlctl start 3306" with timeout 60 seconds
stop program = "/usr/local/mysql/bin/mysqlctl stop 3306"
if failed host 119.254.72.248 port 3306 protocol then restart
#-------------------->Nfs<----------------------#
check process nfsd with pidfile /var/run/mountd.pid
start program = "/etc/rc.d/nfsd start" with timeout 60 seconds
stop program = "/etc/rc.d/nfsd stop"
if failed host 119.254.72.248 port 2049 protocol nfsd then restart
#-------------------->Spam<----------------------#
check process Spamd with pidfile /var/run/spamd.pid
start program = "/usr/local/sbin/spamctl start" with timeout 60 seconds
stop program = "/usr/local/sbin/spamctl stop"
if 5 restarts within 5 cycles then timeout
if cpu usage > 99% for 5 cycles then alert
if mem usage > 99% for 5 cycles then alert
#磁盘空间
check filesystem root with path /
if space usage > 70% then alert
if inode usage > 85% then alert
check filesystem home with path /home
if space usage > 50% for 5 times within 15 cycles then alert
if inode usage > 85% then alert
check filesystem usr with path /usr
if space usage > 70% then alert
if inode usage > 85% then alert
check filesystem var with path /var
if space usage > 70% then alert
if inode usage > 85% then alert
----------------------------------------------------------------
根据我的生产需求,到目前为止算是配置完成
启动
# /usr/local/bin/monit -c /etc/monitrc
monit: generated unique Monit id 3fc046bad3fc56f8959f6b0fe6d9267d and stored to '/var/.monit.id'
Starting monit daemon with http interface at [119.254.72.248:2812]
# ps -aux |grep monit
root 78633 0.0 0.1 6360 2836 ?? S 2:49PM 0:00.01 /usr/local/bin/monit
------------------------------------------------------------
monit的web配置见下篇