最新编写,配置文件均正常使用,prometheus监控根据需求添加相关exporter即可,配置grafana数据源为prometheus,添加对应面板
xxxxxx:~/prometheus+grafana$ tree -L 2
.
├── alertmanager
│ ├── alertmanager
│ ├── alertmanager.log
│ ├── alertmanager.yml
│ ├── amtool
│ ├── data
│ ├── LICENSE
│ ├── NOTICE
│ └── template
├── alertmanager-0.21.0.linux-amd64.tar.gz
├── blackbox_exporter
│ ├── blackbox_exporter
│ ├── blackbox.yml
│ ├── blackbox.yml.bak
│ ├── LICENSE
│ └── NOTICE
├── blackbox_exporter-0.18.0.linux-amd64.tar.gz
├── grafana
│ └── grafana_7.2.2_amd64.deb
├── node_exporter
│ ├── LICENSE
│ ├── node_exporter
│ └── NOTICE
├── node_exporter-1.0.1.linux-amd64.tar.gz
├── prometheus
│ ├── alert.rules.yml
│ ├── console_libraries
│ ├── consoles
│ ├── data
│ ├── LICENSE
│ ├── NOTICE
│ ├── prometheus
│ ├── prometheus.yml
│ ├── prometheus.yml.bak
│ ├── promtool
│ ├── rule.yml
│ └── services.yml
└── prometheus-2.22.0.linux-amd64.tar.gz
./blackbox_exporter --config.file=blackbox.yml --web.listen-address=:7995
xxxxx:~/prometheus+grafana/blackbox_exporter$ cat blackbox.yml
# Blackbox exporter probe definitions. Every probe is configured as a named
# module; Prometheus selects one through the `module` URL parameter.
modules:
  http_2xx:  # HTTP GET probe
    prober: http
    timeout: 10s
    http:
      # Pin the accepted status code explicitly so that Grafana panels show
      # an unambiguous pass/fail.
      valid_status_codes: [200]
      method: GET
      preferred_ip_protocol: "ip4"
  http_post_2xx:  # HTTP POST probe
    prober: http
    timeout: 10s
    http:
      method: POST
      preferred_ip_protocol: "ip4"
  tcp_connect:  # TCP connect probe
    prober: tcp
    timeout: 10s
# reload prometheus config
# precondition start as:
./prometheus --config.file=prometheus.yml --web.enable-lifecycle &
curl -X POST http://localhost:9090/-/reload
# configs
xxxxxx:~/prometheus+grafana/prometheus$ ls -l
total 161712
-rw-rw-r-- 1 ctdna ctdna 1272 10月 28 17:08 alert.rules.yml
drwxr-xr-x 2 ctdna ctdna 4096 10月 15 22:21 console_libraries
drwxr-xr-x 2 ctdna ctdna 4096 10月 15 22:21 consoles
drwxrwxr-x 12 ctdna ctdna 4096 10月 28 17:00 data
-rw-r--r-- 1 ctdna ctdna 11357 10月 15 22:21 LICENSE
-rw-r--r-- 1 ctdna ctdna 3420 10月 15 22:21 NOTICE
-rwxr-xr-x 1 ctdna ctdna 87729971 10月 15 20:32 prometheus
-rw-rw-r-- 1 ctdna ctdna 1429 10月 28 11:46 prometheus.yml
-rw-r--r-- 1 ctdna ctdna 926 10月 15 22:21 prometheus.yml.bak
-rwxr-xr-x 1 ctdna ctdna 77801407 10月 15 20:34 promtool
-rw-rw-r-- 1 ctdna ctdna 1037 10月 27 20:41 rule.yml
-rw-rw-r-- 1 ctdna ctdna 391 10月 28 15:54 services.yml
xxxxxx:~/prometheus+grafana/prometheus$ cat alert.rules.yml
# Alerting rules loaded by Prometheus through `rule_files`.
# Every rule must hold for its `for:` duration before the alert fires.
groups:
  - name: alert.rules
    rules:
      # CPU: 100 minus the idle percentage, averaged per host/project/type.
      - alert: cpu_usage_over_threshold
        expr: 100 - avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (alarmhost,alarmproject,alarmtype) * 100 > 90
        # Alarm duration
        for: 5m
        labels:
          severity: "critical"
        annotations:
          summary: "Host {{ $labels.alarmhost }} CPU usage continues to exceed the threshold for five minutes and is currently {{humanize $value}}%"

      # Memory: percentage of total memory that is not available.
      - alert: mem_usage_over_threshold
        expr: 100 - (node_memory_MemAvailable_bytes)/(node_memory_MemTotal_bytes)* 100 > 90
        for: 5m
        labels:
          severity: "critical"
        annotations:
          summary: "Host {{ $labels.alarmhost }} MEM usage continues to exceed the threshold for five minutes and is currently {{humanize $value}}%"

      # Root partition: used percentage of the filesystem mounted at "/".
      - alert: root_partition_usage_alert
        expr: (node_filesystem_size_bytes{device="rootfs",mountpoint="/"} -node_filesystem_free_bytes{device="rootfs",mountpoint="/"}) / node_filesystem_size_bytes{device="rootfs",mountpoint="/"} * 100 > 88
        for: 5m
        labels:
          severity: "critical"
        annotations:
          # The value is a percentage, not a byte count, so it is formatted
          # with `humanize` like the other rules.
          summary: "Host {{ $labels.instance }} Root partition used {{humanize $value}} % ,Please expand in time"

      # Disabled rule: outbound traffic above threshold.
      # - alert: output_traffic_excess_alarm
      #   expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30s]))by (alarmhost) ) / 102400) > 17
      #   for: 2m
      #   labels:
      #     severity: "critical"
      #   annotations:
      #     summary: "Warning! Host {{ $labels.alarmhost }} outbound network bandwidth has stayed above 17M for 2 minutes. Current value: {{humanize1024 $value}} "

      # Blackbox probe failure: probe_success is 0 when the probe did not pass.
      - alert: ServiceDown
        expr: probe_success == 0
        for: 300s
        labels:
          severity: "critical"
        annotations:
          summary: "Service {{ $labels.instance }} down"
          description: "Service {{ $labels.instance }} has been failing its blackbox probe for more than 5 minutes"
xxxxx:~/prometheus+grafana/prometheus$ cat prometheus.yml
# Main Prometheus configuration: global timing, the Alertmanager endpoint,
# the rule files to evaluate, and one scrape job per exporter.
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:7996']

rule_files:
  - '/home/ctdna/prometheus+grafana/prometheus/alert.rules.yml'

# The alarmhost / alarmtype / alarmproject labels attached to each target are
# the ones the alert rules aggregate on and print in their summaries.
scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets: ['192.168.23.98:9090']

  - job_name: node-exporter
    static_configs:
      - targets: ['ip:7999']
        labels:
          alarmhost: ip
          alarmtype: test
          alarmproject: node-exporter
      - targets: ['192.168.23.98:9100']
        labels:
          alarmhost: 192.168.23.98
          alarmtype: on-line
          alarmproject: node-exporter

  - job_name: cadvisor
    static_configs:
      - targets: ['ip:7701']
        labels:
          alarmhost: ip
          alarmtype: test
          alarmproject: cadvisor
      - targets: ['192.168.23.98:7701']
        labels:
          alarmhost: 192.168.23.98
          alarmtype: test
          alarmproject: cadvisor

  - job_name: process-exporter
    static_configs:
      - targets: ['192.168.23.98:7779']
        labels:
          alarmhost: 192.168.23.98
          alarmtype: test
          alarmproject: process-exporter
      - targets: ['ip:7999']
        labels:
          alarmhost: ip
          alarmtype: on-line
          alarmproject: process-exporter

  - job_name: alertmanager
    static_configs:
      - targets: ['192.168.23.98:7996']
        labels:
          alarmhost: 192.168.23.98
          alarmtype: test
          alarmproject: alertmanager

  # Probe jobs: the scrape goes to the blackbox exporter, which in turn probes
  # the target listed in services.yml.
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    file_sd_configs:
      - files: ['/home/ctdna/prometheus+grafana/prometheus/services.yml']
        refresh_interval: 10s
    relabel_configs:
      # Pass the discovered address to the exporter as ?target=...
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed address as the `instance` label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape to the blackbox exporter itself.
      # NOTE(review): the start command in these notes launches the exporter
      # with --web.listen-address=:7995; confirm which port is correct.
      - target_label: __address__
        replacement: 192.168.23.98:7998  # The blackbox exporter's real hostname:port.
xxxxx:~/prometheus+grafana/prometheus$ cat services.yml
# File-based service discovery list read by the `blackbox` scrape job.
# Each entry is a group of probe targets sharing one set of labels.
- targets:
    - 'ip:7701'
  labels:
    alarmproject: survey
    alarmhost: ip
    alarmtype: test

- targets:
    - 'ip:7999'
  labels:
    alarmproject: survey
    alarmhost: ip
    alarmtype: on-line

- targets:
    - 'ip:7798'
    - 'ip:7803'
    - 'ip:7795'
  labels:
    alarmproject: survey
    alarmhost: ip
    alarmtype: on-line

# Full URLs are allowed as targets; quoted so the `?` and `:` stay literal.
- targets:
    - 'http://ip:7796/admin/login/?next=/admin/'
  labels:
    alarmproject: survey
    alarmhost: ip
    alarmtype: on-line
# 检查配置文件是否正确
./promtool check config prometheus.yml
# start
# Redirections are applied left to right: stdout must be pointed at the log
# first and stderr duplicated onto it afterwards. The reverse order
# (`2>&1 > file`) leaves stderr on the original stdout, outside the log file.
nohup ./alertmanager --config.file=alertmanager.yml --web.listen-address=:7996 > alertmanager.log 2>&1 &
# configs
xxxxx:~/prometheus+grafana/alertmanager$ ls -l
total 51724
-rwxr-xr-x 1 ctdna ctdna 28871879 6月 17 16:54 alertmanager
-rw-rw-r-- 1 ctdna ctdna 68006 10月 28 15:47 alertmanager.log
-rw-r--r-- 1 ctdna ctdna 790 10月 28 15:35 alertmanager.yml
-rwxr-xr-x 1 ctdna ctdna 23987848 6月 17 16:55 amtool
drwxrwxr-x 2 ctdna ctdna 4096 10月 28 17:24 data
-rw-r--r-- 1 ctdna ctdna 11357 6月 17 17:34 LICENSE
-rw-r--r-- 1 ctdna ctdna 457 6月 17 17:34 NOTICE
drwxrwxr-x 2 ctdna ctdna 4096 10月 28 16:20 template
xxxxxx:~/prometheus+grafana/alertmanager$ cat alertmanager.yml
# Alertmanager configuration: SMTP settings, notification template, routing
# and a single e-mail receiver.
global:
  resolve_timeout: 5m
  smtp_from: 'xxxxx@qq.com'
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_auth_username: 'xxxxx@qq.com'
  smtp_auth_password: 'vxxxxxxbdf'
  # NOTE(review): presumably disabled because port 465 is an implicit-TLS
  # endpoint rather than STARTTLS - confirm against the mail provider.
  smtp_require_tls: false
  smtp_hello: 'qq.com'

templates:
  - './template/default.tmpl'

route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 10s
  repeat_interval: 5m
  receiver: 'email'

receivers:
  - name: 'email'
    email_configs:
      - to: '10xxxxxx@qq.com'
        send_resolved: true
        # Body rendered by the "default.html" template from the file listed
        # under `templates`.
        html: '{{ template "default.html" .}}'
        #headers: { Subject: "{{.GroupLabels.SortedPairs.values }} [{{ .Status | toUpper}}:{{ .Alerts.Firing | len }}]" }

# A firing critical alert mutes warning alerts that share the listed labels.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
xxxxxx:~/prometheus+grafana/alertmanager$ cat template/default.tmpl
{{ define "default.html" }}
{{- /* Firing section: list only the alerts that are currently firing. */ -}}
{{- /* NOTE(review): the rules in these notes attach alarmhost/instance labels; confirm that alerts really carry a "nodename" label. */ -}}
{{- if gt (len .Alerts.Firing) 0 -}}
[{{ .Status | toUpper }}:{{ .Alerts.Firing | len }}]
{{ range $i , $alert := .Alerts.Firing }}
<pre>
告警节点: {{ index $alert.Labels "nodename" }}
告警服务: {{ index $alert.Labels "alertname" }}
报警详情: {{ index $alert.Annotations "summary" }}
开始时间: {{ $alert.StartsAt.Local }}
</pre>
{{ end }}
{{ end }}
{{- /* Resolved section: list only resolved alerts and count those, not the firing ones. */ -}}
{{- if gt (len .Alerts.Resolved) 0 -}}
[RESOLVED:{{ .Alerts.Resolved | len }}]
{{ range $i , $alert := .Alerts.Resolved }}
<pre>
恢复节点: {{ index $alert.Labels "nodename" }}
恢复服务: {{ index $alert.Labels "alertname" }}
状 态: {{ $alert.Status }}
开始时间: {{ $alert.StartsAt.Local }}
恢复时间: {{ $alert.EndsAt.Local }}
</pre>
{{ end }}
{{ end }}
{{- end }}
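# 说明 (what the three route timing settings mean)
group_wait: 5s # how long to wait after a new alert group appears before sending its first notification
group_interval: 10s # how long to wait before notifying about new alerts added to a group that was already notified
repeat_interval: 5m # how long to wait before re-sending a notification for alerts that are still firing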
一键部署node+process+cadvisor
#! /bin/sh
# One-step installer for a single monitoring agent on the local host.
#
# Usage: <this-script> node-exporter | cadvisor | process-exporter
#
# node-exporter and process-exporter are unpacked from a release tarball that
# is uploaded interactively with `rz` into the current directory; cadvisor is
# started as a Docker container. An agent is only started when its listen
# port (7999 / 7998 / 7997) is still free.
#
# Exit status: 0 when the agent is found running afterwards, 1 otherwise
# (bad argument, unpack failure, port already taken, agent not running).

# Succeeds when a TCP socket is already listening on port $1.
# Matching ":<port> " keeps PIDs or longer port numbers that merely contain
# the same digits from counting as a hit.
port_in_use() {
    netstat -lntp 2>/dev/null | grep -q ":$1 "
}

# Succeeds when `ps -ef` lists a process whose line contains $1.
is_running() {
    ps -ef | grep "$1" | grep -v grep >/dev/null
}

# "${1:-}" keeps a missing argument from breaking the comparison; it falls
# through to the usage message instead.
case "${1:-}" in
node-exporter)
    yum -y install lrzsz
    # node-exporter install
    #wget https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz
    rz -bye
    # Stop on the first failed step so the exporter is never started from the
    # wrong directory.
    tar xf node_exporter-1.0.1.linux-amd64.tar.gz -C /usr/local/ || exit 1
    mv /usr/local/node_exporter-1.0.1.linux-amd64/ /usr/local/node_exporter || exit 1
    cd /usr/local/node_exporter/ || exit 1
    if port_in_use 7999; then
        echo "install node-exporter failed,port is already in use!"
        exit 1
    fi
    nohup ./node_exporter --web.listen-address=":7999" &
    if is_running node_exporter; then
        echo "successfully installed node-exporter"
        exit 0
    fi
    echo "failed to install node-exporter!"
    exit 1
    ;;
cadvisor)
    if port_in_use 7998; then
        echo "install cadvisor failed,port is already in use!"
        exit 1
    fi
    docker run -d -p 7998:8080 --name cadvisor -v /:/rootfs:ro -v /var/run:/var/run:rw -v /sys:/sys:ro -v /var/lib/docker/:/var/lib/docker:ro google/cadvisor:latest
    if docker ps | grep cadvisor >/dev/null; then
        echo "successfully installed cadvisor"
        exit 0
    fi
    echo "failed to install cadvisor!"
    exit 1
    ;;
process-exporter)
    # NOTE(review): the download URL below names v0.7.5 while the archive
    # unpacked here is 0.7.2 - confirm which release is intended.
    #wget https://github.com/ncabatoff/process-exporter/releases/download/v0.7.5/process-exporter-0.7.5.linux-amd64.tar.gz
    rz -bye
    tar xf process-exporter-0.7.2.linux-amd64.tar.gz -C /usr/local/ || exit 1
    mv /usr/local/process-exporter-0.7.2.linux-amd64/ /usr/local/process-exporter || exit 1
    cd /usr/local/process-exporter || exit 1
    # Write the process-matching config; the same file name is passed to
    # -config.path below. The quoted delimiter keeps the text literal.
    cat <<'EOF' >pro.yml
process_names:
  - name: "{{.Matches}}"
    cmdline:
      - '/www/server/nginx/sbin/nginx -c /www/server/nginx/conf/nginx.conf'
EOF
    if port_in_use 7997; then
        echo "install process-exporter failed ,port is already in use!"
        exit 1
    fi
    # nohup, as for node_exporter, so the agent survives the login session.
    nohup ./process-exporter -config.path pro.yml -web.listen-address=:7997 &
    if is_running process-exporter; then
        echo "successfully installed process-exporter"
        exit 0
    fi
    echo "failed to install process-exporter!"
    exit 1
    ;;
*)
    echo "usage: $0 node-exporter|cadvisor|process-exporter" >&2
    exit 1
    ;;
esac