vim ../alertmanager/wechat.tmpl
{{ define "wechat.tmpl" }}
{{- /* WeChat message body: one entry per firing alert, followed by one entry per resolved alert. */ -}}
{{- /* Each section ranges over its own filtered list (.Alerts.Firing / .Alerts.Resolved) so that an alert is only rendered under the heading matching its state. */ -}}
{{- /* Timestamps are shifted by 28800e9 ns (8 hours) before formatting; presumably UTC to UTC+8 - confirm against the deployment's time zone. */ -}}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts.Firing }}
@异常警告 <br>
实例: {{ .Labels.instance }} <br>
信息: {{ .Annotations.summary }} <br>
详情: {{ .Annotations.description }} <br>
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
{{ end }}{{ end -}}
{{- /* Resolved entries also print the end time (EndsAt) of the alert. */ -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts.Resolved }}
@异常恢复 <br>
实例: {{ .Labels.instance }} <br>
信息: {{ .Annotations.summary }} <br>
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
恢复: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
{{ end }}{{ end -}}
{{- end }}
Awesome Prometheus alerts | Collection of alerting rules 告警规则模板大全
groups: 固定的
- name: 报警规则名称
rules:
- alert: 报警规则名称
expr: 触发条件
for: 10s 评估等待时间
labels: 自定义标签
severity: 严重级别 warning(警告) error(错误)
team: 标签 写不写都行
annotations: 用于描述告警详细信息
summary: 描述告警的概要信息
description: 用于描述告警的详细信息
{{ $labels.instance }}定义变量 $labels 对应自定义标签 instance 对应主机信息
报警规则可以配置多个规则
# Prometheus alerting rules for hosts, probes, MySQL, Redis and Elasticsearch.
# All rules live in a single group; every rule carries a `severity` label so
# that routing can match on one consistent label key.
groups:
  - name: 主机内存
    rules:
      # Fire when available host memory falls below 15% of total memory.
      - alert: 主机内存
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 15
        for: 15s
        labels:
          severity: warning
        annotations:
          summary: " 主机当前剩余内存 = {{ $value }}%"
          description: "{{ $labels.instance }}主机内存 < 15% "

      # Fire when a scrape target is down (up == 0).
      - alert: 节点运行状态
        expr: up == 0
        for: 10s
        labels:
          severity: warning
        annotations:
          summary: "运维部门的 {{ $labels.instance }} 服务已停止运行超过 10s!"

      # Fire when per-instance CPU usage (100 minus the idle percentage) exceeds 70%.
      - alert: CPU使用情况
        expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 70
        for: 10s
        labels:
          # Was `status`; renamed to match the `severity` key used by every other rule.
          severity: warning
        annotations:
          summary: "主机当前CPU使用率{{$value}}"
          description: "{{$labels.instance}}: CPU使用率大于 70%"

      # Fire when an ext4/xfs filesystem is more than 85% used.
      - alert: 磁盘容量
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 85
        for: 10s
        labels:
          severity: warning
        annotations:
          summary: "主机磁盘当前使用率{{ $value }}%"
          description: "{{$labels.instance}}磁盘分区使用率过高 {{$labels.mountpoint}} 磁盘分区使用大于85%"

      # Fire when a probe in the "端口监控" (port monitoring) group fails.
      - alert: 端口状态
        expr: probe_success{groups="端口监控"} == 0
        for: 10s
        labels:
          severity: warning
        annotations:
          description: 'API service: {{$labels.instance}} 端口检查失败,服务不可用,请检查'
          summary: 'API service: {{$labels.instance}} '

      # Fire when a probe in the "主机监控" (host monitoring) group fails.
      - alert: 主机状态
        expr: probe_success{groups="主机监控"} == 0
        for: 10s
        labels:
          severity: warning
        annotations:
          description: 'API service: {{$labels.instance}} 主机检查失败,主机不可用,请检查'
          summary: 'API service: {{$labels.instance}} '

      # Fire when the MySQL service is down (mysql_up == 0).
      - alert: Mysql状态
        expr: mysql_up == 0
        for: 10s
        labels:
          severity: warning
        annotations:
          summary: ' {{ $labels.instance }} Mysql服务 '
          description: " {{ $labels.instance }} Mysql服务不可用 请检查"

      # Fire when the MySQL replication IO thread has stopped.
      - alert: Mysql主从IO线程检测
        expr: mysql_slave_status_slave_io_running == 0
        for: 5s
        labels:
          severity: error
        annotations:
          summary: " {{ $labels.instance }} Mysql从节点IO线程"
          description: "{{ $labels.instance }} Mysql主从IO线程故障,请检测"

      # Fire when the MySQL replication SQL thread has stopped.
      - alert: Mysql主从sql线程检测
        expr: mysql_slave_status_slave_sql_running == 0
        for: 5s
        labels:
          severity: error
        annotations:
          summary: "{{ $labels.instance }} Mysql从节点sql线程"
          description: "{{ $labels.instance }} Mysql主从sql线程故障,请检测"

      # Fire when the Redis service is down (redis_up == 0).
      - alert: Redis状态
        expr: redis_up == 0
        for: 10s
        labels:
          severity: error
        annotations:
          summary: "{{ $labels.instance }} Redis服务"
          description: " {{ $labels.instance }} Redis服务不可用 请检查"

      # Fire when Redis has more than 5 connected clients. Per the original
      # note, redis_exporter's own connection is counted as one of them.
      - alert: Redis连接数
        expr: redis_connected_clients > 5
        for: 5s
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} Redis连接数"
          description: "{{ $labels.instance }}连接数异常\n 当前连接数 {{ $value }}"

      # Fire when the Elasticsearch service is down.
      - alert: ES 状态
        expr: elasticsearch_cluster_health_up == 0
        for: 10s
        labels:
          severity: warning
        annotations:
          summary: " {{ $labels.instance }} ES服务"
          description: " {{ $labels.instance }} ES服务不可用,请检查 "

      # Fire when the Elasticsearch cluster health status is red.
      - alert: ES节点健康状态
        expr: elasticsearch_cluster_health_status{color="red"} == 1
        for: 10s
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} ES节点健康状态"
          description: "{{ $labels.instance }} ES节点健康状态为红色,请检查"