本文通过 Prometheus Operator 框架一键安装 Prometheus、Alertmanager、Grafana,并配置企业微信 API 以及告警推送。搭建 Prometheus 的前提是已有可用的 k8s 环境。
Prometheus Operator是什么
Prometheus Operator是运行在Kubernetes之上的监控和告警工具。部署时不用创建和修改prometheus的配置文件,所有的操作通过创建prometheus自己的资源对象来实现。对于监控配置的修改可以做到实时生效。
Prometheus Operator的自定义资源(CustomResourceDefinitions CRD)
mkdir -p /home/prometheus
cd /home/prometheus
wget https://github.com/prometheus-operator/kube-prometheus/archive/refs/tags/v0.8.0.tar.gz
tar xf v0.8.0.tar.gz
1. 修改配置
root@K8s-master:~/test/prometheus# vim prometheus-prometheus.yaml
...
serviceMonitorSelector: {}
version: 2.26.0
storage:
volumeClaimTemplate:
spec:
storageClassName: managed-nfs-storage #注意修改为对应的sc
resources:
requests:
storage: 5Gi #适当扩大
root@K8s-master:~/test/prometheus# vim grafana-deployment.yaml
...
- name: grafana-storage
persistentVolumeClaim:
claimName: grafana-data # PVC 名称,需与 grafana-pvc.yaml 中 metadata.name 保持一致
...
root@K8s-master:~/test/prometheus# vim grafana-pvc.yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: grafana-data
namespace: monitoring
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: 5Gi # 扩大容量
storageClassName: aws-gps # 对应sc
root@K8s-master:~/test/prometheus# vim prometheus-service.yaml
...
spec:
type: LoadBalancer
externalIPs:
- 0.0.0.0 #prometheus访问的IP地址自行指定
ports:
- name: web
port: 9090
targetPort: web
nodePort: 9090
selector:
app: prometheus
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
prometheus: k8s
sessionAffinity: None
root@K8s-master:~/test/prometheus# vim grafana-service.yaml
...
spec:
type: LoadBalancer
externalIPs:
- 0.0.0.0 # 外网访问地址
ports:
- name: http
port: 3000
targetPort: http
nodePort: 3000
selector:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
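2. 一键化部署(在解压后的 kube-prometheus 目录下执行,先创建 CRD 与 operator,再创建其余资源):
kubectl create -f manifests/setup
kubectl create -f manifests/
kubectl get pods -n monitoring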
至此 Prometheus、Grafana、Alertmanager 已经完成安装并可访问,接下来进行服务自动发现配置。
3. 服务发现
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
kubectl create secret generic additional-configs --from-file=prometheus-additional.yaml -n monitoring
name: additional-configs
key: prometheus-additional.yaml
添加之后:
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
labels:
prometheus: k8s
name: k8s
namespace: monitoring
spec:
alerting:
alertmanagers:
- name: alertmanager-main
namespace: monitoring
port: web
baseImage: quay.io/prometheus/prometheus
nodeSelector:
beta.kubernetes.io/os: linux
replicas: 2
secrets:
- etcd-certs
resources:
requests:
memory: 400Mi
ruleSelector:
matchLabels:
prometheus: k8s
role: alert-rules
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
additionalScrapeConfigs:
name: additional-configs
key: prometheus-additional.yaml
serviceAccountName: prometheus-k8s
serviceMonitorNamespaceSelector: {}
serviceMonitorSelector: {}
version: v2.5.0
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus-k8s
rules:
- apiGroups:
- ""
resources:
- nodes
- services
- endpoints
- pods
- nodes/proxy
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- configmaps
- nodes/metrics
verbs:
- get
- nonResourceURLs:
- /metrics
verbs:
- get
annotations:
prometheus.io/scrape: "true"
直接复制以下模板,修改一些关键字即可使用
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
prometheus: k8s
role: alert-rules
name: <规则名称>
namespace: monitoring
spec:
groups:
- name: <组名>
rules:
- alert: <监控的项目名称>
expr: <promql语句> # 例如 total_error > 0
for: <第一次匹配到规则后的持续时间,到达持续时间之后又匹配到规则则触发告警> # 例如 5m
labels:
severity: page
annotations:
summary: <主题>
description: <描述>
配置 alertmanager-secret.yaml 文件
apiVersion: v1
kind: Secret
metadata:
labels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main
namespace: monitoring
stringData:
alertmanager.yaml: |-
"global":
"resolve_timeout": "5m"
"wechat_api_url": "https://qyapi.weixin.qq.com/cgi-bin/" #直接使用
"wechat_api_secret": "" #凭证
"wechat_api_corp_id": "" #企业id
"receivers":
- "name": "wechat"
"wechat_configs":
- "send_resolved": true
"to_party": "" #接收消息的部门id(企业微信通讯录中的部门ID)
"agent_id": ""
"corp_id": ""
"route":
"group_by": ["alertname"]
"group_interval": "30m"
"repeat_interval": "30m"
"group_wait": "3s"
"receiver": "wechat"
"repeat_interval": "1m"
type: Opaque
Kube Prometheus项目地址
https://github.com/prometheus-operator/kube-prometheus
项目的Helm安装包地址
https://github.com/helm/charts/blob/master/stable/prometheus-operator
Prometheus官网地址
https://prometheus.io/
Prometheus Operator项目地址
https://github.com/prometheus-operator/prometheus-operator/