Customizing kube-prometheus

Project links

Upstream: https://github.com/prometheus-operator/prometheus-operator
helm: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack

Using helm makes it convenient to modify the configuration.

Configuration approach

Before installing, customize via a values file; after editing, render the chart and review the output:

/etc/kubeasz/bin/helm upgrade prometheus --install -n monitor -f prom-values.yaml /etc/kubeasz/roles/cluster-addon/files/kube-prometheus-stack-45.23.0.tgz --dry-run > prometheus.yaml
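
Once the rendered manifests in prometheus.yaml look right, rerunning the same command without --dry-run performs the actual install or upgrade (a sketch; add --create-namespace only if the monitor namespace does not exist yet):

/etc/kubeasz/bin/helm upgrade prometheus --install -n monitor --create-namespace \
  -f prom-values.yaml /etc/kubeasz/roles/cluster-addon/files/kube-prometheus-stack-45.23.0.tgz

The customized prom-values.yaml referenced above: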

## Provide a k8s version to auto dashboard import script example: kubeTargetVersionOverride: 1.16.6
kubeTargetVersionOverride: "1.27.2"

## Configuration for alertmanager
alertmanager:
  alertmanagerSpec:
    image:
      registry: easzlab.io.local:5000
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: local-path
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 10Gi
  service:
    nodePort: 30902
    type: NodePort

  config:
    global:
      resolve_timeout: 5m
    inhibit_rules:
      - equal:
          - namespace
          - alertname
        source_matchers:
          - severity = critical
        target_matchers:
          - severity =~ warning|info
      - equal:
          - namespace
          - alertname
        source_matchers:
          - severity = warning
        target_matchers:
          - severity = info
      - equal:
          - namespace
        source_matchers:
          - alertname = InfoInhibitor
        target_matchers:
          - severity = info
    receivers:
      - name: web.hook
        webhook_configs:
          - send_resolved: false
            url: http://prometheus-webhook-adapter:8060/adapter/wx
      - name: "null"
    route:
      group_by:
        - namespace
      group_interval: 5m
      group_wait: 30s
      receiver: web.hook
      repeat_interval: 12h
      routes:
        - matchers:
            - alertname = "InfoInhibitor"
          receiver: "null"
        - matchers:
            - alertname = "Watchdog"
          receiver: "web.hook"
        - matchers:
            - "severity = critical"
          receiver: "web.hook"
    templates:
      - '/etc/alertmanager/config/*.tmpl'

## Using default values from https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml
grafana:
  defaultDashboardsTimezone: CST
  enabled: true
  adminUser: admin
  adminPassword: password
  image:
    repository: easzlab.io.local:5000/prometheus/grafana
  service:
    nodePort: 30903
    type: NodePort
  sidecar:
    image:
      repository: easzlab.io.local:5000/prometheus/k8s-sidecar
    skipTlsVerify: true

  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        - name: Prometheus
          type: prometheus
          uid: prometheus
          url: http://prometheus-kube-prometheus-prometheus:9090
          access: proxy
          isDefault: true
          jsonData:
            httpMethod: POST
            timeInterval: 30s

  persistence:
    type: pvc
    enabled: true
    storageClassName: local-path
    accessModes:
      - ReadWriteOnce
    size: 10Gi

  grafana.ini:
    paths:
      data: /var/lib/grafana/
      logs: /var/log/grafana
      plugins: /var/lib/grafana/plugins
      provisioning: /etc/grafana/provisioning
    analytics:
      check_for_updates: true
    log:
      mode: console
    grafana_net:
      url: https://grafana.net
    users:
      default_theme: light
    security:
      allow_embedding: true
    auth.anonymous:
      enabled: true
      org_role: Viewer
    auth.basic:
      enabled: false
    server:
      root_url: http://localhost/test/grafana/
      serve_from_sub_path: true

## Component scraping the kube api server
kubeApiServer:
  enabled: true

## Component scraping the kubelet and kubelet-hosted cAdvisor
kubelet:
  enabled: true

## Component scraping the kube controller manager
kubeControllerManager:
  enabled: true
  endpoints:
    - 172.20.19.51
    - 172.20.19.52
    - 172.20.19.53
  service:
    port: 10257
    targetPort: 10257
  serviceMonitor:
    https: true
    insecureSkipVerify: true
    serverName: localhost

## Component scraping coreDns. Use either this or kubeDns
coreDns:
  enabled: true

## Component scraping etcd
kubeEtcd:
  enabled: true
  endpoints:
    - 172.20.19.51
    - 172.20.19.52
    - 172.20.19.53
  service:
    port: 2379
    targetPort: 2379
  serviceMonitor:
    scheme: https
    insecureSkipVerify: true
    serverName: localhost
    caFile: /etc/prometheus/secrets/etcd-client-cert/etcd-ca
    certFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client
    keyFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client-key

## Component scraping kube scheduler
kubeScheduler:
  enabled: true
  endpoints:
    - 172.20.19.51
    - 172.20.19.52
    - 172.20.19.53
  service:
    port: 10259
    targetPort: 10259
  serviceMonitor:
    https: true
    insecureSkipVerify: true

## Component scraping kube proxy
kubeProxy:
  enabled: true
  endpoints:
    - 172.20.19.51
    - 172.20.19.52
    - 172.20.19.53
    - 172.20.19.54
    - 172.20.19.55

kubeStateMetrics:
  enabled: true

## Configuration for kube-state-metrics subchart
kube-state-metrics:
  image:
    registry: easzlab.io.local:5000
    repository: prometheus/kube-state-metrics

## Configuration for prometheus-node-exporter subchart
prometheus-node-exporter:
  image:
    registry: easzlab.io.local:5000
    repository: prometheus/node-exporter

## Manages Prometheus and Alertmanager components
prometheusOperator:
  enabled: true
  admissionWebhooks:
    enabled: true
    patch:
      enabled: true
      image:
        registry: easzlab.io.local:5000
        repository: prometheus/kube-webhook-certgen
        tag: v1.5.1
  image:
    registry: easzlab.io.local:5000
    repository: prometheus/prometheus-operator
  service:
    nodePort: 30899
    nodePortTls: 30900
    type: NodePort
  prometheusConfigReloader:
    image:
      registry: easzlab.io.local:5000
      repository: prometheus/prometheus-config-reloader

## Deploy a Prometheus instance
prometheus:
  enabled: true
  service:
    nodePort: 30901
    type: NodePort

  prometheusSpec:
    image:
      registry: easzlab.io.local:5000
    replicas: 1
    secrets:
      - etcd-client-cert

    additionalScrapeConfigsSecret:
      enabled: true
      name: additional-scrape-configs
      key: prometheus-additional.yaml

    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: local-path
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 10Gi

Changes can also be made after installation; the key resources are these CRDs:

alertmanagerconfigs.monitoring.coreos.com
alertmanagers.monitoring.coreos.com
podmonitors.monitoring.coreos.com
probes.monitoring.coreos.com
prometheuses.monitoring.coreos.com
prometheusrules.monitoring.coreos.com
servicemonitors.monitoring.coreos.com
thanosrulers.monitoring.coreos.com

Beyond these, there are the various ConfigMaps and Secrets.
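
To see what the chart created, list the operator-managed CRDs and the ConfigMaps/Secrets in the release namespace (plain kubectl; the examples assume the release is installed into monitor):

kubectl get crd | grep monitoring.coreos.com
kubectl get cm,secret -n monitor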

For example, to edit the Alertmanager configuration:

kubectl edit alertmanagerconfigs.monitoring.coreos.com -n monitor

apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
  labels:
    alertmanagerConfig: example
  name: config-example
  namespace: monitor
spec:
  receivers:
    - name: webhook
      webhookConfigs:
        - url: http://example.com/
  route:
    groupBy:
      - job
    groupInterval: 5m
    groupWait: 30s
    receiver: webhook
    repeatInterval: 12h

The operator merges this with the original configuration into a new, combined config.
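
To confirm the merge, inspect the configuration Secret that the operator generates for the Alertmanager instance (the Secret name below assumes a release named prometheus; its data is base64-encoded and, on newer operator versions, gzip-compressed):

kubectl -n monitor get secret alertmanager-prometheus-kube-prometheus-alertmanager-generated -o yaml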

Modifying Prometheus

Annotation-based auto-discovery (not recommended)

This stack does not ship the common setup where adding the annotation prometheus.io/scrape=true makes Prometheus pick the object up as a target automatically. Configuring it by hand is inconvenient, so ServiceMonitor and PodMonitor are generally used instead.

Example:

apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/port: "80"
    prometheus.io/scrape: "true"
  labels:
    app: test-frontend-250
  name: test-frontend-250-svc
  namespace: default
spec:
  ports:
    - name: test-frontend-250-http
      nodePort: 31250
      port: 80
      targetPort: 80
  selector:
    app: test-frontend-250
  type: NodePort

prometheus-additional.yaml, the newly added scrape configuration:

- job_name: 'kubernetes-service-endpoints'
  kubernetes_sd_configs:
    - role: endpoints
  relabel_configs:
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
      action: keep
      regex: true
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
      action: replace
      target_label: __scheme__
      regex: (https?)
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
      action: replace
      target_label: __metrics_path__
      regex: (.+)
    - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
      action: replace
      target_label: __address__
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
    - action: labelmap
      regex: __meta_kubernetes_service_label_(.+)
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_service_name]
      action: replace
      target_label: kubernetes_name

Create the Secret:

kubectl create secret generic additional-configs --from-file=prometheus-additional.yaml -n monitor
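
To update the Secret later without deleting and recreating it, a common kubectl pattern works here too:

kubectl create secret generic additional-configs --from-file=prometheus-additional.yaml \
  -n monitor --dry-run=client -o yaml | kubectl apply -f -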

Add the configuration to Prometheus:

kubectl edit prometheus -n monitor

spec:
  additionalScrapeConfigs:
    name: additional-configs
    key: prometheus-additional.yaml

Using ServiceMonitor, PodMonitor, and Probe

These resources are configured in much the same way.

PodMonitor discovers Pods.

ServiceMonitor discovers Services.

Probe is generally used for custom probing targets.

Check the configuration:

kubectl get prometheus -n monitor prometheus-kube-prometheus-prometheus -o yaml

spec:
  podMonitorSelector:
    matchLabels:
      release: prometheus

  probeSelector:
    matchLabels:
      release: prometheus

  ruleSelector:
    matchLabels:
      release: prometheus

  serviceMonitorSelector:
    matchLabels:
      release: prometheus

All of these objects must carry the label release: prometheus in order to be matched.
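
A quick way to verify the labels on the objects you create:

kubectl get servicemonitor,podmonitor,probe,prometheusrule -n monitor --show-labels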

Using PodMonitor

Because the Postgres Pods carry a label identifying the primary/replica role, targeting the Pods directly is more convenient.

PodMonitor-postgres.yaml

apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: crunchy-postgres-exporter-master
  namespace: monitor
  labels:
    release: prometheus
spec:
  podMetricsEndpoints:
    - interval: 30s
      port: exporter
      path: /metrics
      relabelings: # rewrite labels so the upstream alert rules and dashboards can be reused
        - replacement: master
          targetLabel: role
        - replacement: postgres-operator:test
          targetLabel: pg_cluster
        - replacement: test
          targetLabel: cluster
        - sourceLabels: [pod]
          targetLabel: deployment
          regex: (.*?)-\d+
          replacement: $1
          action: replace
        - sourceLabels: [__meta_kubernetes_pod_ip]
          targetLabel: ip
          action: replace
        - sourceLabels: [job]
          targetLabel: job
          regex: monitor/(.*)-(master|replica)
          replacement: $1
          action: replace
        - sourceLabels: [namespace]
          targetLabel: kubernetes_namespace
          action: replace

  namespaceSelector:
    matchNames:
      - postgres-operator
  selector:
    matchExpressions:
      - key: postgres-operator.crunchydata.com/role
        operator: In
        values: [master]

---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: crunchy-postgres-exporter-replica
  namespace: monitor
  labels:
    release: prometheus
spec:
  podMetricsEndpoints:
    - interval: 30s
      port: exporter
      path: /metrics
      relabelings:
        - replacement: replica
          targetLabel: role
        - replacement: postgres-operator:test
          targetLabel: pg_cluster
        - replacement: test
          targetLabel: cluster
        - sourceLabels: [pod]
          targetLabel: deployment
          regex: (.*?)-\d+
          replacement: $1
          action: replace
        - sourceLabels: [__meta_kubernetes_pod_ip]
          targetLabel: ip
          action: replace
        - sourceLabels: [job]
          targetLabel: job
          regex: monitor/(.*)-(master|replica)
          replacement: $1
          action: replace
        - sourceLabels: [namespace]
          targetLabel: kubernetes_namespace
          action: replace
  namespaceSelector:
    matchNames:
      - postgres-operator
  selector:
    matchExpressions:
      - key: postgres-operator.crunchydata.com/role
        operator: In
        values: [replica]

Configuring PrometheusRule

The default file (excerpt):

kind: ConfigMap
metadata:
  labels:
    app.kubernetes.io/name: postgres-operator-monitoring
    vendor: crunchydata
  name: alertmanager-rules-config
apiVersion: v1
data:
  crunchy-alert-rules-pg.yml: |
    groups:
      - name: alert-rules
        rules:
          - alert: PGExporterScrapeError
            expr: pg_exporter_last_scrape_error > 0
            for: 60s
            labels:
              service: postgresql
              severity: critical
              severity_num: 300
            annotations:
              summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'
          - alert: ExporterDown
            expr: avg_over_time(up[5m]) < 0.5
            for: 10s
            labels:
              service: system
              severity: critical
              severity_num: 300
            annotations:
              description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.'
              summary: 'Prometheus Exporter Service Down'

Convert it into a PrometheusRule and apply it:

kubectl apply -f prometheusrule-postgres.yaml

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    release: prometheus # objects carrying this label are matched
  name: postgres-operator.rule
  namespace: monitor
spec:
  groups:
    - name: postgres-operator-rule
      rules:
        - alert: PGExporterScrapeError
          expr: pg_exporter_last_scrape_error > 0
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'
        - alert: ExporterDown
          expr: avg_over_time(up[5m]) < 0.5
          for: 10s
          labels:
            service: system
            severity: critical
            severity_num: 300
          annotations:
            description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.'
            summary: 'Prometheus Exporter Service Down'
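
After applying, confirm the object exists and carries the release: prometheus label; once the operator reloads Prometheus, the group appears on the Prometheus /rules page:

kubectl get prometheusrule -n monitor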

Modifying alertmanager-config

Usually this does not need to be changed.

apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
  labels:
    alertmanagerConfig: postgres
  name: config-postgres
  namespace: monitor
spec:
  inhibitRules:
    - sourceMatch:
        - name: severity
          value: 'critical'
      targetMatch:
        - name: severity
          value: 'warning'
      equal: ['alertname', 'job', 'service']
  receivers:
    - name: webhook
      webhookConfigs:
        - url: http://example.com/
  route:
    receiver: webhook
    groupBy: [severity, service, job, alertname]
    groupWait: 30s
    groupInterval: 5m
    repeatInterval: 24h

Modifying Grafana

Grafana dashboard definitions (JSON): most open-source components ship their own, or they can be downloaded from grafana.com, or written by hand.

[root@test-17 dashboards]# ls -l
crud_details.json
pgbackrest.json
pod_details2.json
pod_details.json
postgres_overview.json
postgresql_details.json
postgresql_service_health.json
prometheus_alerts.json
query_statistics.json

Use all of the files above at once, or configure them one by one as needed.
In the JSON files, "datasource": "PROMETHEUS" needs to be replaced with "datasource": "Prometheus".

find . -type f -exec sed -i 's/"datasource": "PROMETHEUS"/"datasource": "Prometheus"/g' {} \;  
# grep -rl '"datasource": "PROMETHEUS"' . | xargs sed -i 's/"datasource": "PROMETHEUS"/"datasource": "Prometheus"/g'
kubectl -n monitor create cm grafana-postgres-overview --from-file=./

The ConfigMap must be labeled so that the Grafana sidecar can discover it dynamically.

The default label is grafana_dashboard=1; dashboards can also be organized into different folders.

kubectl -n monitor label cm grafana-postgres-overview grafana_dashboard=1

Inspect the dashboard provider:

[root@test-17 monitoring]# kubectl get cm -n monitor prometheus-grafana-config-dashboards -o yaml
apiVersion: v1
data:
  provider.yaml: |-
    apiVersion: 1
    providers:
      - name: 'sidecarProvider'
        orgId: 1
        folder: ''
        type: file
        disableDeletion: false
        allowUiUpdates: false
        updateIntervalSeconds: 30
        options:
          foldersFromFilesStructure: false
          path: /tmp/dashboards

Additional examples

Using ServiceMonitor

Example: monitoring Redis

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    release: prometheus
  name: redis
  namespace: redis
spec:
  endpoints:
    - interval: 30s
      port: http-metrics
  namespaceSelector:
    matchNames:
      - redis
  selector:
    matchLabels:
      app.kubernetes.io/component: metrics
      app.kubernetes.io/instance: redis
      app.kubernetes.io/name: redis

redis-svc

[root@test-17 monitoring]# kubectl get svc -n redis redis-metrics --show-labels 
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE LABELS
redis-metrics ClusterIP 10.68.207.80 <none> 9121/TCP 4h30m app.kubernetes.io/component=metrics,app.kubernetes.io/instance=redis,app.kubernetes.io/managed-by=Helm,app.kubernetes.io/name=redis,helm.sh/chart=redis-17.4.0

Details:

apiVersion: v1
kind: Service
metadata:
  labels:
    app.kubernetes.io/component: metrics
    app.kubernetes.io/instance: redis
  name: redis-metrics
  namespace: redis
spec:
  ports:
    - name: http-metrics
      port: 9121
      protocol: TCP
      targetPort: metrics # name of the containerPort inside the Pod
  selector:
    app.kubernetes.io/instance: redis
    app.kubernetes.io/name: redis

Using Probe

apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: ping
  namespace: monitor
  labels:
    release: prometheus
spec:
  jobName: ping # job name
  prober: # address of the blackbox exporter
    url: blackbox-exporter.monitor:19115
  module: icmp # probing module defined in the blackbox exporter config
  targets: # targets (either a static config or an ingress config)
    # ingress <Object>
    staticConfig: # if ingress is also configured, the static config takes precedence
      static:
        - https://www.baidu.com

With this in place, the probe works. Note that blackbox-exporter must be deployed first.
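
For reference, one way to deploy it is the prometheus-community chart (a sketch, not part of this setup; that chart's default service port is 9115, so either adjust its service values or point prober.url at the port actually exposed):

helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm install blackbox-exporter prometheus-community/prometheus-blackbox-exporter -n monitor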

Enabling OIDC for Grafana

grafana.ini

kubectl edit cm -n monitor grafana

[server]
protocol = http
domain = grafana.test.com
root_url = https://grafana.test.com/
[auth.generic_oauth]
enabled = true
allow_sign_up = true
auto_login = false
client_id = app_xxxxx
client_secret = CSHPuFPgkxXUtSbGwWLyDgeSwGWtsYq4PSy2GTovoMJVA3
scopes = openid profile email
auth_url = https://kzipufcn.aliyunidaas.com/login/app/app_xxxxx/oauth2/authorize
token_url = https://eiam-api-cn-hangzhou.aliyuncs.com/v2/idaas_sqsdywwjwwvzug45qq46ylbwdm/app_xxxxx/oauth2/token
api_url = https://eiam-api-cn-hangzhou.aliyuncs.com/v2/idaas_sqsdywwjwwvzug45qq46ylbwdm/app_xxxxx/oauth2/userinfo
redirect_uri = https://grafana.test.com/login/generic_oauth
email_attribute_path = email
role_attribute_path = contains(email, 'weilai@') && 'Admin' || endsWith(email, '@admin.com') && 'Admin' || endsWith(email, '@na.com') && 'Editor' || 'Viewer'
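
Grafana only reads grafana.ini at startup, so restart the deployment after editing the ConfigMap (the deployment name below assumes a release named prometheus):

kubectl -n monitor rollout restart deployment prometheus-grafana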