Separate AP/servers down alerts and add temperature alerts #46

Merged
jeltz merged 4 commits from prometheus_alerts into master 2021-04-03 17:28:42 +02:00
Showing only changes of commit 91817b324c - Show all commits

View file

@ -59,12 +59,14 @@ groups:
La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}% La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}%
- alert: HostPhysicalComponentTooHot - alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75 expr: node_hwmon_temp_celsius > 79
for: 3m for: 3m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: La température de l'hôte est de {{ raw('$value') }}°C summary: >-
La température de l'hôte est de {{ raw('$value') }}°C
({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
- alert: HostNodeOvertemperatureAlarm - alert: HostNodeOvertemperatureAlarm
expr: node_hwmon_temp_crit_alarm_celsius == 1 expr: node_hwmon_temp_crit_alarm_celsius == 1
@ -72,7 +74,9 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: L'alarme de température de l'hôte est active summary: >-
L'alarme de température de l'hôte est active
({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
- alert: HostOomKillDetected - alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0 expr: increase(node_vmstat_oom_kill[1m]) > 0