Separate AP/servers down alerts and add temperature alerts #46

Merged
jeltz merged 4 commits from prometheus_alerts into master 2021-04-03 17:28:42 +02:00

View file

@ -6,11 +6,12 @@
{%- endmacro %}
groups:
- name: alert.rules
rules:
- alert: InstanceDown
expr: up == 0
expr: up{instance!~".*.borne.auro.re$"} == 0
for: 3m
labels:
severity: critical
@ -18,6 +19,15 @@ groups:
summary: >-
Invisible depuis plus de 3 minutes
# Lower-severity "down" alert for targets whose instance label matches the
# borne.auro.re pattern, after 3 minutes of up == 0.
# NOTE(review): the dots in the matcher are unescaped, so each one matches any
# character. Tighten it only together with the complementary `!~` matcher of
# the InstanceDown rule, otherwise the two rules stop partitioning targets.
- alert: AccessPointDown
  expr: up{instance=~".*.borne.auro.re$"} == 0
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: Invisible depuis plus de 3 minutes
- alert: OutOfMemory
expr: >-
(
@ -33,6 +43,59 @@ groups:
{{ raw('$value | printf "%.1f"') }}% de mémoire
libre
# Warn when more than 10% of swap has been in use for 3 minutes.
# The expression computes the used share as (1 - free / total) * 100.
- alert: HostSwapIsFillingUp
  expr: >-
    ( 1 - ( node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes ) )
    * 100 > 10
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: >-
      La mémoire swap est utilisée à
      {{ raw('$value | printf "%.1f"') }}%
# Critical when any hwmon temperature series stays above 79 for 3 minutes.
# The summary reports the value plus the chip and sensor labels of the series.
- alert: HostPhysicalComponentTooHot
  expr: node_hwmon_temp_celsius > 79
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: >-
      La température de l'hôte est de
      {{ raw('$value') }}°C
      ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
# Critical, with no hold time (for: 0m), as soon as the hwmon
# critical-temperature alarm metric reads 1 for a chip/sensor pair.
- alert: HostNodeOvertemperatureAlarm
  expr: node_hwmon_temp_crit_alarm_celsius == 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      L'alarme de température de l'hôte est active
      ({{ raw('$labels.chip') }},
      {{ raw('$labels.sensor') }})
# Warn, with no hold time, whenever the OOM-kill counter has increased over
# the last minute.
- alert: HostOomKillDetected
  expr: increase(node_vmstat_oom_kill[1m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: >-
      Le noyau a tué {{ raw('$value') }} processus (OOM killer)
# Warn, with no hold time, whenever the EDAC correctable-error counter has
# increased over the last minute.
- alert: HostEdacCorrectableErrorsDetected
  expr: increase(node_edac_correctable_errors_total[1m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    # printf (not print) is required for the "%.1f" verb to be applied to the
    # value; print would emit the format string verbatim next to the number.
    summary: >-
      {{ raw('$value | printf "%.1f"') }} erreur(s) ont été
      corrigée(s) (EDAC)
- alert: OutOfDiskSpace
expr: >-
node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
@ -88,7 +151,7 @@ groups:
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 1m
for: 0m
labels:
severity: critical
annotations:
@ -97,7 +160,7 @@ groups:
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 3
for: 2m
for: 0m
labels:
severity: warning
annotations:
@ -106,7 +169,7 @@ groups:
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 4
for: 10m
for: 0m
labels:
severity: critical
annotations:
@ -115,7 +178,7 @@ groups:
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 5m
for: 3m
labels:
severity: critical
annotations:
@ -124,7 +187,7 @@ groups:
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 10m
for: 5m
labels:
severity: warning
annotations:
@ -132,8 +195,10 @@ groups:
Tension d'entrée de {{ raw('$value') }}V
# Warn when the UPS output voltage has stayed more than three standard
# deviations away from its own one-day average for 5 minutes.
# Both sides of the comparison use upsOutputVoltage, and the comparison is
# ">" so the rule fires on abnormal readings rather than on normal ones.
- alert: UpsWrongOutputVoltage
  expr: >-
    abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d]))
    > 3 * stddev_over_time(upsOutputVoltage[1d])
  for: 5m
  labels:
    severity: warning
  annotations:
@ -142,7 +207,7 @@ groups:
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 8
for: 1m
for: 0m
labels:
severity: warning
annotations:
@ -151,10 +216,11 @@ groups:
# Critical, with no hold time (for: 0m), once the UPS estimates fewer than
# 5 minutes of runtime remaining.
- alert: UpsTimeRemaining
  expr: upsEstimatedMinutesRemaining < 5
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      Autonomie restante de {{ raw('$value') }} min
...