Separate AP/servers down alerts and add temperature alerts #46

Merged
jeltz merged 4 commits from prometheus_alerts into master 2021-04-03 17:28:42 +02:00

View file

@ -6,11 +6,12 @@
{%- endmacro %} {%- endmacro %}
groups: groups:
- name: alert.rules - name: alert.rules
rules: rules:
- alert: InstanceDown - alert: InstanceDown
expr: up == 0 expr: up{instance!~".*.borne.auro.re$"} == 0
for: 3m for: 3m
labels: labels:
severity: critical severity: critical
@ -18,6 +19,15 @@ groups:
summary: >- summary: >-
Invisible depuis plus de 3 minutes Invisible depuis plus de 3 minutes
# AccessPointDown: a target whose instance label matches the borne.auro.re
# pattern has reported up == 0 continuously for 3 minutes (warning).
# NOTE(review): the dots in the pattern are unescaped, so each matches any
# character; confirm this is intended before tightening it.
- alert: AccessPointDown
  expr: >-
    up{instance=~".*.borne.auro.re$"} == 0
  for: 3m
  annotations:
    summary: Invisible depuis plus de 3 minutes
  labels:
    severity: warning
- alert: OutOfMemory - alert: OutOfMemory
expr: >- expr: >-
( (
@ -33,6 +43,59 @@ groups:
{{ raw('$value | printf "%.1f"') }}% de mémoire {{ raw('$value | printf "%.1f"') }}% de mémoire
libre libre
# HostSwapIsFillingUp: warn when the used share of swap, computed as
# (1 - SwapFree / SwapTotal) * 100, stays above 10 for 3 minutes.
- alert: HostSwapIsFillingUp
  expr: >-
    ( 1 - ( node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes ) )
    * 100 > 10
  for: 3m
  annotations:
    # raw() is a macro defined outside this excerpt; presumably it emits the
    # Prometheus template expression verbatim (TODO confirm).
    summary: >-
      La mémoire swap est utilisée à
      {{ raw('$value | printf "%.1f"') }}%
  labels:
    severity: warning
# HostPhysicalComponentTooHot: critical when a hwmon temperature reading
# stays above 79 for 3 minutes. The summary reports the value together with
# the chip and sensor labels of the offending series.
- alert: HostPhysicalComponentTooHot
  expr: >-
    node_hwmon_temp_celsius > 79
  for: 3m
  annotations:
    summary: >-
      La température de l'hôte est de
      {{ raw('$value') }}°C
      ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
  labels:
    severity: critical
# HostNodeOvertemperatureAlarm: critical as soon as (for: 0m) the hwmon
# critical-temperature alarm series equals 1. The summary names the chip and
# sensor labels of the series that raised it.
- alert: HostNodeOvertemperatureAlarm
  expr: >-
    node_hwmon_temp_crit_alarm_celsius == 1
  for: 0m
  annotations:
    summary: >-
      L'alarme de température de l'hôte est active
      ({{ raw('$labels.chip') }},
      {{ raw('$labels.sensor') }})
  labels:
    severity: critical
# HostOomKillDetected: warn immediately (for: 0m) when the OOM-kill counter
# increased over the last minute; the summary prints that increase.
- alert: HostOomKillDetected
  expr: >-
    increase(node_vmstat_oom_kill[1m]) > 0
  for: 0m
  annotations:
    summary: >-
      Le noyau a tué {{ raw('$value') }} processus (OOM killer)
  labels:
    severity: warning
# HostEdacCorrectableErrorsDetected: warn immediately (for: 0m) when the
# EDAC correctable-error counter increased over the last minute.
- alert: HostEdacCorrectableErrorsDetected
  expr: increase(node_edac_correctable_errors_total[1m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    # printf (not print) is required to apply the "%.1f" format; print would
    # emit the format string literally followed by the raw value.
    summary: >-
      {{ raw('$value | printf "%.1f"') }} erreur(s) ont été
      corrigée(s) (EDAC)
- alert: OutOfDiskSpace - alert: OutOfDiskSpace
expr: >- expr: >-
node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10 node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
@ -88,7 +151,7 @@ groups:
- alert: UpsOutputSourceChanged - alert: UpsOutputSourceChanged
expr: upsOutputSource != 3 expr: upsOutputSource != 3
for: 1m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
@ -97,7 +160,7 @@ groups:
- alert: UpsBatteryStatus - alert: UpsBatteryStatus
expr: upsBatteryStatus == 3 expr: upsBatteryStatus == 3
for: 2m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@ -106,7 +169,7 @@ groups:
- alert: UpsBatteryStatus - alert: UpsBatteryStatus
expr: upsBatteryStatus == 4 expr: upsBatteryStatus == 4
for: 10m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
@ -115,7 +178,7 @@ groups:
- alert: UpsHighLoad - alert: UpsHighLoad
expr: upsOutputPercentLoad > 70 expr: upsOutputPercentLoad > 70
for: 5m for: 3m
labels: labels:
severity: critical severity: critical
annotations: annotations:
@ -124,7 +187,7 @@ groups:
- alert: UpsWrongInputVoltage - alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 10m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@ -132,8 +195,10 @@ groups:
Tension d'entrée de {{ raw('$value') }}V Tension d'entrée de {{ raw('$value') }}V
# UpsWrongOutputVoltage (new side of the diff): fire when the output voltage
# deviates from its own 1-day average by MORE than 3 standard deviations for
# 5 minutes. The previous new-side expression compared the input voltage and
# used "<", which fires while the reading is within the normal band.
- alert: UpsWrongOutputVoltage - alert: UpsWrongOutputVoltage
expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) expr: >-
for: 10m abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d]))
> 3 * stddev_over_time(upsOutputVoltage[1d])
for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@ -142,7 +207,7 @@ groups:
- alert: UpsTimeRemaining - alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 8 expr: upsEstimatedMinutesRemaining < 8
for: 1m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@ -151,10 +216,11 @@ groups:
- alert: UpsTimeRemaining - alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 5 expr: upsEstimatedMinutesRemaining < 5
for: 1m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: >- summary: >-
Autonomie restante de {{ raw('$value') }} min Autonomie restante de {{ raw('$value') }} min
... ...