Separate AP/servers down alerts and add temperature alerts #46

Merged
jeltz merged 4 commits from prometheus_alerts into master 2021-04-03 17:28:42 +02:00
Showing only changes of commit 1c3127dbbe - Show all commits

View file

@ -6,6 +6,7 @@
{%- endmacro %} {%- endmacro %}
groups: groups:
- name: alert.rules - name: alert.rules
rules: rules:
@ -42,6 +43,55 @@ groups:
{{ raw('$value | printf "%.1f"') }}% de mémoire {{ raw('$value | printf "%.1f"') }}% de mémoire
libre libre
# Warning alert on swap usage.
# The expression computes the used share of swap as
# (1 - SwapFree / SwapTotal) * 100 and fires once it has stayed above 10
# for 3 minutes.
# NOTE(review): metric names look like node_exporter's meminfo collector — confirm.
- alert: HostSwapIsFillingUp
expr: >-
(
1 - (
node_memory_SwapFree_bytes
/ node_memory_SwapTotal_bytes
)
) * 100 > 10
for: 3m
labels:
severity: warning
annotations:
# French summary; the alert value is formatted with one decimal place.
# NOTE(review): raw() is a Jinja macro defined outside this view; presumably
# it emits the Prometheus template expression unrendered — confirm.
summary: >-
La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}%
# Critical alert when a node_hwmon_temp_celsius series has been above 75
# for 3 minutes.
# NOTE(review): the threshold applies to every sensor exposing this metric;
# confirm 75 suits all monitored components.
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
for: 3m
labels:
severity: critical
annotations:
# French summary; the value is inserted as-is, without number formatting.
summary: La température de l'hôte est de {{ raw('$value') }}°C
# Critical alert that fires immediately (for: 0m) when
# node_hwmon_temp_crit_alarm_celsius equals 1.
# NOTE(review): presumably this metric is a 0/1 flag raised by the hardware
# sensor's critical-temperature alarm — confirm against the exporter.
- alert: HostNodeOvertemperatureAlarm
expr: node_hwmon_temp_crit_alarm_celsius == 1
for: 0m
labels:
severity: critical
annotations:
# Static French summary; no value is interpolated.
summary: L'alarme de température de l'hôte est active
# Warning alert that fires immediately (for: 0m) when node_vmstat_oom_kill
# increased over the last minute.
# NOTE(review): increase() over a 1m range needs at least two samples inside
# the window — confirm the scrape interval is shorter than 1m.
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
# French summary; the increase value is inserted without number formatting.
summary: Le noyau a tué {{ raw('$value') }} processus (OOM killer)
# Warning alert that fires immediately (for: 0m) when
# node_edac_correctable_errors_total increased over the last minute.
# NOTE(review): increase() over a 1m range needs at least two samples inside
# the window — confirm the scrape interval is shorter than 1m.
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
# French summary; the value is formatted with one decimal place.
# Uses printf (not print): in Go templates print is fmt.Sprint, which would
# emit the "%.1f" format string literally instead of applying it.
summary: >-
{{ raw('$value | printf "%.1f"') }} erreur(s) ont été
corrigée(s) (EDAC)
- alert: OutOfDiskSpace - alert: OutOfDiskSpace
expr: >- expr: >-
node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10 node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
@ -97,7 +147,7 @@ groups:
- alert: UpsOutputSourceChanged - alert: UpsOutputSourceChanged
expr: upsOutputSource != 3 expr: upsOutputSource != 3
for: 1m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
@ -106,7 +156,7 @@ groups:
- alert: UpsBatteryStatus - alert: UpsBatteryStatus
expr: upsBatteryStatus == 3 expr: upsBatteryStatus == 3
for: 2m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@ -115,7 +165,7 @@ groups:
- alert: UpsBatteryStatus - alert: UpsBatteryStatus
expr: upsBatteryStatus == 4 expr: upsBatteryStatus == 4
for: 10m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
@ -124,7 +174,7 @@ groups:
- alert: UpsHighLoad - alert: UpsHighLoad
expr: upsOutputPercentLoad > 70 expr: upsOutputPercentLoad > 70
for: 5m for: 3m
labels: labels:
severity: critical severity: critical
annotations: annotations:
@ -133,7 +183,7 @@ groups:
- alert: UpsWrongInputVoltage - alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 10m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@ -144,7 +194,7 @@ groups:
expr: >- expr: >-
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d])) abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
< 3 * stddev_over_time(upsOutputVoltage[1d]) < 3 * stddev_over_time(upsOutputVoltage[1d])
for: 10m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@ -153,7 +203,7 @@ groups:
- alert: UpsTimeRemaining - alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 8 expr: upsEstimatedMinutesRemaining < 8
for: 1m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@ -162,10 +212,11 @@ groups:
- alert: UpsTimeRemaining - alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 5 expr: upsEstimatedMinutesRemaining < 5
for: 1m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: >- summary: >-
Autonomie restante de {{ raw('$value') }} min Autonomie restante de {{ raw('$value') }} min
... ...