Separate AP/servers down alerts and add temperature alerts #46
1 changed files with 59 additions and 8 deletions
|
@ -6,6 +6,7 @@
|
||||||
{%- endmacro %}
|
{%- endmacro %}
|
||||||
|
|
||||||
groups:
|
groups:
|
||||||
|
|
||||||
- name: alert.rules
|
- name: alert.rules
|
||||||
rules:
|
rules:
|
||||||
|
|
||||||
|
@ -42,6 +43,55 @@ groups:
|
||||||
{{ raw('$value | printf "%.1f"') }}% de mémoire
|
{{ raw('$value | printf "%.1f"') }}% de mémoire
|
||||||
libre
|
libre
|
||||||
|
|
||||||
|
- alert: HostSwapIsFillingUp
|
||||||
|
expr: >-
|
||||||
|
(
|
||||||
|
1 - (
|
||||||
|
node_memory_SwapFree_bytes
|
||||||
|
/ node_memory_SwapTotal_bytes
|
||||||
|
)
|
||||||
|
) * 100 > 10
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}%
|
||||||
|
|
||||||
|
- alert: HostPhysicalComponentTooHot
|
||||||
|
expr: node_hwmon_temp_celsius > 75
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: La température de l'hôte est de {{ raw('$value') }}°C
|
||||||
|
|
||||||
|
- alert: HostNodeOvertemperatureAlarm
|
||||||
|
expr: node_hwmon_temp_crit_alarm_celsius == 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: L'alarme de température de l'hôte est active
|
||||||
|
|
||||||
|
- alert: HostOomKillDetected
|
||||||
|
expr: increase(node_vmstat_oom_kill[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Le noyau a tué {{ raw('$value') }} processus (OOM killer)
|
||||||
|
|
||||||
|
- alert: HostEdacCorrectableErrorsDetected
|
||||||
|
expr: increase(node_edac_correctable_errors_total[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
{{ raw('$value | printf "%.1f"') }} erreur(s) ont été
|
||||||
|
corrigée(s) (EDAC)
|
||||||
|
|
||||||
- alert: OutOfDiskSpace
|
- alert: OutOfDiskSpace
|
||||||
expr: >-
|
expr: >-
|
||||||
node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
|
node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
|
||||||
|
@ -97,7 +147,7 @@ groups:
|
||||||
|
|
||||||
- alert: UpsOutputSourceChanged
|
- alert: UpsOutputSourceChanged
|
||||||
expr: upsOutputSource != 3
|
expr: upsOutputSource != 3
|
||||||
for: 1m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -106,7 +156,7 @@ groups:
|
||||||
|
|
||||||
- alert: UpsBatteryStatus
|
- alert: UpsBatteryStatus
|
||||||
expr: upsBatteryStatus == 3
|
expr: upsBatteryStatus == 3
|
||||||
for: 2m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -115,7 +165,7 @@ groups:
|
||||||
|
|
||||||
- alert: UpsBatteryStatus
|
- alert: UpsBatteryStatus
|
||||||
expr: upsBatteryStatus == 4
|
expr: upsBatteryStatus == 4
|
||||||
for: 10m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -124,7 +174,7 @@ groups:
|
||||||
|
|
||||||
- alert: UpsHighLoad
|
- alert: UpsHighLoad
|
||||||
expr: upsOutputPercentLoad > 70
|
expr: upsOutputPercentLoad > 70
|
||||||
for: 5m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -133,7 +183,7 @@ groups:
|
||||||
|
|
||||||
- alert: UpsWrongInputVoltage
|
- alert: UpsWrongInputVoltage
|
||||||
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
||||||
for: 10m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -144,7 +194,7 @@ groups:
|
||||||
expr: >-
|
expr: >-
|
||||||
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
|
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
|
||||||
< 3 * stddev_over_time(upsOutputVoltage[1d])
|
< 3 * stddev_over_time(upsOutputVoltage[1d])
|
||||||
for: 10m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -153,7 +203,7 @@ groups:
|
||||||
|
|
||||||
- alert: UpsTimeRemaining
|
- alert: UpsTimeRemaining
|
||||||
expr: upsEstimatedMinutesRemaining < 8
|
expr: upsEstimatedMinutesRemaining < 8
|
||||||
for: 1m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -162,10 +212,11 @@ groups:
|
||||||
|
|
||||||
- alert: UpsTimeRemaining
|
- alert: UpsTimeRemaining
|
||||||
expr: upsEstimatedMinutesRemaining < 5
|
expr: upsEstimatedMinutesRemaining < 5
|
||||||
for: 1m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: >-
|
summary: >-
|
||||||
Autonomie restante de {{ raw('$value') }} min
|
Autonomie restante de {{ raw('$value') }} min
|
||||||
|
|
||||||
...
|
...
|
||||||
|
|
Loading…
Reference in a new issue