Separate AP/servers down alerts and add temperature alerts #46
1 changed files with 59 additions and 8 deletions
|
@ -6,6 +6,7 @@
|
|||
{%- endmacro %}
|
||||
|
||||
groups:
|
||||
|
||||
- name: alert.rules
|
||||
rules:
|
||||
|
||||
|
@ -42,6 +43,55 @@ groups:
|
|||
{{ raw('$value | printf "%.1f"') }}% de mémoire
|
||||
libre
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: >-
|
||||
(
|
||||
1 - (
|
||||
node_memory_SwapFree_bytes
|
||||
/ node_memory_SwapTotal_bytes
|
||||
)
|
||||
) * 100 > 10
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: >-
|
||||
La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}%
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: node_hwmon_temp_celsius > 75
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: La température de l'hôte est de {{ raw('$value') }}°C
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: node_hwmon_temp_crit_alarm_celsius == 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: L'alarme de température de l'hôte est active
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: increase(node_vmstat_oom_kill[1m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Le noyau a tué {{ raw('$value') }} processus (OOM killer)
|
||||
|
||||
- alert: HostEdacCorrectableErrorsDetected
|
||||
expr: increase(node_edac_correctable_errors_total[1m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: >-
|
||||
{{ raw('$value | printf "%.1f"') }} erreur(s) ont été
|
||||
corrigée(s) (EDAC)
|
||||
|
||||
- alert: OutOfDiskSpace
|
||||
expr: >-
|
||||
node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
|
||||
|
@ -97,7 +147,7 @@ groups:
|
|||
|
||||
- alert: UpsOutputSourceChanged
|
||||
expr: upsOutputSource != 3
|
||||
for: 1m
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
@ -106,7 +156,7 @@ groups:
|
|||
|
||||
- alert: UpsBatteryStatus
|
||||
expr: upsBatteryStatus == 3
|
||||
for: 2m
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
@ -115,7 +165,7 @@ groups:
|
|||
|
||||
- alert: UpsBatteryStatus
|
||||
expr: upsBatteryStatus == 4
|
||||
for: 10m
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
@ -124,7 +174,7 @@ groups:
|
|||
|
||||
- alert: UpsHighLoad
|
||||
expr: upsOutputPercentLoad > 70
|
||||
for: 5m
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
@ -133,7 +183,7 @@ groups:
|
|||
|
||||
- alert: UpsWrongInputVoltage
|
||||
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
||||
for: 10m
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
@ -144,7 +194,7 @@ groups:
|
|||
expr: >-
|
||||
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
|
||||
< 3 * stddev_over_time(upsOutputVoltage[1d])
|
||||
for: 10m
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
@ -153,7 +203,7 @@ groups:
|
|||
|
||||
- alert: UpsTimeRemaining
|
||||
expr: upsEstimatedMinutesRemaining < 8
|
||||
for: 1m
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
@ -162,10 +212,11 @@ groups:
|
|||
|
||||
- alert: UpsTimeRemaining
|
||||
expr: upsEstimatedMinutesRemaining < 5
|
||||
for: 1m
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: >-
|
||||
Autonomie restante de {{ raw('$value') }} min
|
||||
|
||||
...
|
||||
|
|
Loading…
Reference in a new issue