Separate AP/servers down alerts and add temperature alerts #46
1 changed files with 76 additions and 10 deletions
|
@ -6,11 +6,12 @@
|
||||||
{%- endmacro %}
|
{%- endmacro %}
|
||||||
|
|
||||||
groups:
|
groups:
|
||||||
|
|
||||||
- name: alert.rules
|
- name: alert.rules
|
||||||
rules:
|
rules:
|
||||||
|
|
||||||
- alert: InstanceDown
|
- alert: InstanceDown
|
||||||
expr: up == 0
|
expr: up{instance!~".*\\.borne\\.auro\\.re$"} == 0
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
@ -18,6 +19,15 @@ groups:
|
||||||
summary: >-
|
summary: >-
|
||||||
Invisible depuis plus de 3 minutes
|
Invisible depuis plus de 3 minutes
|
||||||
|
|
||||||
|
- alert: AccessPointDown
|
||||||
|
expr: up{instance=~".*\\.borne\\.auro\\.re$"} == 0
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Invisible depuis plus de 3 minutes
|
||||||
|
|
||||||
- alert: OutOfMemory
|
- alert: OutOfMemory
|
||||||
expr: >-
|
expr: >-
|
||||||
(
|
(
|
||||||
|
@ -33,6 +43,59 @@ groups:
|
||||||
{{ raw('$value | printf "%.1f"') }}% de mémoire
|
{{ raw('$value | printf "%.1f"') }}% de mémoire
|
||||||
libre
|
libre
|
||||||
|
|
||||||
|
- alert: HostSwapIsFillingUp
|
||||||
|
expr: >-
|
||||||
|
(
|
||||||
|
1 - (
|
||||||
|
node_memory_SwapFree_bytes
|
||||||
|
/ node_memory_SwapTotal_bytes
|
||||||
|
)
|
||||||
|
) * 100 > 10
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}%
|
||||||
|
|
||||||
|
- alert: HostPhysicalComponentTooHot
|
||||||
|
expr: node_hwmon_temp_celsius > 79
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
La température de l'hôte est de {{ raw('$value') }}°C
|
||||||
|
({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
|
||||||
|
|
||||||
|
- alert: HostNodeOvertemperatureAlarm
|
||||||
|
expr: node_hwmon_temp_crit_alarm_celsius == 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
L'alarme de température de l'hôte est active
|
||||||
|
({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
|
||||||
|
|
||||||
|
- alert: HostOomKillDetected
|
||||||
|
expr: increase(node_vmstat_oom_kill[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Le noyau a tué {{ raw('$value') }} processus (OOM killer)
|
||||||
|
|
||||||
|
- alert: HostEdacCorrectableErrorsDetected
|
||||||
|
expr: increase(node_edac_correctable_errors_total[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
{{ raw('$value | printf "%.1f"') }} erreur(s) ont été
|
||||||
|
corrigée(s) (EDAC)
|
||||||
|
|
||||||
- alert: OutOfDiskSpace
|
- alert: OutOfDiskSpace
|
||||||
expr: >-
|
expr: >-
|
||||||
node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
|
node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
|
||||||
|
@ -88,7 +151,7 @@ groups:
|
||||||
|
|
||||||
- alert: UpsOutputSourceChanged
|
- alert: UpsOutputSourceChanged
|
||||||
expr: upsOutputSource != 3
|
expr: upsOutputSource != 3
|
||||||
for: 1m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -97,7 +160,7 @@ groups:
|
||||||
|
|
||||||
- alert: UpsBatteryStatus
|
- alert: UpsBatteryStatus
|
||||||
expr: upsBatteryStatus == 3
|
expr: upsBatteryStatus == 3
|
||||||
for: 2m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -106,7 +169,7 @@ groups:
|
||||||
|
|
||||||
- alert: UpsBatteryStatus
|
- alert: UpsBatteryStatus
|
||||||
expr: upsBatteryStatus == 4
|
expr: upsBatteryStatus == 4
|
||||||
for: 10m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -115,7 +178,7 @@ groups:
|
||||||
|
|
||||||
- alert: UpsHighLoad
|
- alert: UpsHighLoad
|
||||||
expr: upsOutputPercentLoad > 70
|
expr: upsOutputPercentLoad > 70
|
||||||
for: 5m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -124,7 +187,7 @@ groups:
|
||||||
|
|
||||||
- alert: UpsWrongInputVoltage
|
- alert: UpsWrongInputVoltage
|
||||||
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
||||||
for: 10m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -132,8 +195,10 @@ groups:
|
||||||
Tension d'entrée de {{ raw('$value') }}V
|
Tension d'entrée de {{ raw('$value') }}V
|
||||||
|
|
||||||
- alert: UpsWrongOutputVoltage
|
- alert: UpsWrongOutputVoltage
|
||||||
expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
|
expr: >-
|
||||||
for: 10m
|
abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d]))
|
||||||
|
> 3 * stddev_over_time(upsOutputVoltage[1d])
|
||||||
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -142,7 +207,7 @@ groups:
|
||||||
|
|
||||||
- alert: UpsTimeRemaining
|
- alert: UpsTimeRemaining
|
||||||
expr: upsEstimatedMinutesRemaining < 8
|
expr: upsEstimatedMinutesRemaining < 8
|
||||||
for: 1m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
|
@ -151,10 +216,11 @@ groups:
|
||||||
|
|
||||||
- alert: UpsTimeRemaining
|
- alert: UpsTimeRemaining
|
||||||
expr: upsEstimatedMinutesRemaining < 5
|
expr: upsEstimatedMinutesRemaining < 5
|
||||||
for: 1m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: >-
|
summary: >-
|
||||||
Autonomie restante de {{ raw('$value') }} min
|
Autonomie restante de {{ raw('$value') }} min
|
||||||
|
|
||||||
...
|
...
|
||||||
|
|
Loading…
Reference in a new issue