Merge pull request 'Separate AP/servers down alerts and add temperature alerts' (#46) from prometheus_alerts into master
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
Reviewed-on: Aurore/ansible#46
This commit is contained in:
commit
e2f5529498
1 changed files with 76 additions and 10 deletions
|
@ -6,11 +6,12 @@
|
|||
{%- endmacro %}
|
||||
|
||||
groups:
|
||||
|
||||
- name: alert.rules
|
||||
rules:
|
||||
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
expr: up{instance!~".*.borne.auro.re$"} == 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -18,6 +19,15 @@ groups:
|
|||
summary: >-
|
||||
Invisible depuis plus de 3 minutes
|
||||
|
||||
- alert: AccessPointDown
|
||||
expr: up{instance=~".*.borne.auro.re$"} == 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: >-
|
||||
Invisible depuis plus de 3 minutes
|
||||
|
||||
- alert: OutOfMemory
|
||||
expr: >-
|
||||
(
|
||||
|
@ -33,6 +43,59 @@ groups:
|
|||
{{ raw('$value | printf "%.1f"') }}% de mémoire
|
||||
libre
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: >-
|
||||
(
|
||||
1 - (
|
||||
node_memory_SwapFree_bytes
|
||||
/ node_memory_SwapTotal_bytes
|
||||
)
|
||||
) * 100 > 10
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: >-
|
||||
La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}%
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: node_hwmon_temp_celsius > 79
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: >-
|
||||
La température de l'hôte est de {{ raw('$value') }}°C
|
||||
({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: node_hwmon_temp_crit_alarm_celsius == 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: >-
|
||||
L'alarme de température de l'hôte est active
|
||||
({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: increase(node_vmstat_oom_kill[1m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Le noyau a tué {{ raw('$value') }} processus (OOM killer)
|
||||
|
||||
- alert: HostEdacCorrectableErrorsDetected
|
||||
expr: increase(node_edac_correctable_errors_total[1m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: >-
|
||||
{{ raw('$value | printf "%.1f"') }} erreur(s) ont été
|
||||
corrigée(s) (EDAC)
|
||||
|
||||
- alert: OutOfDiskSpace
|
||||
expr: >-
|
||||
node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
|
||||
|
@ -88,7 +151,7 @@ groups:
|
|||
|
||||
- alert: UpsOutputSourceChanged
|
||||
expr: upsOutputSource != 3
|
||||
for: 1m
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
@ -97,7 +160,7 @@ groups:
|
|||
|
||||
- alert: UpsBatteryStatus
|
||||
expr: upsBatteryStatus == 3
|
||||
for: 2m
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
@ -106,7 +169,7 @@ groups:
|
|||
|
||||
- alert: UpsBatteryStatus
|
||||
expr: upsBatteryStatus == 4
|
||||
for: 10m
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
@ -115,7 +178,7 @@ groups:
|
|||
|
||||
- alert: UpsHighLoad
|
||||
expr: upsOutputPercentLoad > 70
|
||||
for: 5m
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
@ -124,7 +187,7 @@ groups:
|
|||
|
||||
- alert: UpsWrongInputVoltage
|
||||
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
||||
for: 10m
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
@ -132,8 +195,10 @@ groups:
|
|||
Tension d'entrée de {{ raw('$value') }}V
|
||||
|
||||
- alert: UpsWrongOutputVoltage
|
||||
expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
|
||||
for: 10m
|
||||
expr: >-
|
||||
abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d]))
|
||||
> 3 * stddev_over_time(upsOutputVoltage[1d])
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
@ -142,7 +207,7 @@ groups:
|
|||
|
||||
- alert: UpsTimeRemaining
|
||||
expr: upsEstimatedMinutesRemaining < 8
|
||||
for: 1m
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
@ -151,10 +216,11 @@ groups:
|
|||
|
||||
- alert: UpsTimeRemaining
|
||||
expr: upsEstimatedMinutesRemaining < 5
|
||||
for: 1m
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: >-
|
||||
Autonomie restante de {{ raw('$value') }} min
|
||||
|
||||
...
|
||||
|
|
Loading…
Reference in a new issue