Add more node-exporter alerts
continuous-integration/drone/push Build is passing Details

Source: https://awesome-prometheus-alerts.grep.to/rules.html
This commit is contained in:
jeltz 2021-04-02 22:55:51 +02:00
parent f80435cb31
commit 1c3127dbbe
1 changed file with 59 additions and 8 deletions

View File

@ -6,6 +6,7 @@
{%- endmacro %}
groups:
- name: alert.rules
rules:
@ -42,6 +43,55 @@ groups:
{{ raw('$value | printf "%.1f"') }}% de mémoire
libre
# Warns when swap usage, computed as (1 - SwapFree / SwapTotal) * 100,
# stays above 10% for 3 minutes.
- alert: HostSwapIsFillingUp
  expr: >-
    (1 - node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes) * 100 > 10
  for: 3m
  labels:
    severity: warning
  annotations:
    # raw(...) is a template macro defined outside this excerpt; presumably
    # it passes the Go-template expression through to Prometheus untouched
    # (TODO confirm). The value is rendered with one decimal place.
    summary: >-
      La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}%
# Critical when any node_hwmon_temp_celsius series reads above 75 for
# 3 minutes in a row.
- alert: HostPhysicalComponentTooHot
  expr: node_hwmon_temp_celsius > 75
  for: 3m
  labels:
    severity: critical
  annotations:
    # The summary interpolates the raw sample value without number formatting.
    summary: La température de l'hôte est de {{ raw('$value') }}°C
# Critical as soon as the hwmon critical-temperature alarm metric equals 1.
- alert: HostNodeOvertemperatureAlarm
  expr: node_hwmon_temp_crit_alarm_celsius == 1
  # No pending period: the alert fires on the first evaluation that matches.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: L'alarme de température de l'hôte est active
# Warns whenever the node_vmstat_oom_kill counter increased during the
# last minute.
- alert: HostOomKillDetected
  expr: increase(node_vmstat_oom_kill[1m]) > 0
  # No pending period: the alert fires on the first evaluation that matches.
  for: 0m
  labels:
    severity: warning
  annotations:
    # $value is the result of increase(); it is interpolated unformatted.
    summary: Le noyau a tué {{ raw('$value') }} processus (OOM killer)
# Warns whenever the node_edac_correctable_errors_total counter increased
# during the last minute.
- alert: HostEdacCorrectableErrorsDetected
  expr: increase(node_edac_correctable_errors_total[1m]) > 0
  # No pending period: the alert fires on the first evaluation that matches.
  for: 0m
  labels:
    severity: warning
  annotations:
    # Must be `printf`, not `print`: in Go templates `print` is fmt.Sprint,
    # which would emit the literal "%.1f" followed by the value instead of
    # formatting the value with one decimal place.
    summary: >-
      {{ raw('$value | printf "%.1f"') }} erreur(s) ont été
      corrigée(s) (EDAC)
- alert: OutOfDiskSpace
expr: >-
node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
@ -97,7 +147,7 @@ groups:
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 1m
for: 0m
labels:
severity: critical
annotations:
@ -106,7 +156,7 @@ groups:
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 3
for: 2m
for: 0m
labels:
severity: warning
annotations:
@ -115,7 +165,7 @@ groups:
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 4
for: 10m
for: 0m
labels:
severity: critical
annotations:
@ -124,7 +174,7 @@ groups:
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 5m
for: 3m
labels:
severity: critical
annotations:
@ -133,7 +183,7 @@ groups:
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 10m
for: 5m
labels:
severity: warning
annotations:
@ -144,7 +194,7 @@ groups:
expr: >-
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
< 3 * stddev_over_time(upsOutputVoltage[1d])
for: 10m
for: 5m
labels:
severity: warning
annotations:
@ -153,7 +203,7 @@ groups:
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 8
for: 1m
for: 0m
labels:
severity: warning
annotations:
@ -162,10 +212,11 @@ groups:
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 5
for: 1m
for: 0m
labels:
severity: critical
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
...