From 06f101527df401610b73879670371e8876909610 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Fri, 2 Apr 2021 13:57:34 +0200 Subject: [PATCH 1/4] Use a dynamic interval for UPS output voltage alerts Fire when the output voltage deviates from its one-day average by more than three one-day standard deviations. --- roles/prometheus/templates/alert.rules.yml.j2 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 index c5ba1c2..e6f10ce 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/alert.rules.yml.j2 @@ -132,7 +132,9 @@ groups: Tension d'entrée de {{ raw('$value') }}V - alert: UpsWrongOutputVoltage - expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) + expr: >- + abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d])) + > 3 * stddev_over_time(upsOutputVoltage[1d]) for: 10m labels: severity: warning From f80435cb314b67a1740861e41f3da0955ff2e14e Mon Sep 17 00:00:00 2001 From: Jeltz Date: Fri, 2 Apr 2021 21:54:38 +0200 Subject: [PATCH 2/4] Differentiate alerts for servers and Wi-Fi APs Instances matching *.borne.auro.re (access points) raise AccessPointDown as a warning; every other instance keeps the critical InstanceDown alert. --- roles/prometheus/templates/alert.rules.yml.j2 | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 index e6f10ce..97d5e22 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/alert.rules.yml.j2 @@ -10,7 +10,7 @@ groups: rules: - alert: InstanceDown - expr: up == 0 + expr: up{instance!~".*\\.borne\\.auro\\.re$"} == 0 for: 3m labels: severity: critical @@ -18,6 +18,15 @@ groups: summary: >- Invisible depuis plus de 3 minutes + - alert: AccessPointDown + expr: up{instance=~".*\\.borne\\.auro\\.re$"} == 0 + for: 3m + labels: + severity: warning + annotations: + summary: >- + Invisible depuis plus de 3 minutes + - alert: OutOfMemory expr: >- ( From 1c3127dbbe2dcbba663cba7e3e77776253d1f135 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Fri, 2 Apr 2021 22:55:51 +0200 Subject: [PATCH 3/4] Add more
node-exporter alerts Also shorten the "for" delays of the existing UPS alerts and end the document with "...". Source: https://awesome-prometheus-alerts.grep.to/rules.html --- roles/prometheus/templates/alert.rules.yml.j2 | 67 ++++++++++++++++--- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 index 97d5e22..7097e47 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/alert.rules.yml.j2 @@ -6,6 +6,7 @@ {%- endmacro %} groups: + - name: alert.rules rules: @@ -42,6 +43,55 @@ groups: {{ raw('$value | printf "%.1f"') }}% de mémoire libre + - alert: HostSwapIsFillingUp + expr: >- + ( + 1 - ( + node_memory_SwapFree_bytes + / node_memory_SwapTotal_bytes + ) + ) * 100 > 10 + for: 3m + labels: + severity: warning + annotations: + summary: >- + La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}% + + - alert: HostPhysicalComponentTooHot + expr: node_hwmon_temp_celsius > 75 + for: 3m + labels: + severity: critical + annotations: + summary: La température de l'hôte est de {{ raw('$value') }}°C + + - alert: HostNodeOvertemperatureAlarm + expr: node_hwmon_temp_crit_alarm_celsius == 1 + for: 0m + labels: + severity: critical + annotations: + summary: L'alarme de température de l'hôte est active + + - alert: HostOomKillDetected + expr: increase(node_vmstat_oom_kill[1m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Le noyau a tué {{ raw('$value') }} processus (OOM killer) + + - alert: HostEdacCorrectableErrorsDetected + expr: increase(node_edac_correctable_errors_total[1m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: >- + {{ raw('$value | printf "%.1f"') }} erreur(s) ont été + corrigée(s) (EDAC) + - alert: OutOfDiskSpace expr: >- node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10 @@ -97,7 +147,7 @@ groups: - alert: UpsOutputSourceChanged expr: upsOutputSource != 3 - for: 1m + for: 0m labels: severity: critical annotations: @@ -106,7 +156,7 @@
groups: - alert: UpsBatteryStatus expr: upsBatteryStatus == 3 - for: 2m + for: 0m labels: severity: warning annotations: @@ -115,7 +165,7 @@ groups: - alert: UpsBatteryStatus expr: upsBatteryStatus == 4 - for: 10m + for: 0m labels: severity: critical annotations: @@ -124,7 +174,7 @@ groups: - alert: UpsHighLoad expr: upsOutputPercentLoad > 70 - for: 5m + for: 3m labels: severity: critical annotations: @@ -133,7 +183,7 @@ groups: - alert: UpsWrongInputVoltage expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) - for: 10m + for: 5m labels: severity: warning annotations: @@ -144,7 +194,7 @@ groups: expr: >- abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d])) > 3 * stddev_over_time(upsOutputVoltage[1d]) - for: 10m + for: 5m labels: severity: warning annotations: @@ -153,7 +203,7 @@ groups: - alert: UpsTimeRemaining expr: upsEstimatedMinutesRemaining < 8 - for: 1m + for: 0m labels: severity: warning annotations: @@ -162,10 +212,11 @@ groups: - alert: UpsTimeRemaining expr: upsEstimatedMinutesRemaining < 5 - for: 1m + for: 0m labels: severity: critical annotations: summary: >- Autonomie restante de {{ raw('$value') }} min + ...
From 91817b324cbc64c752c7a89ec7344ba96816aeab Mon Sep 17 00:00:00 2001 From: Jeltz Date: Sat, 3 Apr 2021 08:04:10 +0200 Subject: [PATCH 4/4] Increase the alert threshold for temperatures Raise the HostPhysicalComponentTooHot threshold from 75 to 79 and append the chip and sensor labels to the summaries of both temperature alerts. --- roles/prometheus/templates/alert.rules.yml.j2 | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 index 7097e47..84d8aa2 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/alert.rules.yml.j2 @@ -59,12 +59,14 @@ groups: La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}% - alert: HostPhysicalComponentTooHot - expr: node_hwmon_temp_celsius > 75 + expr: node_hwmon_temp_celsius > 79 for: 3m labels: severity: critical annotations: - summary: La température de l'hôte est de {{ raw('$value') }}°C + summary: >- + La température de l'hôte est de {{ raw('$value') }}°C + ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }}) - alert: HostNodeOvertemperatureAlarm expr: node_hwmon_temp_crit_alarm_celsius == 1 @@ -72,7 +74,9 @@ groups: labels: severity: critical annotations: - summary: L'alarme de température de l'hôte est active + summary: >- + L'alarme de température de l'hôte est active + ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }}) - alert: HostOomKillDetected expr: increase(node_vmstat_oom_kill[1m]) > 0