From dff0d9922c414210829e88c543e25810a7cdb110 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Tue, 30 Mar 2021 10:05:47 +0200 Subject: [PATCH 01/18] Store log.adm.auro.re local logs in /var/log/remote --- roles/docker/templates/update-motd.d/05-service.j2 | 3 --- roles/nginx/templates/update-motd.d/05-service.j2 | 3 --- roles/re2o_service/templates/update-motd.d/05-service.j2 | 3 --- roles/rsyslog_collector/templates/20-collector.conf.j2 | 3 +++ roles/unifi_controller/templates/update-motd.d/05-service.j2 | 3 --- 5 files changed, 3 insertions(+), 12 deletions(-) delete mode 100755 roles/docker/templates/update-motd.d/05-service.j2 delete mode 100755 roles/nginx/templates/update-motd.d/05-service.j2 delete mode 100755 roles/re2o_service/templates/update-motd.d/05-service.j2 delete mode 100755 roles/unifi_controller/templates/update-motd.d/05-service.j2 diff --git a/roles/docker/templates/update-motd.d/05-service.j2 b/roles/docker/templates/update-motd.d/05-service.j2 deleted file mode 100755 index ce5faaa..0000000 --- a/roles/docker/templates/update-motd.d/05-service.j2 +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -# {{ ansible_managed }} -echo "> Les recettes Docker-compose se trouvent dans /var/local/ansible-docker" diff --git a/roles/nginx/templates/update-motd.d/05-service.j2 b/roles/nginx/templates/update-motd.d/05-service.j2 deleted file mode 100755 index c52c655..0000000 --- a/roles/nginx/templates/update-motd.d/05-service.j2 +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/tail +14 -{{ ansible_managed | comment }} -> NGINX a été déployé sur cette machine. Voir /etc/nginx/. 
diff --git a/roles/re2o_service/templates/update-motd.d/05-service.j2 b/roles/re2o_service/templates/update-motd.d/05-service.j2 deleted file mode 100755 index 4ed8a74..0000000 --- a/roles/re2o_service/templates/update-motd.d/05-service.j2 +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -# {{ ansible_managed }} -echo "✨ Le service re2o {{ service_name }} est dans {{ service_homedir }}/{{ service_name }}" diff --git a/roles/rsyslog_collector/templates/20-collector.conf.j2 b/roles/rsyslog_collector/templates/20-collector.conf.j2 index 897945f..e8b277f 100644 --- a/roles/rsyslog_collector/templates/20-collector.conf.j2 +++ b/roles/rsyslog_collector/templates/20-collector.conf.j2 @@ -52,3 +52,6 @@ input( ruleset="handleIncomingLogs" ) {% endfor %} + +# Store local logs too +call handleIncomingLogs diff --git a/roles/unifi_controller/templates/update-motd.d/05-service.j2 b/roles/unifi_controller/templates/update-motd.d/05-service.j2 deleted file mode 100755 index b768773..0000000 --- a/roles/unifi_controller/templates/update-motd.d/05-service.j2 +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -# {{ ansible_managed }} -echo "> Le contrôleur Unifi a été déployé sur cette machine." From 8ab4159d383787bb6e46e02175761fab5aafb48b Mon Sep 17 00:00:00 2001 From: Jeltz Date: Tue, 30 Mar 2021 10:06:36 +0200 Subject: [PATCH 02/18] Don't try to configure rsyslog on Wi-Fi APs --- log.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/log.yml b/log.yml index fb8db3d..e63f59a 100644 --- a/log.yml +++ b/log.yml @@ -3,7 +3,7 @@ roles: - rsyslog_collector -- hosts: all +- hosts: all,!unifi roles: - rsyslog_common ... 
From ac05da71732ff900c5f017d4e07b479241c5aeae Mon Sep 17 00:00:00 2001 From: Jeltz Date: Tue, 30 Mar 2021 10:08:21 +0200 Subject: [PATCH 03/18] Use update_motd everywhere --- roles/baseconfig/tasks/main.yml | 5 ----- roles/docker/tasks/main.yml | 13 ++++++++----- roles/nginx/tasks/main.yml | 13 ++++++++----- roles/re2o_service/tasks/main.yml | 14 +++++++++----- roles/unifi_controller/tasks/main.yml | 13 ++++++++----- 5 files changed, 33 insertions(+), 25 deletions(-) diff --git a/roles/baseconfig/tasks/main.yml b/roles/baseconfig/tasks/main.yml index cdedd37..77cb834 100644 --- a/roles/baseconfig/tasks/main.yml +++ b/roles/baseconfig/tasks/main.yml @@ -32,11 +32,6 @@ - include_role: name: update_motd -- name: Remove Debian warranty motd - file: - path: /etc/motd - state: absent - # Configure APT mirrors on Debian Stretch - name: Configure APT mirrors when: diff --git a/roles/docker/tasks/main.yml b/roles/docker/tasks/main.yml index 66eed8c..5484f88 100644 --- a/roles/docker/tasks/main.yml +++ b/roles/docker/tasks/main.yml @@ -51,8 +51,11 @@ dest: /usr/local/bin/docker-compose mode: "0755" -- name: Indicate role in motd - template: - src: update-motd.d/05-service.j2 - dest: /etc/update-motd.d/05-docker - mode: 0755 +- include_role: + name: update_motd + vars: + motd_messages: + - key: 10-docker + message: >- + Docker est installé sur ce serveur. +... diff --git a/roles/nginx/tasks/main.yml b/roles/nginx/tasks/main.yml index a8fb885..146b93c 100644 --- a/roles/nginx/tasks/main.yml +++ b/roles/nginx/tasks/main.yml @@ -148,11 +148,13 @@ group: www-data mode: 0644 -- name: Indicate role in motd - template: - src: update-motd.d/05-service.j2 - dest: /etc/update-motd.d/05-nginx - mode: 0755 +- include_role: + name: update_motd + vars: + motd_messages: + - key: 10-nginx + message: >- + NGinx est installé sur ce serveur. Voir /etc/nginx. 
- name: Clean old files file: @@ -162,3 +164,4 @@ - "/etc/nginx/snippets/options-ssl.conf" - "/var/www/custom_401.html" - "/var/www/robots.txt" +... diff --git a/roles/re2o_service/tasks/main.yml b/roles/re2o_service/tasks/main.yml index 68e963c..882fdf9 100644 --- a/roles/re2o_service/tasks/main.yml +++ b/roles/re2o_service/tasks/main.yml @@ -40,8 +40,12 @@ group: nogroup state: link -- name: Indicate in motd service location - template: - src: update-motd.d/05-service.j2 - dest: "/etc/update-motd.d/05-re2o-{{ service_name }}" - mode: 0755 +- include_role: + name: update_motd + vars: + motd_messages: + - key: "15-re2o-service-{{ service_name }}" + message: >- + Le service re2o {{ service_name }} est dans + {{ service_homedir }}/{{ service_name }}. +... diff --git a/roles/unifi_controller/tasks/main.yml b/roles/unifi_controller/tasks/main.yml index 7f886f2..b43c74b 100644 --- a/roles/unifi_controller/tasks/main.yml +++ b/roles/unifi_controller/tasks/main.yml @@ -40,8 +40,11 @@ retries: 3 until: apt_result is succeeded -- name: Indicate role in motd - template: - src: update-motd.d/05-service.j2 - dest: /etc/update-motd.d/05-unifi-controller - mode: 0755 +- include_role: + name: update_motd + vars: + motd_messages: + - key: 10-unifi-controller + message: >- + Le contrôleur Unifi a été installé sur ce serveur. +... From e247aa3f70fe089c36d174f45a8d198ae36afb4c Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 05:21:08 +0200 Subject: [PATCH 04/18] Uniform labels for alerts --- roles/prometheus/handlers/main.yml | 1 + .../templates/prometheus/alert.rules.yml.j2 | 297 ++++++++++-------- .../templates/prometheus/prometheus.yml.j2 | 4 +- 3 files changed, 164 insertions(+), 138 deletions(-) diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml index 670847b..d501c14 100644 --- a/roles/prometheus/handlers/main.yml +++ b/roles/prometheus/handlers/main.yml @@ -8,3 +8,4 @@ service: name: prometheus-snmp-exporter state: restarted +... 
diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 index bf4127b..1de7e24 100644 --- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 @@ -1,138 +1,161 @@ -# {{ ansible_managed }} -{# As this is also Jinja2 it will conflict without a raw block #} -{# Depending of Prometheus Node exporter version, rules can change depending of version #} -{% raw %} +--- +{{ ansible_managed | comment }} + +{% macro raw(string) -%} +{{ string }} +{%- endmacro %} + +{% set instance = '[{{ raw("$label.instance") }}]' %} + groups: -- name: alert.rules - rules: - - # Alert for any instance that is unreachable for >3 minutes. - - alert: InstanceDown - expr: up == 0 - for: 3m - labels: - severity: critical - annotations: - summary: "{{ $labels.instance }} est invisible depuis plus de 3 minutes !" - - # Alert for out of memory - - alert: OutOfMemory - expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 10 - for: 5m - labels: - severity: warning - annotations: - summary: "Mémoire libre de {{ $labels.instance }} à {{ humanize $value }}%." - - # Alert for out of disk space - - alert: OutOfDiskSpace - expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10 - for: 5m - labels: - severity: warning - annotations: - summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ humanize $value }}%." - - # Alert for out of inode space on disk - - alert: OutOfInodes - expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10 - for: 5m - labels: - severity: warning - annotations: - summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}." 
- - # Alert for high CPU usage - - alert: CpuUsage - expr: (100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 75 - for: 10m - labels: - severity: warning - annotations: - summary: "CPU sur {{ $labels.instance }} à {{ humanize $value }}%." - - # Check systemd unit (> buster) - - alert: SystemdServiceFailed - expr: node_systemd_unit_state{state="failed"} == 1 - for: 10m - labels: - severity: warning - annotations: - summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}" - - # Check load of instance - - alert: LoadUsage - expr: node_load1 > 5 - for: 2m - labels: - severity: warning - annotations: - summary: "La charge de {{ $labels.instance }} est à {{ $value }} !" - - # Check UPS - - alert: UpsOutputSourceChanged - expr: upsOutputSource != 3 - for: 1m - labels: - severity: warning - annotations: - summary: "La source d'alimentation de {{ $labels.instance }} a changé !" - - - alert: UpsBatteryStatusWarning - expr: upsBatteryStatus == 3 - for: 2m - labels: - severity: warning - annotations: - summary: "L'état de la batterie de {{ $labels.instance }} est faible !" - - - alert: UpsBatteryStatusCritical - expr: upsBatteryStatus == 4 - for: 10m - labels: - severity: warning - annotations: - summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !" - - - alert: UpsHighLoad - expr: upsOutputPercentLoad > 70 - for: 5m - labels: - severity: critical - annotations: - summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !" - - - alert: UpsWrongInputVoltage - expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) - for: 10m - labels: - severity: warning - annotations: - summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V." - - - alert: UpsWrongOutputVoltage - expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) - for: 10m - labels: - severity: warning - annotations: - summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V." 
- - - alert: UpsTimeRemainingWarning - expr: upsEstimatedMinutesRemaining < 8 - for: 1m - labels: - severity: warning - annotations: - summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min." - - - alert: UpsTimeRemainingCritical - expr: upsEstimatedMinutesRemaining < 5 - for: 1m - labels: - severity: critical - annotations: - summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min." - - -{% endraw %} + - name: alert.rules + rules: + + - alert: InstanceDown + expr: up == 0 + for: 3m + labels: + severity: critical + annotations: + summary: >- + {{ instance }} Invisible depuis plus de 3 minutes + + - alert: OutOfMemory + expr: >- + ( + node_memory_MemFree_bytes + + node_memory_Cached_bytes + + node_memory_Buffers_bytes + ) / node_memory_MemTotal_bytes * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} {{ raw("$value | round") }}% de mémoire libre + + - alert: OutOfDiskSpace + expr: >- + node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} Espace libre de {{ raw("$labels.mountpoint") }} sur + à {{ raw("$value | round") }}% + + - alert: OutOfInodes + expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} {{ raw("$value | round") }}% d'inodes restants + pour {{ raw("$labels.mountpoint") }} + + - alert: CpuUsage + expr: >- + ( + 100 - avg by (instance) ( + irate(node_cpu_seconds_total{mode="idle"}[5m]) + ) * 100 + ) > 75 + for: 10m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} CPU à {{ raw("$value | round") }}% + + - alert: SystemdServiceFailed + expr: node_systemd_unit_state{state="failed"} == 1 + for: 10m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} {{ raw("$label.name") }} a échoué + + - alert: LoadUsage + expr: 
node_load1 > 5 + for: 2m + labels: + severity: warning + annotations: + summary: > + {{ instance }} Charge à {{ raw("$value") }} + + - alert: UpsOutputSourceChanged + expr: upsOutputSource != 3 + for: 1m + labels: + severity: critical + annotations: + summary: >- + {{ instance }} Source d'alimentation changée + + - alert: UpsBatteryStatusWarning + expr: upsBatteryStatus == 3 + for: 2m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} État de la batterie faible + + - alert: UpsBatteryStatusCritical + expr: upsBatteryStatus == 4 + for: 10m + labels: + severity: critical + annotations: + summary: >- + {{ instance }} État de la batterie critique + + - alert: UpsHighLoad + expr: upsOutputPercentLoad > 70 + for: 5m + labels: + severity: critical + annotations: + summary: >- + {{ instance }} Charge de {{ raw("$value | round") }}% + + - alert: UpsWrongInputVoltage + expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) + for: 10m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} Tension d'entrée de {{ raw("$value") }}V + + - alert: UpsWrongOutputVoltage + expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) + for: 10m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} Tension de sortie de {{ raw("$value") }}V + + - alert: UpsTimeRemainingWarning + expr: upsEstimatedMinutesRemaining < 8 + for: 1m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} Autonomie restante de {{ raw("$value") }} min + + - alert: UpsTimeRemainingCritical + expr: upsEstimatedMinutesRemaining < 5 + for: 1m + labels: + severity: critical + annotations: + summary: >- + {{ instance }} Autonomie restante de {{ raw("$value") }} min +... 
diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2 index 7399f48..e97e986 100644 --- a/roles/prometheus/templates/prometheus/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2 @@ -1,4 +1,5 @@ -# {{ ansible_managed }} +--- +{{ ansible_managed | comment }} global: # scrape_interval is set to the global default (60s) @@ -100,3 +101,4 @@ scrape_configs: file_sd_configs: - files: - '/etc/prometheus/targets_docker.json' +... From eeaf0f8486141e741643c3da24e78f7e0cbe76e2 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 06:00:47 +0200 Subject: [PATCH 05/18] Fix syntax errors --- .../templates/prometheus/alert.rules.yml.j2 | 37 ++++++++++--------- .../templates/prometheus/alert.rules.yml.j2 | 9 ++++- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 index 1de7e24..c958bac 100644 --- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 @@ -2,10 +2,10 @@ {{ ansible_managed | comment }} {% macro raw(string) -%} -{{ string }} +{{ "{{" }} {{ string }} {{ "}}" }} {%- endmacro %} -{% set instance = '[{{ raw("$label.instance") }}]' %} +{% set instance = "[{{ $labels.instance }}]" %} groups: - name: alert.rules @@ -15,10 +15,10 @@ groups: expr: up == 0 for: 3m labels: - severity: critical + severity: critical annotations: - summary: >- - {{ instance }} Invisible depuis plus de 3 minutes + summary: >- + {{ instance }} Invisible depuis plus de 3 minutes - alert: OutOfMemory expr: >- @@ -32,7 +32,8 @@ groups: severity: warning annotations: summary: >- - {{ instance }} {{ raw("$value | round") }}% de mémoire libre + {{ instance }} {{ raw('$value | printf "%.1f"') }}% de mémoire + libre - alert: OutOfDiskSpace expr: >- @@ -42,8 +43,8 @@ groups: severity: warning annotations: 
summary: >- - {{ instance }} Espace libre de {{ raw("$labels.mountpoint") }} sur - à {{ raw("$value | round") }}% + {{ instance }} Espace libre de {{ raw('$labels.mountpoint') }} sur + à {{ raw('$value | printf "%.1f"') }}% - alert: OutOfInodes expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 @@ -52,8 +53,8 @@ groups: severity: warning annotations: summary: >- - {{ instance }} {{ raw("$value | round") }}% d'inodes restants - pour {{ raw("$labels.mountpoint") }} + {{ instance }} {{ raw('$value | printf "%.1f"') }}% d'inodes + restants pour {{ raw('$labels.mountpoint') }} - alert: CpuUsage expr: >- @@ -67,7 +68,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} CPU à {{ raw("$value | round") }}% + {{ instance }} CPU à {{ raw('$value | printf "%.1f"') }}% - alert: SystemdServiceFailed expr: node_systemd_unit_state{state="failed"} == 1 @@ -76,7 +77,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} {{ raw("$label.name") }} a échoué + {{ instance }} {{ raw('$labels.name') }} a échoué - alert: LoadUsage expr: node_load1 > 5 @@ -85,7 +86,7 @@ groups: severity: warning annotations: summary: > - {{ instance }} Charge à {{ raw("$value") }} + {{ instance }} Charge à {{ raw('$value') }} - alert: UpsOutputSourceChanged expr: upsOutputSource != 3 @@ -121,7 +122,7 @@ groups: severity: critical annotations: summary: >- - {{ instance }} Charge de {{ raw("$value | round") }}% + {{ instance }} Charge de {{ raw('$value | printf "%.1f"') }}% - alert: UpsWrongInputVoltage expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) @@ -130,7 +131,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Tension d'entrée de {{ raw("$value") }}V + {{ instance }} Tension d'entrée de {{ raw('$value') }}V - alert: UpsWrongOutputVoltage expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) @@ -139,7 +140,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Tension de sortie de {{ raw("$value") 
}}V + {{ instance }} Tension de sortie de {{ raw('$value') }}V - alert: UpsTimeRemainingWarning expr: upsEstimatedMinutesRemaining < 8 @@ -148,7 +149,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Autonomie restante de {{ raw("$value") }} min + {{ instance }} Autonomie restante de {{ raw('$value') }} min - alert: UpsTimeRemainingCritical expr: upsEstimatedMinutesRemaining < 5 @@ -157,5 +158,5 @@ groups: severity: critical annotations: summary: >- - {{ instance }} Autonomie restante de {{ raw("$value") }} min + {{ instance }} Autonomie restante de {{ raw('$value') }} min ... diff --git a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 index d30511f..030e418 100644 --- a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 @@ -1,6 +1,12 @@ --- {{ ansible_managed | comment }} +{% macro raw(string) -%} +{{ "{{" }} {{ string }} {{ "}}" }} +{%- endmacro %} + +{% set instance = "[{{ $labels.instance }}]" %} + groups: - name: alert.rules rules: @@ -11,6 +17,5 @@ groups: severity: critical annotations: summary: >- - Federate : {{ "{{" }} $labels.instance {{ "}}" }} est invisible - depuis plus de 3 minutes ! + {{ instance }} Invisible depuis plus de 3 minutes ... 
From 5bcc42889550705126b286fa932bf4d15b90c665 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 09:36:11 +0200 Subject: [PATCH 06/18] Remove 'instance' from description and fix typos --- .../templates/prometheus/alert.rules.yml.j2 | 40 +++++++++---------- .../templates/prometheus/alert.rules.yml.j2 | 4 +- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 index c958bac..275f0a1 100644 --- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 @@ -5,8 +5,6 @@ {{ "{{" }} {{ string }} {{ "}}" }} {%- endmacro %} -{% set instance = "[{{ $labels.instance }}]" %} - groups: - name: alert.rules rules: @@ -18,7 +16,7 @@ groups: severity: critical annotations: summary: >- - {{ instance }} Invisible depuis plus de 3 minutes + Invisible depuis plus de 3 minutes - alert: OutOfMemory expr: >- @@ -32,7 +30,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} {{ raw('$value | printf "%.1f"') }}% de mémoire + {{ raw('$value | printf "%.1f"') }}% de mémoire libre - alert: OutOfDiskSpace @@ -43,7 +41,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Espace libre de {{ raw('$labels.mountpoint') }} sur + Espace libre de {{ raw('$labels.mountpoint') }} sur à {{ raw('$value | printf "%.1f"') }}% - alert: OutOfInodes @@ -53,7 +51,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} {{ raw('$value | printf "%.1f"') }}% d'inodes + {{ raw('$value | printf "%.1f"') }}% d'inodes restants pour {{ raw('$labels.mountpoint') }} - alert: CpuUsage @@ -68,7 +66,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} CPU à {{ raw('$value | printf "%.1f"') }}% + CPU à {{ raw('$value | printf "%.1f"') }}% - alert: SystemdServiceFailed expr: node_systemd_unit_state{state="failed"} == 1 @@ -77,7 +75,7 @@ groups: severity: 
warning annotations: summary: >- - {{ instance }} {{ raw('$labels.name') }} a échoué + {{ raw('$labels.name') }} a échoué - alert: LoadUsage expr: node_load1 > 5 @@ -86,7 +84,7 @@ groups: severity: warning annotations: summary: > - {{ instance }} Charge à {{ raw('$value') }} + Charge à {{ raw('$value') }} - alert: UpsOutputSourceChanged expr: upsOutputSource != 3 @@ -95,25 +93,25 @@ groups: severity: critical annotations: summary: >- - {{ instance }} Source d'alimentation changée + Source d'alimentation changée - - alert: UpsBatteryStatusWarning + - alert: UpsBatteryStatus expr: upsBatteryStatus == 3 for: 2m labels: severity: warning annotations: summary: >- - {{ instance }} État de la batterie faible + État de la batterie faible - - alert: UpsBatteryStatusCritical + - alert: UpsBatteryStatus expr: upsBatteryStatus == 4 for: 10m labels: severity: critical annotations: summary: >- - {{ instance }} État de la batterie critique + État de la batterie critique - alert: UpsHighLoad expr: upsOutputPercentLoad > 70 @@ -122,7 +120,7 @@ groups: severity: critical annotations: summary: >- - {{ instance }} Charge de {{ raw('$value | printf "%.1f"') }}% + Charge de {{ raw('$value | printf "%.1f"') }}% - alert: UpsWrongInputVoltage expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) @@ -131,7 +129,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Tension d'entrée de {{ raw('$value') }}V + Tension d'entrée de {{ raw('$value') }}V - alert: UpsWrongOutputVoltage expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) @@ -140,23 +138,23 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Tension de sortie de {{ raw('$value') }}V + Tension de sortie de {{ raw('$value') }}V - - alert: UpsTimeRemainingWarning + - alert: UpsTimeRemaining expr: upsEstimatedMinutesRemaining < 8 for: 1m labels: severity: warning annotations: summary: >- - {{ instance }} Autonomie restante de {{ raw('$value') }} min + Autonomie restante de {{ raw('$value') 
}} min - - alert: UpsTimeRemainingCritical + - alert: UpsTimeRemaining expr: upsEstimatedMinutesRemaining < 5 for: 1m labels: severity: critical annotations: summary: >- - {{ instance }} Autonomie restante de {{ raw('$value') }} min + Autonomie restante de {{ raw('$value') }} min ... diff --git a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 index 030e418..95f457e 100644 --- a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 @@ -5,8 +5,6 @@ {{ "{{" }} {{ string }} {{ "}}" }} {%- endmacro %} -{% set instance = "[{{ $labels.instance }}]" %} - groups: - name: alert.rules rules: @@ -17,5 +15,5 @@ groups: severity: critical annotations: summary: >- - {{ instance }} Invisible depuis plus de 3 minutes + Invisible depuis plus de 3 minutes ... From bc35cd8e908bbb9d5d2b9093aaac8ca3b175ce70 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 09:40:22 +0200 Subject: [PATCH 07/18] Move templates of the prometheus role --- roles/prometheus/tasks/main.yml | 23 ++++++++----------- .../{prometheus => }/alert.rules.yml.j2 | 0 .../{prometheus => }/django.rules.yml.j2 | 0 .../{prometheus => }/prometheus.yml.j2 | 0 .../templates/{prometheus => }/snmp.yml.j2 | 0 5 files changed, 10 insertions(+), 13 deletions(-) rename roles/prometheus/templates/{prometheus => }/alert.rules.yml.j2 (100%) rename roles/prometheus/templates/{prometheus => }/django.rules.yml.j2 (100%) rename roles/prometheus/templates/{prometheus => }/prometheus.yml.j2 (100%) rename roles/prometheus/templates/{prometheus => }/snmp.yml.j2 (100%) diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index a3d2063..3a590c9 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -11,20 +11,16 @@ - name: Configure Prometheus template: - src: prometheus/prometheus.yml.j2 + src: prometheus.yml.j2 
dest: /etc/prometheus/prometheus.yml - mode: 0644 - notify: Restart Prometheus - -- name: Configure Prometheus alert rules - template: - src: "prometheus/{{ item }}.j2" - dest: "/etc/prometheus/{{ item }}" - mode: 0644 - notify: Restart Prometheus + owner: prometheus + group: prometheus + mode: u=r,g=r,o= loop: + - prometheus.yml - alert.rules.yml - django.rules.yml + notify: Restart Prometheus - name: Make Prometheus snmp-exporter listen on localhost only lineinfile: @@ -36,10 +32,11 @@ # This file store SNMP OIDs - name: Configure Prometheus snmp-exporter template: - src: "prometheus/snmp.yml.j2" - dest: "/etc/prometheus/snmp.yml" - mode: 0600 + src: snmp.yml.j2 + dest: /etc/prometheus/snmp.yml owner: prometheus + group: prometheus + mode: u=r,g=r,o= notify: Restart prometheus-snmp-exporter # We don't need to restart Prometheus when updating nodes diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 similarity index 100% rename from roles/prometheus/templates/prometheus/alert.rules.yml.j2 rename to roles/prometheus/templates/alert.rules.yml.j2 diff --git a/roles/prometheus/templates/prometheus/django.rules.yml.j2 b/roles/prometheus/templates/django.rules.yml.j2 similarity index 100% rename from roles/prometheus/templates/prometheus/django.rules.yml.j2 rename to roles/prometheus/templates/django.rules.yml.j2 diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 similarity index 100% rename from roles/prometheus/templates/prometheus/prometheus.yml.j2 rename to roles/prometheus/templates/prometheus.yml.j2 diff --git a/roles/prometheus/templates/prometheus/snmp.yml.j2 b/roles/prometheus/templates/snmp.yml.j2 similarity index 100% rename from roles/prometheus/templates/prometheus/snmp.yml.j2 rename to roles/prometheus/templates/snmp.yml.j2 From a743ce09fb29fbfe9436f6384135abd95a69de14 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 
09:42:31 +0200 Subject: [PATCH 08/18] Move templates of the prometheus_federate role --- roles/prometheus_federate/tasks/main.yml | 16 ++++++---------- .../{prometheus => }/alert.rules.yml.j2 | 0 .../templates/{prometheus => }/prometheus.yml.j2 | 0 3 files changed, 6 insertions(+), 10 deletions(-) rename roles/prometheus_federate/templates/{prometheus => }/alert.rules.yml.j2 (100%) rename roles/prometheus_federate/templates/{prometheus => }/prometheus.yml.j2 (100%) diff --git a/roles/prometheus_federate/tasks/main.yml b/roles/prometheus_federate/tasks/main.yml index 73ae803..c5d81bf 100644 --- a/roles/prometheus_federate/tasks/main.yml +++ b/roles/prometheus_federate/tasks/main.yml @@ -10,19 +10,15 @@ - name: Configure Prometheus template: - src: prometheus/prometheus.yml.j2 - dest: /etc/prometheus/prometheus.yml - mode: 0644 - notify: Restart Prometheus - -- name: Configure Prometheus alert rules - template: - src: "prometheus/{{ item }}.j2" + src: "{{ item }}.j2" dest: "/etc/prometheus/{{ item }}" - mode: 0644 - notify: Restart Prometheus + owner: root + group: root + mode: u=r,g=r,o= loop: + - prometheus.yml - alert.rules.yml + notify: Restart Prometheus # We don't need to restart Prometheus when updating nodes - name: Configure Prometheus Federate devices diff --git a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus_federate/templates/alert.rules.yml.j2 similarity index 100% rename from roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 rename to roles/prometheus_federate/templates/alert.rules.yml.j2 diff --git a/roles/prometheus_federate/templates/prometheus/prometheus.yml.j2 b/roles/prometheus_federate/templates/prometheus.yml.j2 similarity index 100% rename from roles/prometheus_federate/templates/prometheus/prometheus.yml.j2 rename to roles/prometheus_federate/templates/prometheus.yml.j2 From 5d681a95ea4643b96d17efcacc1626c16b141e22 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 17:31:21 +0200 
Subject: [PATCH 09/18] Remove unmanaged hosts from inventory --- hosts | 2 -- 1 file changed, 2 deletions(-) diff --git a/hosts b/hosts index f3ec2af..22ea7aa 100644 --- a/hosts +++ b/hosts @@ -63,7 +63,6 @@ matrix-services.adm.auro.re [ovh_vm] serge.adm.auro.re passbolt.adm.auro.re -vpn-ovh.adm.auro.re docker-ovh.adm.auro.re switchs-manager.adm.auro.re ldap-replica-ovh.adm.auro.re @@ -252,7 +251,6 @@ perceval.adm.auro.re [edc_pve] chapalux.adm.auro.re -escalope.adm.auro.re [edc_vm] routeur-edc.adm.auro.re From f69dfd87994f073caa1696ec85eaceb6ff772992 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 17:41:22 +0200 Subject: [PATCH 10/18] Remove other unmanaged hosts --- hosts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hosts b/hosts index 22ea7aa..79a2d38 100644 --- a/hosts +++ b/hosts @@ -29,7 +29,6 @@ stream.adm.auro.re re2o-server.adm.auro.re re2o-ldap.adm.auro.re re2o-db.adm.auro.re -backup.adm.auro.re mail.adm.auro.re wikijs.adm.auro.re prometheus-aurore.adm.auro.re @@ -39,7 +38,6 @@ log.adm.auro.re bdd.adm.auro.re bdd-ovh.adm.auro.re litl.adm.auro.re -services-bdd-local.adm.auro.re log.adm.auro.re [aurore_testing_vm] @@ -53,7 +51,6 @@ horus.adm.auro.re [ovh_container] synapse.adm.auro.re -#services-bdd.adm.auro.re phabricator.adm.auro.re wiki.adm.auro.re www.adm.auro.re @@ -514,5 +511,4 @@ proxy.adm.auro.re [bdd] bdd.adm.auro.re bdd-ovh.adm.auro.re -services-bdd-local.adm.auro.re re2o-db.adm.auro.re From 083fc4da9acaaf102cdd79f4f0dd8e9600630936 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 18:15:09 +0200 Subject: [PATCH 11/18] Fix permissions on prometheus.yml --- roles/prometheus_federate/tasks/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/prometheus_federate/tasks/main.yml b/roles/prometheus_federate/tasks/main.yml index c5d81bf..02ae85e 100644 --- a/roles/prometheus_federate/tasks/main.yml +++ b/roles/prometheus_federate/tasks/main.yml @@ -12,8 +12,8 @@ template: src: "{{ item }}.j2" 
dest: "/etc/prometheus/{{ item }}" - owner: root - group: root + owner: prometheus + group: prometheus mode: u=r,g=r,o= loop: - prometheus.yml From 11335a6077c01e1fcb5d282aefc00efefccc1569 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 18:15:22 +0200 Subject: [PATCH 12/18] Fix typo in alert description --- roles/prometheus/templates/alert.rules.yml.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 index 275f0a1..3528823 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/alert.rules.yml.j2 @@ -41,8 +41,8 @@ groups: severity: warning annotations: summary: >- - Espace libre de {{ raw('$labels.mountpoint') }} sur - à {{ raw('$value | printf "%.1f"') }}% + Espace libre de {{ raw('$labels.mountpoint') }} à + {{ raw('$value | printf "%.1f"') }}% - alert: OutOfInodes expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 From 35286a661ab169647c83e039d5b1cefead6b1f9e Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 18:24:03 +0200 Subject: [PATCH 13/18] Change an alert description --- roles/prometheus/templates/alert.rules.yml.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 index 3528823..c5ba1c2 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/alert.rules.yml.j2 @@ -41,8 +41,8 @@ groups: severity: warning annotations: summary: >- - Espace libre de {{ raw('$labels.mountpoint') }} à - {{ raw('$value | printf "%.1f"') }}% + {{ raw('$value | printf "%.1f"') }}% d'espace libre pour + {{ raw('$labels.mountpoint') }} - alert: OutOfInodes expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 From 83f5b35e59ea427fe146eb739f6a0c1b96ac0ee8 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 18:24:21 +0200 Subject: 
[PATCH 14/18] Fix a filename typo --- roles/prometheus/tasks/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 3a590c9..4dc518b 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -11,8 +11,8 @@ - name: Configure Prometheus template: - src: prometheus.yml.j2 - dest: /etc/prometheus/prometheus.yml + src: "{{ item }}.j2" + dest: "/etc/prometheus/{{ item }}" owner: prometheus group: prometheus mode: u=r,g=r,o= From 06f101527df401610b73879670371e8876909610 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Fri, 2 Apr 2021 13:57:34 +0200 Subject: [PATCH 15/18] Use a dynamic interval for UPS output voltage alerts --- roles/prometheus/templates/alert.rules.yml.j2 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 index c5ba1c2..e6f10ce 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/alert.rules.yml.j2 @@ -132,7 +132,9 @@ groups: Tension d'entrée de {{ raw('$value') }}V - alert: UpsWrongOutputVoltage - expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) + expr: >- + abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d])) + > 3 * stddev_over_time(upsOutputVoltage[1d]) for: 10m labels: severity: warning From f80435cb314b67a1740861e41f3da0955ff2e14e Mon Sep 17 00:00:00 2001 From: Jeltz Date: Fri, 2 Apr 2021 21:54:38 +0200 Subject: [PATCH 16/18] Differentiate alerts for servers and Wi-Fi APs --- roles/prometheus/templates/alert.rules.yml.j2 | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 index e6f10ce..97d5e22 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/alert.rules.yml.j2 @@ -10,7 +10,7 @@ groups: rules: - alert: 
InstanceDown - expr: up == 0 + expr: up{instance!~".*\\.borne\\.auro\\.re"} == 0 for: 3m labels: severity: critical @@ -18,6 +18,15 @@ groups: summary: >- Invisible depuis plus de 3 minutes + - alert: AccessPointDown + expr: up{instance=~".*\\.borne\\.auro\\.re"} == 0 + for: 3m + labels: + severity: warning + annotations: + summary: >- + Invisible depuis plus de 3 minutes + - alert: OutOfMemory expr: >- ( From 1c3127dbbe2dcbba663cba7e3e77776253d1f135 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Fri, 2 Apr 2021 22:55:51 +0200 Subject: [PATCH 17/18] Add more node-exporter alerts Source: https://awesome-prometheus-alerts.grep.to/rules.html --- roles/prometheus/templates/alert.rules.yml.j2 | 67 ++++++++++++++++--- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 index 97d5e22..7097e47 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/alert.rules.yml.j2 @@ -6,6 +6,7 @@ {%- endmacro %} groups: + - name: alert.rules rules: @@ -42,6 +43,55 @@ groups: {{ raw('$value | printf "%.1f"') }}% de mémoire libre + - alert: HostSwapIsFillingUp + expr: >- + ( + 1 - ( + node_memory_SwapFree_bytes + / node_memory_SwapTotal_bytes + ) + ) * 100 > 10 + for: 3m + labels: + severity: warning + annotations: + summary: >- + La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}% + + - alert: HostPhysicalComponentTooHot + expr: node_hwmon_temp_celsius > 75 + for: 3m + labels: + severity: critical + annotations: + summary: La température de l'hôte est de {{ raw('$value') }}°C + + - alert: HostNodeOvertemperatureAlarm + expr: node_hwmon_temp_crit_alarm_celsius == 1 + for: 0m + labels: + severity: critical + annotations: + summary: L'alarme de température de l'hôte est active + + - alert: HostOomKillDetected + expr: increase(node_vmstat_oom_kill[1m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Le noyau a tué {{ raw('$value') 
}} processus (OOM killer) + + - alert: HostEdacCorrectableErrorsDetected + expr: increase(node_edac_correctable_errors_total[1m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: >- + {{ raw('$value | printf "%.1f"') }} erreur(s) ont été + corrigée(s) (EDAC) + - alert: OutOfDiskSpace expr: >- node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10 @@ -97,7 +147,7 @@ groups: - alert: UpsOutputSourceChanged expr: upsOutputSource != 3 - for: 1m + for: 0m labels: severity: critical annotations: @@ -106,7 +156,7 @@ groups: - alert: UpsBatteryStatus expr: upsBatteryStatus == 3 - for: 2m + for: 0m labels: severity: warning annotations: @@ -115,7 +165,7 @@ groups: - alert: UpsBatteryStatus expr: upsBatteryStatus == 4 - for: 10m + for: 0m labels: severity: critical annotations: @@ -124,7 +174,7 @@ groups: - alert: UpsHighLoad expr: upsOutputPercentLoad > 70 - for: 5m + for: 3m labels: severity: critical annotations: @@ -133,7 +183,7 @@ groups: - alert: UpsWrongInputVoltage expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) - for: 10m + for: 5m labels: severity: warning annotations: @@ -144,7 +194,7 @@ groups: expr: >- abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d])) > 3 * stddev_over_time(upsOutputVoltage[1d]) - for: 10m + for: 5m labels: severity: warning annotations: @@ -153,7 +203,7 @@ groups: - alert: UpsTimeRemaining expr: upsEstimatedMinutesRemaining < 8 - for: 1m + for: 0m labels: severity: warning annotations: @@ -162,10 +212,11 @@ groups: - alert: UpsTimeRemaining expr: upsEstimatedMinutesRemaining < 5 - for: 1m + for: 0m labels: severity: critical annotations: summary: >- Autonomie restante de {{ raw('$value') }} min + ... 
From 91817b324cbc64c752c7a89ec7344ba96816aeab Mon Sep 17 00:00:00 2001 From: Jeltz Date: Sat, 3 Apr 2021 08:04:10 +0200 Subject: [PATCH 18/18] Increase the alert threshold for temperatures --- roles/prometheus/templates/alert.rules.yml.j2 | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 index 7097e47..84d8aa2 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/alert.rules.yml.j2 @@ -59,12 +59,14 @@ groups: La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}% - alert: HostPhysicalComponentTooHot - expr: node_hwmon_temp_celsius > 75 + expr: node_hwmon_temp_celsius > 79 for: 3m labels: severity: critical annotations: - summary: La température de l'hôte est de {{ raw('$value') }}°C + summary: >- + La température de l'hôte est de {{ raw('$value') }}°C + ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }}) - alert: HostNodeOvertemperatureAlarm expr: node_hwmon_temp_crit_alarm_celsius == 1 @@ -72,7 +74,9 @@ groups: labels: severity: critical annotations: - summary: L'alarme de température de l'hôte est active + summary: >- + L'alarme de température de l'hôte est active + ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }}) - alert: HostOomKillDetected expr: increase(node_vmstat_oom_kill[1m]) > 0