diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml
index 670847b..d501c14 100644
--- a/roles/prometheus/handlers/main.yml
+++ b/roles/prometheus/handlers/main.yml
@@ -8,3 +8,4 @@
   service:
     name: prometheus-snmp-exporter
     state: restarted
+...
diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
index bf4127b..1de7e24 100644
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -1,138 +1,161 @@
-# {{ ansible_managed }}
-{# As this is also Jinja2 it will conflict without a raw block #}
-{# Depending of Prometheus Node exporter version, rules can change depending of version #}
-{% raw %}
+---
+{{ ansible_managed | comment }}
+{# raw(expr): emit expr wrapped in literal double curly braces, so Prometheus (not Jinja2) evaluates it #}
+{% macro raw(string) -%}
+{{ '{{' }} {{ string }} {{ '}}' }}
+{%- endmacro %}
+{# instance: summary prefix made of the Prometheus instance label template between square brackets #}
+{% set instance = '[' ~ raw('$labels.instance') ~ ']' %}
+
 groups:
-- name: alert.rules
-  rules:
+  - name: alert.rules
+    rules:
 
-  # Alert for any instance that is unreachable for >3 minutes.
-  - alert: InstanceDown
-    expr: up == 0
-    for: 3m
-    labels:
-      severity: critical
-    annotations:
-      summary: "{{ $labels.instance }} est invisible depuis plus de 3 minutes !"
+      - alert: InstanceDown  # target down (up == 0) for 3 minutes
+        expr: up == 0
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            {{ instance }} Invisible depuis plus de 3 minutes
 
-  # Alert for out of memory
-  - alert: OutOfMemory
-    expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Mémoire libre de {{ $labels.instance }} à {{ humanize $value }}%."
+      - alert: OutOfMemory  # free + cached + buffers below 10% of total
+        expr: >-
+          (
+            node_memory_MemFree_bytes
+            + node_memory_Cached_bytes
+            + node_memory_Buffers_bytes
+          ) / node_memory_MemTotal_bytes * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ instance }} {{ raw('$value | printf "%.0f"') }}% de mémoire libre
 
-  # Alert for out of disk space
-  - alert: OutOfDiskSpace
-    expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ humanize $value }}%."
+      - alert: OutOfDiskSpace  # free bytes below 10% of filesystem size
+        expr: >-
+          node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ instance }} Espace libre de {{ raw("$labels.mountpoint") }}
+            à {{ raw('$value | printf "%.0f"') }}%
 
-  # Alert for out of inode space on disk
-  - alert: OutOfInodes
-    expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."
+      - alert: OutOfInodes  # free inodes below 10% of the filesystem's inodes
+        expr: node_filesystem_files_free / node_filesystem_files * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ instance }} {{ raw('$value | printf "%.0f"') }}% d'inodes restants
+            pour {{ raw("$labels.mountpoint") }}
 
-  # Alert for high CPU usage
-  - alert: CpuUsage
-    expr: (100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 75
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "CPU sur {{ $labels.instance }} à {{ humanize $value }}%."
+      - alert: CpuUsage  # non-idle CPU above 75%, averaged per instance
+        expr: >-
+          (
+            100 - avg by (instance) (
+              irate(node_cpu_seconds_total{mode="idle"}[5m])
+            ) * 100
+          ) > 75
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ instance }} CPU à {{ raw('$value | printf "%.0f"') }}%
 
-  # Check systemd unit (> buster)
-  - alert: SystemdServiceFailed
-    expr: node_systemd_unit_state{state="failed"} == 1
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
+      - alert: SystemdServiceFailed  # a systemd unit reports the failed state
+        expr: node_systemd_unit_state{state="failed"} == 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ instance }} {{ raw("$labels.name") }} a échoué
 
-  # Check load of instance
-  - alert: LoadUsage
-    expr: node_load1 > 5
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: "La charge de {{ $labels.instance }} est à {{ $value }} !"
-
-  # Check UPS
-  - alert: UpsOutputSourceChanged
-    expr: upsOutputSource != 3
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
+      - alert: LoadUsage  # 1-minute load average above 5
+        expr: node_load1 > 5
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ instance }} Charge à {{ raw("$value") }}
+
+      - alert: UpsOutputSourceChanged  # presumably 3 = normal in UPS-MIB; confirm
+        expr: upsOutputSource != 3
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            {{ instance }} Source d'alimentation changée
 
-  - alert: UpsBatteryStatusWarning
-    expr: upsBatteryStatus == 3
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: "L'état de la batterie de {{ $labels.instance }} est faible !"
+      - alert: UpsBatteryStatusWarning  # presumably 3 = batteryLow; confirm
+        expr: upsBatteryStatus == 3
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ instance }} État de la batterie faible
 
-  - alert: UpsBatteryStatusCritical
-    expr: upsBatteryStatus == 4
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !"
+      - alert: UpsBatteryStatusCritical  # presumably 4 = batteryDepleted; confirm
+        expr: upsBatteryStatus == 4
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            {{ instance }} État de la batterie critique
 
-  - alert: UpsHighLoad
-    expr: upsOutputPercentLoad > 70
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
+      - alert: UpsHighLoad  # UPS output load above 70%
+        expr: upsOutputPercentLoad > 70
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            {{ instance }} Charge de {{ raw('$value | printf "%.0f"') }}%
 
-  - alert: UpsWrongInputVoltage
-    expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
+      - alert: UpsWrongInputVoltage  # input voltage outside the 210-250 range
+        expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ instance }} Tension d'entrée de {{ raw("$value") }}V
 
-  - alert: UpsWrongOutputVoltage
-    expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
+      - alert: UpsWrongOutputVoltage  # output voltage outside the 220-240 range
+        expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ instance }} Tension de sortie de {{ raw("$value") }}V
 
-  - alert: UpsTimeRemainingWarning
-    expr: upsEstimatedMinutesRemaining < 8
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
+      - alert: UpsTimeRemainingWarning  # under 8 estimated minutes remaining
+        expr: upsEstimatedMinutesRemaining < 8
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ instance }} Autonomie restante de {{ raw("$value") }} min
 
-  - alert: UpsTimeRemainingCritical
-    expr: upsEstimatedMinutesRemaining < 5
-    for: 1m
-    labels:
-      severity: critical
-    annotations:
-      summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
-
-
-{% endraw %}
+      - alert: UpsTimeRemainingCritical  # under 5 estimated minutes remaining
+        expr: upsEstimatedMinutesRemaining < 5
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            {{ instance }} Autonomie restante de {{ raw("$value") }} min
+...
diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2
index 7399f48..e97e986 100644
--- a/roles/prometheus/templates/prometheus/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2
@@ -1,4 +1,5 @@
-# {{ ansible_managed }}
+---
+{{ ansible_managed | comment }}
 
 global:
   # scrape_interval is set to the global default (60s)
@@ -100,3 +101,4 @@ scrape_configs:
     file_sd_configs:
       - files:
           - '/etc/prometheus/targets_docker.json'
+...