Patch summary (free text ahead of the first "diff --git" header; patch tools
skip it):
  * Removes the 15 alert rules that were wrapped in a Jinja2 raw/endraw block
    and adds a single FederateInstanceDown rule on up{job="federate"}.
  * The new template escapes the Prometheus template braces inline with
    {{ "{{" }} / {{ "}}" }} instead of using a raw block, and renders the
    managed-file banner through the "comment" filter.
  * Line counts match the hunk header: 137 removed + 1 context = 138 old
    lines, 15 added + 1 context = 16 new lines.
NOTE(review): the indentation inside the hunk was rebuilt from a
whitespace-collapsed copy of this patch. Confirm the removed lines against
blob f78df48 before applying.

diff --git a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2
index f78df48..d30511f 100644
--- a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2
@@ -1,138 +1,16 @@
-# {{ ansible_managed }}
-{# As this is also Jinja2 it will conflict without a raw block #}
-{# Depending of Prometheus Node exporter version, rules can change depending of version #}
-{% raw %}
+---
+{{ ansible_managed | comment }}
+
 groups:
-- name: alert.rules
-  rules:
-
-  # Alert for any instance that is unreachable for >3 minutes.
-  - alert: InstanceDown
-    expr: up == 0
-    for: 3m
-    labels:
-      severity: critical
-    annotations:
-      summary: "Federate : {{ $labels.exported_instance }} est invisible depuis plus de 3 minutes !"
-
-  # Alert for out of memory
-  - alert: OutOfMemory
-    expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Federate : Mémoire libre de {{ $labels.exported_instance }} à {{ humanize $value }}%."
-
-  # Alert for out of disk space
-  - alert: OutOfDiskSpace
-    expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.exported_instance }} à {{ humanize $value }}%."
-
-  # Alert for out of inode space on disk
-  - alert: OutOfInodes
-    expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Federate : Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.exported_instance }}."
-
-  # Alert for high CPU usage
-  - alert: CpuUsage
-    expr: (100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 75
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Federate : CPU sur {{ $labels.exported_instance }} à {{ humanize $value }}%."
-
-  # Check systemd unit (> buster)
-  - alert: SystemdServiceFailed
-    expr: node_systemd_unit_state{state="failed"} == 1
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Federate : {{ $labels.name }} a échoué sur {{ $labels.exported_instance }}"
-
-  # Check load of instance
-  - alert: LoadUsage
-    expr: node_load1 > 5
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Federate : la charge de {{ $labels.exported_instance }} est à {{ $value }} !"
-
-  # Check UPS
-  - alert: UpsOutputSourceChanged
-    expr: upsOutputSource != 3
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Federate : La source d'alimentation de {{ $labels.exported_instance }} a changé !"
-
-  - alert: UpsBatteryStatusWarning
-    expr: upsBatteryStatus == 3
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Federate : L'état de la batterie de {{ $labels.exported_instance }} est faible !"
-
-  - alert: UpsBatteryStatusCritical
-    expr: upsBatteryStatus == 4
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "L'état de la batterie de {{ $labels.exported_instance }} est affaibli !"
-
-  - alert: UpsHighLoad
-    expr: upsOutputPercentLoad > 70
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      summary: "Federate : La charge de {{ $labels.exported_instance }} est de {{ $value }}% !"
-
-  - alert: UpsWrongInputVoltage
-    expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Federate : La tension d'entrée de {{ $labels.exported_instance }} est de {{ $value }}V."
-
-  - alert: UpsWrongOutputVoltage
-    expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Federate : La tension de sortie de {{ $labels.exported_instance }} est de {{ $value }}V."
-
-  - alert: UpsTimeRemainingWarning
-    expr: upsEstimatedMinutesRemaining < 15
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Federate : L'autonomie restante sur {{ $labels.exported_instance }} est de {{ $value }} min."
-
-  - alert: UpsTimeRemainingCritical
-    expr: upsEstimatedMinutesRemaining < 5
-    for: 1m
-    labels:
-      severity: critical
-    annotations:
-      summary: "Federate : L'autonomie restante sur {{ $labels.exported_instance }} est de {{ $value }} min."
-
-
-{% endraw %}
+  - name: alert.rules
+    rules:
+      - alert: FederateInstanceDown
+        expr: up{job="federate"} == 0
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Federate : {{ "{{" }} $labels.instance {{ "}}" }} est invisible
+            depuis plus de 3 minutes !
+...