---
{{ ansible_managed | comment }}

{# Emits `string` wrapped in literal double curly braces, padded with one
   space on each side, so the braces survive Jinja rendering untouched. #}
{% macro raw(string) -%}
{{ '{{ ' ~ string ~ ' }}' }}
{%- endmacro %}

groups:

  - name: server.rules
    rules:

      # Any scrape target outside the borne.auro.re suffix whose `up` series
      # has been 0 for 3 minutes. The domain dots are written as `[.]` so they
      # match a literal dot only; a bare `.` in a regex matches any character.
      - alert: InstanceDown
        expr: up{instance!~".*[.]borne[.]auro[.]re$"} == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: >-
            Invisible depuis plus de 3 minutes

      # Same check restricted to targets under the borne.auro.re suffix,
      # reported at a lower severity. The matcher is the exact complement of
      # the one used by InstanceDown, so each target falls under one rule only.
      - alert: AccessPointDown
        expr: up{instance=~".*[.]borne[.]auro[.]re$"} == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: >-
            Invisible depuis plus de 3 minutes

      # Free + cached + buffer memory, expressed as a percentage of total
      # memory, has stayed below 10 for 5 minutes.
      - alert: OutOfMemory
        # Literal block scalar: the line breaks below are part of the
        # expression text.
        expr: |-
          (
            node_memory_MemFree_bytes
            + node_memory_Cached_bytes
            + node_memory_Buffers_bytes
          ) / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: >-
            {{ raw('$value | printf "%.1f"') }}% de mémoire libre

      # Used swap (1 - free / total), as a percentage, has been at or above 20
      # for 3 minutes.
      - alert: HostSwapIsFillingUp
        # Literal block scalar: the line breaks below are part of the
        # expression text.
        expr: |-
          (
            1 - (
              node_memory_SwapFree_bytes
              / node_memory_SwapTotal_bytes
            )
          ) * 100 >= 20
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: >-
            La mémoire swap est utilisée à
            {{ raw('$value | printf "%.1f"') }}%

      # node_hwmon_temp_celsius has been above 79 for 3 minutes. The summary
      # reports the measured value together with the `chip` and `sensor`
      # labels of the offending series.
      - alert: HostPhysicalComponentTooHot
        expr: node_hwmon_temp_celsius > 79
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: >-
            La température de l'hôte est de
            {{ raw('$value') }}°C ({{ raw('$labels.chip') }},
            {{ raw('$labels.sensor') }})

      # Fires immediately (`for: 0m`) when the critical-temperature alarm
      # series equals 1. The summary names the `chip` and `sensor` labels.
      - alert: HostNodeOvertemperatureAlarm
        expr: node_hwmon_temp_crit_alarm_celsius == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            L'alarme de température de l'hôte est active
            ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})

      # Fires immediately (`for: 0m`) when node_vmstat_oom_kill has increased
      # over the last minute.
      - alert: HostOomKillDetected
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            Le noyau a tué {{ raw('$value') }} processus (OOM killer)

      # Fires immediately (`for: 0m`) when the EDAC correctable-error counter
      # has increased over the last minute.
      - alert: HostEdacCorrectableErrorsDetected
        expr: increase(node_edac_correctable_errors_total[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          # `printf` applies the format string; `print` would only concatenate
          # its arguments and leave the format verb in the message.
          summary: >-
            {{ raw('$value | printf "%.1f"') }} erreur(s) ont été
            corrigée(s) (EDAC)

      # Free bytes as a percentage of filesystem size have stayed below 10 for
      # 5 minutes. The summary names the affected `mountpoint` label.
      - alert: OutOfDiskSpace
        expr: node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: >-
            {{ raw('$value | printf "%.1f"') }}% d'espace libre
            pour {{ raw('$labels.mountpoint') }}

      # Free inodes as a percentage of all inodes have stayed below 10 for
      # 5 minutes. The summary names the affected `mountpoint` label.
      - alert: OutOfInodes
        expr: node_filesystem_files_free / node_filesystem_files * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: >-
            {{ raw('$value | printf "%.1f"') }}% d'inodes restants pour
            {{ raw('$labels.mountpoint') }}

      # 100 minus the per-instance average idle rate (as a percentage) has
      # been above 75 for 10 minutes.
      - alert: CpuUsage
        # Literal block scalar: the line breaks below are part of the
        # expression text.
        expr: |-
          (
            100 - avg by (instance) (
              irate(node_cpu_seconds_total{mode="idle"}[5m])
            ) * 100
          ) > 75
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: >-
            CPU à {{ raw('$value | printf "%.1f"') }}%

      # A systemd unit has reported state "failed" for 10 minutes. The summary
      # carries the unit's `name` label.
      - alert: SystemdServiceFailed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "{{ raw('$labels.name') }} a échoué"

      # node_load1 has been above 5 for 2 minutes.
      - alert: LoadUsage
        expr: node_load1 > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          # `>-` strips the final line break, as in the other rules, so the
          # annotation does not end with a newline.
          summary: >-
            Charge à {{ raw('$value') }}
      
      # smartmon_device_smart_healthy has been below 1 for 10 minutes. The
      # summary names the affected `disk` label.
      - alert: UnhealthyDisk
        expr: smartmon_device_smart_healthy < 1
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: >-
            Le Disque {{ raw('$labels.disk') }} n'est pas en bonne santé !
...