---
{{ ansible_managed | comment }}

{#
  Prometheus annotations use the same double-curly delimiters as Jinja.
  This macro emits its argument wrapped in literal delimiters, so the
  expression survives Jinja rendering and is left for Prometheus to expand.
#}
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}

groups:
  - name: server.rules
    rules:
      # Scrape target outside the access-point subdomain has up == 0.
      # Dots are escaped so they only match a literal dot; the backslash is
      # doubled because PromQL double-quoted strings use Go escaping rules.
      # Label regex matchers are fully anchored, so no trailing "$" is needed.
      - alert: MachineDown
        expr: up{instance!~".*\\.borne\\.auro\\.re"} == 0
        for: 3m
        labels:
          severity: critical

      # Same check, restricted to the access-point subdomain, lower severity.
      - alert: AccessPointDown
        expr: up{instance=~".*\\.borne\\.auro\\.re"} == 0
        for: 3m
        labels:
          severity: warning

      # Free + cached + buffer memory is below 10 % of total memory.
      - alert: OutOfMemory
        expr: >-
          (
            node_memory_MemFree_bytes
            + node_memory_Cached_bytes
            + node_memory_Buffers_bytes
          )
          / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: >-
            {{ raw('$value | printf "%.1f"') }}% de mémoire libre

      # Used swap (1 - free / total) is at or above 50 %.
      - alert: HostSwapIsFillingUp
        expr: >-
          (
            1 - (
              node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes
            )
          ) * 100 >= 50
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: >-
            La mémoire swap est utilisée à
            {{ raw('$value | printf "%.1f"') }}%

      # A hwmon temperature reading is above 79.
      - alert: HostPhysicalComponentTooHot
        expr: node_hwmon_temp_celsius > 79
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: >-
            La température de l'hôte est de {{ raw('$value') }}°C
            ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})

      # The hwmon critical-temperature alarm metric equals 1; fires at once.
      - alert: HostNodeOvertemperatureAlarm
        expr: node_hwmon_temp_crit_alarm_celsius == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            L'alarme de température de l'hôte est active
            ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})

      # The OOM-kill counter increased during the last minute.
      - alert: HostOomKillDetected
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            Le noyau a tué {{ raw('$value') }} processus (OOM killer)

      # The EDAC correctable-error counter increased during the last minute.
      # The value is formatted with printf: Go's print would not apply the
      # format verb and would emit it verbatim in front of the number.
      - alert: HostEdacCorrectableErrorsDetected
        expr: increase(node_edac_correctable_errors_total[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            {{ raw('$value | printf "%.1f"') }} erreur(s) ont été
            corrigée(s) (EDAC)

      # Free bytes are below 10 % of the filesystem size.
      - alert: OutOfDiskSpace
        expr: >-
          node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: >-
            {{ raw('$value | printf "%.1f"') }}% d'espace libre pour
            {{ raw('$labels.mountpoint') }}

      # Free inodes are below 10 % of the filesystem's inode count.
      - alert: OutOfInodes
        expr: node_filesystem_files_free / node_filesystem_files * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: >-
            {{ raw('$value | printf "%.1f"') }}% d'inodes restants pour
            {{ raw('$labels.mountpoint') }}

      # 100 minus the per-instance average idle rate (as a percentage)
      # stays above 75.
      - alert: CpuUsage
        expr: >-
          (
            100 - avg by (instance) (
              irate(node_cpu_seconds_total{mode="idle"}[5m])
            ) * 100
          ) > 75
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: >-
            CPU à {{ raw('$value | printf "%.1f"') }}%

      # A systemd unit reports the "failed" state.
      - alert: SystemdServiceFailed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: >-
            {{ raw('$labels.name') }} a échoué

      # The 1-minute load average is above 5.
      - alert: LoadUsage
        expr: node_load1 > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: >-
            Charge à {{ raw('$value') }}

      # The SMART health metric is below 1 for the disk.
      - alert: UnhealthyDisk
        expr: smartmon_device_smart_healthy < 1
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: >-
            Le Disque {{ raw('$labels.disk') }} n'est pas en bonne santé !
...