179 lines
4.8 KiB
Django/Jinja
179 lines
4.8 KiB
Django/Jinja
---
# Template for a Prometheus alerting-rule file. The line below expands the
# "ansible_managed" variable through the "comment" filter.
{{ ansible_managed | comment }}
|
|
|
|
{#
  raw(string): emit `string` surrounded by literal double curly braces
  (with one space of padding on each side), so that the expression is
  written to the rendered file as text instead of being evaluated by
  Jinja at render time.
#}
{% macro raw(string) -%}
{{ "{{ " ~ string ~ " }}" }}
{%- endmacro %}
|
|
|
|
# Rule group "server.rules"; the alert definitions that follow are the
# entries of its "rules" list.
groups:
  - name: server.rules
    rules:
|
|
|
|
# Critical when a scrape target whose instance label is NOT under
# borne.auro.re has reported up == 0 for 3 minutes.
# The dots of the domain are escaped so that they only match a literal
# dot. Prometheus anchors label regexes at both ends, so no trailing "$"
# is needed.
- alert: InstanceDown
  expr: up{instance!~".*\\.borne\\.auro\\.re"} == 0
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: >-
      Invisible depuis plus de 3 minutes

# Same check for targets whose instance label IS under borne.auro.re,
# reported as a warning only.
# Keep this regex identical to the one in InstanceDown: the two matchers
# are complementary, so every target is covered by exactly one alert.
- alert: AccessPointDown
  expr: up{instance=~".*\\.borne\\.auro\\.re"} == 0
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: >-
      Invisible depuis plus de 3 minutes
|
|
|
|
# Warns when MemFree + Cached + Buffers has stayed below 10 % of MemTotal
# for 5 minutes. The summary prints the measured percentage with one
# decimal.
- alert: OutOfMemory
  expr: >-
    (node_memory_MemFree_bytes
    + node_memory_Cached_bytes
    + node_memory_Buffers_bytes)
    / node_memory_MemTotal_bytes * 100 < 10
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      {{ raw('$value | printf "%.1f"') }}% de mémoire libre
|
|
|
|
# Warns when swap usage, computed as (1 - SwapFree / SwapTotal) * 100,
# has been at or above 50 % for 3 minutes. The summary prints the usage
# with one decimal.
- alert: HostSwapIsFillingUp
  expr: >-
    (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes))
    * 100 >= 50
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: >-
      La mémoire swap est utilisée
      à {{ raw('$value | printf "%.1f"') }}%
|
|
|
|
# Critical when a node_hwmon_temp_celsius series has been above 79 for
# 3 minutes. The summary shows the value and the series' chip and sensor
# labels.
- alert: HostPhysicalComponentTooHot
  expr: node_hwmon_temp_celsius > 79
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: >-
      La température de l'hôte est de
      {{ raw('$value') }}°C ({{ raw('$labels.chip') }},
      {{ raw('$labels.sensor') }})
|
|
|
|
# Critical as soon as a node_hwmon_temp_crit_alarm_celsius series equals 1
# ("for: 0m" means no waiting period). The summary shows the series' chip
# and sensor labels.
- alert: HostNodeOvertemperatureAlarm
  expr: node_hwmon_temp_crit_alarm_celsius == 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      L'alarme de température de l'hôte est active ({{ raw('$labels.chip') }},
      {{ raw('$labels.sensor') }})
|
|
|
|
# Warns immediately ("for: 0m") when node_vmstat_oom_kill has increased
# over the last minute. The summary prints the computed increase.
- alert: HostOomKillDetected
  expr: increase(node_vmstat_oom_kill[1m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: >-
      Le noyau a tué {{ raw('$value') }} processus (OOM killer)
|
|
|
|
# Warns immediately ("for: 0m") when node_edac_correctable_errors_total
# has increased over the last minute.
# The summary must use "printf", not "print": "print" does not interpret
# the "%.1f" format string and would emit it literally next to the value.
- alert: HostEdacCorrectableErrorsDetected
  expr: increase(node_edac_correctable_errors_total[1m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: >-
      {{ raw('$value | printf "%.1f"') }} erreur(s) ont été
      corrigée(s) (EDAC)
|
|
|
|
# Warns when a filesystem's free bytes have stayed below 10 % of its size
# for 5 minutes. The summary prints the percentage and the mountpoint
# label.
- alert: OutOfDiskSpace
  expr: >-
    node_filesystem_free_bytes
    / node_filesystem_size_bytes * 100 < 10
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      {{ raw('$value | printf "%.1f"') }}% d'espace libre
      pour {{ raw('$labels.mountpoint') }}
|
|
|
|
# Warns when a filesystem's free inodes have stayed below 10 % of its
# total inodes for 5 minutes. The summary prints the percentage and the
# mountpoint label.
- alert: OutOfInodes
  expr: >-
    node_filesystem_files_free
    / node_filesystem_files * 100 < 10
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      {{ raw('$value | printf "%.1f"') }}% d'inodes restants
      pour {{ raw('$labels.mountpoint') }}
|
|
|
|
# Warns when CPU usage per instance has exceeded 75 % for 10 minutes.
# Usage is derived as 100 minus the average (by instance) of the idle-mode
# irate over a 5-minute window, scaled to a percentage.
- alert: CpuUsage
  expr: >-
    (100
    - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100
    ) > 75
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: >-
      CPU à
      {{ raw('$value | printf "%.1f"') }}%
|
|
|
|
# Warns when a systemd unit has been reported in the "failed" state for
# 10 minutes. The summary shows the series' name label.
- alert: SystemdServiceFailed
  expr: node_systemd_unit_state{state="failed"} == 1
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "{{ raw('$labels.name') }} a échoué"
|
|
|
|
# Warns when node_load1 has been above 5 for 2 minutes. The summary
# prints the measured load.
# The summary uses ">-" (strip chomping) like the other rules so that the
# annotation does not end with a newline.
- alert: LoadUsage
  expr: node_load1 > 5
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: >-
      Charge à {{ raw('$value') }}
|
|
|
|
# Critical when smartmon_device_smart_healthy has been below 1 for
# 10 minutes. The summary shows the series' disk label.
# The summary uses ">-" (strip chomping) so that the annotation does not
# end with a newline, and the severity is written unquoted like in every
# other rule of this group.
- alert: UnhealthyDisk
  expr: smartmon_device_smart_healthy < 1
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: >-
      Le disque {{ raw('$labels.disk') }} n'est pas en bonne santé
|
|
|
|
# Warns when the per-instance sum of apt_upgrades_pending has been above
# zero for 10 minutes.
# The summary uses ">-" (strip chomping) like the other rules so that the
# annotation does not end with a newline.
- alert: AptUpgradesPending
  expr: sum by (instance) (apt_upgrades_pending) > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: >-
      Des mises à jour de paquets sont en attente
|
|
|
|
# Warns when node_reboot_required has been equal to 1 for 10 minutes.
# The summary uses ">-" (strip chomping) like the other rules so that the
# annotation does not end with a newline.
- alert: RebootRequired
  expr: node_reboot_required == 1
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: >-
      Un redémarrage est nécessaire
|
|
...
|