---
# Variables for a Prometheus deployment: Alertmanager endpoints, TSDB
# retention, scrape jobs and alert rules.
# NOTE(review): presumably consumed by a `prometheus` Ansible role that is
# not visible in this file -- confirm the expected schema there.

prometheus__alertmanager_targets:
  - docker-ovh.adm.auro.re:9093

prometheus__tsdb_retention_time: 90d

# One entry per scrape job. `targets` is built from inventory groups.
# `address` is either a mapping holding only a `port`, or a fixed
# "host:port" string (see `quanta`).
prometheus__scraping:
  node:
    targets: "{{ groups.vm_network + groups.pve_network }}"
    address:
      port: 9100
  prometheus:
    targets: "{{ groups.prom }}"
    address:
      port: 9090
  kresd:
    targets: "{{ groups.dns }}"
    address:
      port: 8453
  bird:
    targets: "{{ groups.router }}"
    address:
      port: 9324
  quanta:
    targets: "{{ groups.quanta }}"
    # Fixed address with a /snmp path and a `module` parameter.
    # NOTE(review): looks like scraping through a local SNMP exporter --
    # confirm.
    address: "127.0.0.1:9116"
    path: /snmp
    params:
      module:
        - quanta
  snmp:
    targets: "{{ groups.prom }}"
    address:
      port: 9116

# Alert rules.
# NOTE(review): `interp` and `interp_float` are filters defined outside this
# file; presumably they emit Prometheus template placeholders for the quoted
# `$labels.*` / `$value` names so that Ansible's Jinja does not evaluate
# them -- confirm against the filter plugin.

prometheus__alert_rules_prometheus:
  - alert: PrometheusTsdbCompactionFailed
    expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
    for: 0m
    labels:
      severity: critical

prometheus__alert_rules_node:
  - alert: MachineDown
    expr: "up == 0"
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "Collecteur {{ '$labels.job' | interp }}"

  - alert: OutOfMemory
    expr: >-
      ( node_memory_MemFree_bytes + node_memory_Cached_bytes
      + node_memory_Buffers_bytes ) / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Mémoire libre à {{ '$value' | interp_float }}%"

  - alert: HostSwapIsFillingUp
    expr: >-
      ( 1 - ( node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes ) )
      * 100 >= 50
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "Swap {{ '$value' | interp_float }}%"

  - alert: HostPhysicalComponentTooHot
    expr: "node_hwmon_temp_celsius > 79"
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: >-
        {{ '$value' | interp_float }}°C :
        {{ '$labels.chip' | interp }}, {{ '$labels.sensor' | interp }}

  - alert: HostNodeOvertemperatureAlarm
    expr: "node_hwmon_temp_crit_alarm_celsius == 1"
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "{{ '$labels.chip' | interp }}, {{ '$labels.sensor' | interp }}"

  - alert: HostRaidArrayGotInactive
    expr: 'node_md_state{state="inactive"} > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "{{ '$labels.device' | interp }}"

  - alert: HostRaidDiskFailure
    expr: 'node_md_disks{state="failed"} > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      # Fixed: this annotation was keyed `severity`; every other rule in
      # this file carries its message under `summary`.
      # NOTE(review): the sibling RAID rule reads `$labels.device`; confirm
      # that `md_device` is a label actually present on node_md_disks.
      summary: "{{ '$labels.md_device' | interp }}"

  - alert: HostOomKillDetected
    expr: "increase(node_vmstat_oom_kill[1m]) > 0"
    for: 0m
    labels:
      severity: warning
    annotations:
      # Fixed: `$value` is the increase of the OOM-kill counter over 1m,
      # not a PID, so it is reported as a count.
      summary: "{{ '$value' | interp }} processus tués (OOM)"

  - alert: HostEdacCorrectableErrorsDetected
    expr: "increase(node_edac_correctable_errors_total[1m]) > 0"
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "{{ '$value' | interp }} erreurs corrigées"

  - alert: HostEdacUncorrectableErrorsDetected
    expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0"
    for: 0m
    labels:
      severity: warning
    annotations:
      # Fixed: the text was copied from the correctable-errors rule and
      # claimed these errors had been corrected.
      summary: "{{ '$value' | interp }} erreurs non corrigées"

  - alert: OutOfDiskSpace
    # Read-only filesystems are excluded by the `and on (...)` clause.
    expr: >-
      ( node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10 )
      and on (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: >-
        {{ '$labels.mountpoint' | interp }} :
        {{ '$value' | interp_float }}% libre

  - alert: HostConntrackLimit
    expr: >-
      ( node_nf_conntrack_entries / node_nf_conntrack_entries_limit )
      * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ '$value' | interp_float }}% complet"

  - alert: HostClockSkew
    # Fires only while the offset exceeds 50 ms and is not shrinking.
    expr: >-
      (node_timex_offset_seconds > 0.05
      and deriv(node_timex_offset_seconds[5m]) >= 0)
      or (node_timex_offset_seconds < -0.05
      and deriv(node_timex_offset_seconds[5m]) <= 0)
    for: 2m
    labels:
      severity: warning

  - alert: HostClockNotSynchronising
    expr: >-
      min_over_time(node_timex_sync_status[1m]) == 0
      and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning

  - alert: HostRequiresReboot
    expr: "node_reboot_required > 0"
    for: 5m
    labels:
      severity: warning

  - alert: OutOfInodes
    expr: "node_filesystem_files_free / node_filesystem_files * 100 < 10"
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: >-
        {{ '$labels.mountpoint' | interp }} :
        {{ '$value' | interp_float }}% libre

  - alert: CpuUsage
    expr: >-
      ( 100 - avg by (instance) (
      irate(node_cpu_seconds_total{mode="idle"}[5m])
      ) * 100 ) > 75
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "{{ '$value' | interp_float }}%"

  - alert: SystemdServiceFailed
    expr: 'node_systemd_unit_state{state="failed"} == 1'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "{{ '$labels.name' | interp }}"

  - alert: LoadUsage
    expr: "node_load1 > 5"
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ '$value' | interp_float }}"

  - alert: UnhealthyDisk
    expr: "smartmon_device_smart_healthy < 1"
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "{{ '$labels.disk' | interp }}"

  - alert: HostCpuStealNoisyNeighbor
    expr: >-
      avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m]))
      * 100 > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      # Fixed: the summary rendered `$labels.disk`, but `avg by(instance)`
      # keeps only the `instance` label. Report the steal percentage, as
      # CpuUsage does.
      summary: "{{ '$value' | interp_float }}%"

prometheus__alert_rules_keepalived:
  - alert: KeepalivedVrrpFault
    expr: 'keepalived_vrrp_state{state="fault"} > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "{{ '$labels.instance' | interp }}"

  - alert: KeepalivedMasterChange
    # NOTE(review): this matcher uses the label `keepalived_vvrp_state`
    # (sic), while KeepalivedVrrpFault filters the same metric on `state`.
    # If that label does not exist the selector matches nothing and this
    # alert can never fire -- confirm against the exporter's labels.
    expr: 'changes( keepalived_vrrp_state {keepalived_vvrp_state="master"}[1m]) > 1'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "{{ '$labels.instance' | interp }}"

prometheus__alert_rules_bird:
  - alert: BirdProtocolDown
    expr: "bird_protocol_up == 0"
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "{{ '$labels.name' | interp }} : {{ '$labels.state' | interp }}"

prometheus__alert_rules_quanta:
  - alert: QuantaQueueOverflow
    expr: "snAgGblQueueOverflow == 1"
    for: 0m
    labels:
      severity: critical

# Rule groups that are enabled, keyed by group name.
prometheus__alert_rules:
  prometheus: "{{ prometheus__alert_rules_prometheus }}"
  node: "{{ prometheus__alert_rules_node }}"
  keepalived: "{{ prometheus__alert_rules_keepalived }}"
  quanta: "{{ prometheus__alert_rules_quanta }}"
  # NOTE(review): the bird rule group is defined above but left disabled
  # here; the reason is not visible in this file.
  # bird: "{{ prometheus__alert_rules_bird }}"
...