#!/usr/bin/env ansible-playbook
---
# Monitoring playbook. The first three plays apply the exporter roles
# (prometheus_node, prometheus_keepalived, prometheus_bird); the fourth play
# applies the prometheus role with its scrape jobs and alert rules.

# Apply the prometheus_node role to every host of the two network groups.
- hosts:
    - pve_network
    - vm_network
  vars:
    prometheus_node__text_dir: /var/run/prometheus-node-exporter
  roles:
    - prometheus_node

# Apply the prometheus_keepalived role. The destination file sits inside the
# directory passed to prometheus_node as prometheus_node__text_dir above.
- hosts:
    - edge-1.back.infra.auro.re
    - edge-2.back.infra.auro.re
    - isp-1.back.infra.auro.re
    - isp-2.back.infra.auro.re
    - infra-1.back.infra.auro.re
    - infra-2.back.infra.auro.re
  vars:
    prometheus_keepalived__dest:
      /var/run/prometheus-node-exporter/keepalived.prom
  roles:
    - prometheus_keepalived

# Apply the prometheus_bird role (same host list as the keepalived play).
- hosts:
    - edge-1.back.infra.auro.re
    - edge-2.back.infra.auro.re
    - isp-1.back.infra.auro.re
    - isp-2.back.infra.auro.re
    - infra-1.back.infra.auro.re
    - infra-2.back.infra.auro.re
  roles:
    - prometheus_bird

# Apply the prometheus role to both monitoring servers.
- hosts:
    - prometheus-1.monit.infra.auro.re
    - prometheus-2.monit.infra.auro.re
  vars:
    prometheus__alertmanager_targets:
      - docker-ovh.adm.auro.re:9093
    prometheus__tsdb_retention_time: 90d

    # Scrape jobs, keyed by job name.
    # NOTE(review): `port` is assumed to be nested under `address`; confirm
    # against the prometheus role's expected schema.
    prometheus__scraping:
      node:
        # Union of the members of both inventory groups.
        targets: "{{ ['vm_network', 'pve_network']
          | map('extract', groups)
          | flatten }}"
        address:
          port: 9100
      prometheus:
        targets:
          - prometheus-1.monit.infra.auro.re
          - prometheus-2.monit.infra.auro.re
        address:
          port: 9090
      kresd:
        targets:
          - dns-1.int.infra.auro.re
          - dns-2.int.infra.auro.re
        address:
          port: 8453
      # NOTE(review): prometheus_bird is applied to six hosts above, but only
      # the two edge hosts are scraped here; confirm this is intentional.
      bird:
        targets:
          - edge-1.back.infra.auro.re
          - edge-2.back.infra.auro.re
        address:
          port: 9324

    # Alert rules, grouped by name. The `interp` and `interp_float` filters
    # used in the annotations are not defined in this file.
    prometheus__alert_rules:
      prometheus:
        - alert: PrometheusTsdbCompactionFailed
          expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
          for: 0m
          labels:
            severity: critical

      node:
        - alert: MachineDown
          expr: "up == 0"
          for: 3m
          labels:
            severity: critical
          annotations:
            summary: "Collecteur {{ '$labels.job' | interp }}"

        - alert: OutOfMemory
          expr: "( node_memory_MemFree_bytes + node_memory_Cached_bytes
            + node_memory_Buffers_bytes ) / node_memory_MemTotal_bytes
            * 100 < 10"
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Mémoire libre à {{ '$value' | interp_float }}%"

        - alert: HostSwapIsFillingUp
          expr: "( 1 - ( node_memory_SwapFree_bytes
            / node_memory_SwapTotal_bytes ) ) * 100 >= 50"
          for: 3m
          labels:
            severity: critical
          annotations:
            summary: "Swap {{ '$value' | interp_float }}%"

        - alert: HostPhysicalComponentTooHot
          expr: "node_hwmon_temp_celsius > 79"
          for: 3m
          labels:
            severity: critical
          annotations:
            summary: "{{ '$value' | interp_float }}°C :
              {{ '$labels.chip' | interp }},
              {{ '$labels.sensor' | interp }}"

        - alert: HostNodeOvertemperatureAlarm
          expr: "node_hwmon_temp_crit_alarm_celsius == 1"
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "{{ '$labels.chip' | interp }},
              {{ '$labels.sensor' | interp }}"

        - alert: HostRaidArrayGotInactive
          expr: 'node_md_state{state="inactive"} > 0'
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "{{ '$labels.device' | interp }}"

        - alert: HostRaidDiskFailure
          expr: 'node_md_disks{state="failed"} > 0'
          for: 0m
          labels:
            severity: critical
          # The message belongs under `summary` like every other rule here,
          # and the array name is read from `device`, as in
          # HostRaidArrayGotInactive.
          annotations:
            summary: "{{ '$labels.device' | interp }}"

        - alert: HostOomKillDetected
          expr: "increase(node_vmstat_oom_kill[1m]) > 0"
          for: 0m
          labels:
            severity: warning
          # NOTE(review): $value is the result of increase(...), i.e. a
          # count, yet the summary labels it "PID"; confirm the wording.
          annotations:
            summary: "PID {{ '$value' | interp }}"

        - alert: HostEdacCorrectableErrorsDetected
          expr: "increase(node_edac_correctable_errors_total[1m]) > 0"
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$value' | interp }} erreurs corrigées"

        - alert: HostEdacUncorrectableErrorsDetected
          expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0"
          for: 0m
          labels:
            severity: warning
          # This rule counts uncorrectable errors, so the text must differ
          # from the correctable-errors rule above.
          annotations:
            summary: "{{ '$value' | interp }} erreurs non corrigibles"

        - alert: OutOfDiskSpace
          expr: "( node_filesystem_free_bytes / node_filesystem_size_bytes
            * 100 < 10 ) and on (instance, device, mountpoint)
            node_filesystem_readonly == 0"
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "{{ '$labels.mountpoint' | interp }} :
              {{ '$value' | interp_float }}% libre"

        - alert: HostConntrackLimit
          expr: "( node_nf_conntrack_entries
            / node_nf_conntrack_entries_limit ) * 100 > 80"
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$value' | interp_float }}% complet"

        - alert: HostClockSkew
          expr: "(node_timex_offset_seconds > 0.05 and
            deriv(node_timex_offset_seconds[5m]) >= 0) or
            (node_timex_offset_seconds < -0.05 and
            deriv(node_timex_offset_seconds[5m]) <= 0)"
          for: 2m
          labels:
            severity: warning

        - alert: HostClockNotSynchronising
          expr: "min_over_time(node_timex_sync_status[1m]) == 0 and
            node_timex_maxerror_seconds >= 16"
          for: 2m
          labels:
            severity: warning

        - alert: HostRequiresReboot
          expr: "node_reboot_required > 0"
          for: 5m
          labels:
            severity: warning

        - alert: OutOfInodes
          expr: "node_filesystem_files_free / node_filesystem_files
            * 100 < 10"
          for: 3m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$labels.mountpoint' | interp }} :
              {{ '$value' | interp_float }}% libre"

        - alert: CpuUsage
          expr: '( 100 - avg by (instance) (
            irate(node_cpu_seconds_total{mode="idle"}[5m]) ) * 100 ) > 75'
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$value' | interp_float }}%"

        - alert: SystemdServiceFailed
          expr: 'node_systemd_unit_state{state="failed"} == 1'
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$labels.name' | interp }}"

        - alert: LoadUsage
          expr: "node_load1 > 5"
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$value' | interp_float }}"

        - alert: UnhealthyDisk
          expr: "smartmon_device_smart_healthy < 1"
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "{{ '$labels.disk' | interp }}"

        - alert: HostCpuStealNoisyNeighbor
          expr: 'avg by(instance)
            (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
          for: 5m
          labels:
            severity: warning
          # `avg by(instance)` drops every label except `instance`, so the
          # summary reports the steal percentage instead of a label.
          annotations:
            summary: "{{ '$value' | interp_float }}%"

      keepalived:
        - alert: KeepalivedVrrpFault
          expr: 'keepalived_vrrp_state{state="fault"} > 0'
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "{{ '$labels.instance' | interp }}"

        - alert: KeepalivedMasterChange
          # Selects on the same `state` label as KeepalivedVrrpFault.
          # NOTE(review): confirm the label name against the exporter output.
          expr: 'changes(keepalived_vrrp_state{state="master"}[1m]) > 1'
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$labels.instance' | interp }}"

      #bird:
      #  - alert: BirdProtocolDown
      #    expr: "bird_protocol_up == 0"
      #    for: 0m
      #    labels:
      #      severity: critical
      #    annotations:
      #      summary: "{{ '$labels.name' | interp }} :
      #        {{ '$labels.state' | interp }}"
  roles:
    - prometheus

# NOTE(review): the plays below are commented out; they use a different set of
# variable names (prometheus_alertmanager, prometheus_*_targets) than the
# active play above. Confirm whether they can be removed.
#- hosts: prometheus-fleming.adm.auro.re
#  vars:
#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
#    snmp_ilo_user: aurore
#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
#    prometheus_servers_targets: |
#      {{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }}
#    prometheus_unifi_snmp_targets: |
#      {{ groups['fleming_unifi'] | list | sort }}
#    prometheus_ilo_snmp_targets: |
#      {{ groups['fleming_ilo'] | list | sort }}
#
#    update_motd:
#      prometheus: >-
#        Prometheus (en configuration fleming) est déployé (/etc/prometheus).
#  roles:
#    - prometheus
#    - update_motd
#
#- hosts: prometheus-pacaterie.adm.auro.re
#  vars:
#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
#    snmp_ilo_user: aurore
#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
#    prometheus_servers_targets: |
#      {{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }}
#    prometheus_unifi_snmp_targets: |
#      {{ groups['pacaterie_unifi'] | list | sort }}
#    prometheus_ups_snmp_targets:
#      - ups-pn-1.ups.auro.re
#      - ups-ps-1.ups.auro.re
#    prometheus_ilo_snmp_targets: |
#      {{ groups['pacaterie_ilo'] | list | sort }}
#
#    update_motd:
#      prometheus: >-
#        Prometheus (en configuration pacaterie) est déployé (/etc/prometheus).
#  roles:
#    - prometheus
#    - update_motd
#
#- hosts: prometheus-edc.adm.auro.re
#  vars:
#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
#    snmp_ilo_user: aurore
#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
#    prometheus_ups_snmp_targets:
#      - ups-ec-1.ups.auro.re
#      # - ups-ec-2.ups.auro.re
#      - ups-ec-3.ups.auro.re
#    prometheus_servers_targets: |
#      {{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }}
#    prometheus_unifi_snmp_targets: |
#      {{ groups['edc_unifi'] | list | sort }}
#    prometheus_ilo_snmp_targets: |
#      {{ groups['edc_ilo'] | list | sort }}
#
#    update_motd:
#      prometheus: >-
#        Prometheus (en configuration edc) est déployé (/etc/prometheus).
#  roles:
#    - prometheus
#    - update_motd
#
#- hosts: prometheus-gs.adm.auro.re
#  vars:
#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
#    snmp_ilo_user: aurore
#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
#    prometheus_servers_targets: |
#      {{ groups['gs_pve'] + groups['gs_vm'] | list | sort }}
#    prometheus_unifi_snmp_targets: |
#      {{ groups['gs_unifi'] | list | sort }}
#    prometheus_ups_snmp_targets:
#      - ups-gk-1.ups.auro.re
#    prometheus_apc_pdu_snmp_targets:
#      - pdu-ga-1.ups.auro.re
#    prometheus_ilo_snmp_targets: |
#      {{ groups['gs_ilo'] | list | sort }}
#
#    update_motd:
#      prometheus: >-
#        Prometheus (en configuration gs) est déployé (/etc/prometheus).
#  roles:
#    - prometheus
#    - update_motd
#
#- hosts: prometheus-rives.adm.auro.re
#  vars:
#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
#    snmp_ilo_user: aurore
#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
#    prometheus_ups_snmp_targets:
#      - ups-r3-1.ups.auro.re
#      - ups-r1-1.ups.auro.re
#    prometheus_servers_targets: |
#      {{ groups['rives_pve'] + groups['rives_vm'] | list | sort }}
#    prometheus_unifi_snmp_targets: |
#      {{ groups['rives_unifi'] | list | sort }}
#    prometheus_ilo_snmp_targets: |
#      {{ groups['rives_ilo'] | list | sort }}
#
#    update_motd:
#      prometheus: >-
#        Prometheus (en configuration rives) est déployé (/etc/prometheus).
#  roles:
#    - prometheus
#    - update_motd
#
#- hosts: prometheus-aurore.adm.auro.re
#  vars:
#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
#    snmp_ilo_user: aurore
#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
#    prometheus_servers_targets: |
#      {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
#    prometheus_postgresql_targets: |
#      {{ groups['bdd'] + groups['radius'] | list | sort }}
#    prometheus_switch_snmp_targets:
#      - yggdrasil.switch.auro.re
#      - sw-pn-serveurs.switch.auro.re
#      - sw-ec-serveurs.switch.auro.re
#      - sw-gk-serveurs.switch.auro.re
#      - sw-fl-serveurs.switch.auro.re
#      - sw-ff-uplink.switch.auro.re
#      - sw-fl-core.switch.auro.re
#      - sw-fd-vcore.switch.auro.re
#      - sw-fl-vcore.switch.auro.re
#      - sw-ff-vcore.switch.auro.re
#      - sw-pn-core.switch.auro.re
#      - sw-ec-core.switch.auro.re
#      - sw-gk-core.switch.auro.re
#      - sw-r3-core.switch.auro.re
#    prometheus_ilo_snmp_targets: |
#      {{ groups['aurore_ilo'] | list | sort }}
#
#    update_motd:
#      prometheus: >-
#        Prometheus (en configuration aurore) est déployé (/etc/prometheus).
#  roles:
#    - prometheus
#    - update_motd
#
#- hosts: prometheus-ovh.adm.auro.re
#  vars:
#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
#    snmp_ilo_user: aurore
#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
#    prometheus_servers_targets: |
#      {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
#    prometheus_postgresql_targets:
#      - bdd-ovh.adm.auro.re
#    prometheus_docker_targets:
#      - docker-ovh.adm.auro.re
#
#    update_motd:
#      prometheus: >-
#        Prometheus (en configuration ovh) est déployé (/etc/prometheus).
#  roles:
#    - prometheus
#    - update_motd
#
## Postgres Exporters
#- hosts: bdd,radius
#  roles:
#    - prometheus_postgres