369 lines
12 KiB
YAML
Executable file
369 lines
12 KiB
YAML
Executable file
#!/usr/bin/env ansible-playbook
|
|
---
|
|
# Apply the prometheus_node role to every host in the pve_network and
# vm_network inventory groups.
- name: Deploy prometheus_node
  hosts:
    - pve_network
    - vm_network
  roles:
    - prometheus_node
|
|
# Apply the prometheus role to the two monitoring servers, passing it the
# retention time, the scrape targets and the alert rules declared below.
- name: Deploy prometheus
  hosts:
    - prometheus-1.monit.infra.auro.re
    - prometheus-2.monit.infra.auro.re
  vars:
    prometheus__tsdb_retention_time: 90d
    prometheus__scraping:
      node:
        # Members of the vm_network and pve_network groups, merged into a
        # single flat list.
        targets: "{{ ['vm_network', 'pve_network']
                     | map('extract', groups)
                     | flatten }}"
        # NOTE(review): `port` is assumed to be nested under `address`;
        # confirm against the prometheus role's expected schema.
        address:
          port: 9100
    # NOTE(review): `interp` is a filter defined outside this file; it
    # presumably renders its argument as a Prometheus template expression
    # (e.g. `$value`, `$labels.x`) - confirm in the role's filter plugins.
    prometheus__alert_rules:
      node:
        - alert: MachineDown
          expr: "up == 0"
          for: 3m
          labels:
            severity: critical
          annotations:
            summary: "Collecteur {{ '$labels.job' | interp }}"

        # Percentage of MemFree + Cached + Buffers over MemTotal.
        # NOTE(review): node_memory_MemAvailable_bytes may be a more
        # accurate signal of reclaimable memory - confirm before changing.
        - alert: OutOfMemory
          expr: "( node_memory_MemFree_bytes
                   + node_memory_Cached_bytes
                   + node_memory_Buffers_bytes )
                 / node_memory_MemTotal_bytes * 100 < 10"
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Mémoire libre à {{ '$value' | interp }}%"

        # Percentage of swap in use.
        - alert: HostSwapIsFillingUp
          expr: "( 1 - ( node_memory_SwapFree_bytes
                         / node_memory_SwapTotal_bytes ) )
                 * 100 >= 50"
          for: 3m
          labels:
            severity: critical
          annotations:
            summary: "Swap {{ '$value' | interp }}%"

        - alert: HostPhysicalComponentTooHot
          expr: "node_hwmon_temp_celsius > 79"
          for: 3m
          labels:
            severity: critical
          annotations:
            summary: "{{ '$value' | interp }}°C :
                      {{ '$labels.chip' | interp }},
                      {{ '$labels.sensor' | interp }}"

        - alert: HostNodeOvertemperatureAlarm
          expr: "node_hwmon_temp_crit_alarm_celsius == 1"
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "{{ '$labels.chip' | interp }},
                      {{ '$labels.sensor' | interp }}"

        # The expression's value is the increase of the OOM-kill counter
        # over one minute, i.e. a number of kills - not a process ID - so
        # the summary reports it as a count.
        - alert: HostOomKillDetected
          expr: "increase(node_vmstat_oom_kill[1m]) > 0"
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$value' | interp }} processus tués (OOM)"

        - alert: HostEdacCorrectableErrorsDetected
          expr: "increase(node_edac_correctable_errors_total[1m]) > 0"
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$value' | interp }} erreurs corrigées"

        # Percentage of free bytes per filesystem.
        - alert: OutOfDiskSpace
          expr: "node_filesystem_free_bytes
                 / node_filesystem_size_bytes * 100 < 10"
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$labels.mountpoint' | interp }} :
                      {{ '$value' | interp }}% libre"

        # Percentage of free inodes per filesystem.
        - alert: OutOfInodes
          expr: "node_filesystem_files_free
                 / node_filesystem_files * 100 < 10"
          for: 3m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$labels.mountpoint' | interp }} :
                      {{ '$value' | interp }}% libre"

        # 100 minus the per-instance average idle rate, as a percentage.
        # Single-quoted because the expression contains double quotes.
        - alert: CpuUsage
          expr: '( 100 - avg by (instance)
                   ( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
                   * 100 ) > 75'
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$value' | interp }}%"

        - alert: SystemdServiceFailed
          expr: 'node_systemd_unit_state{state="failed"} == 1'
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$labels.name' | interp }}"

        - alert: LoadUsage
          expr: "node_load1 > 5"
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "{{ '$value' | interp }}"

        - alert: UnhealthyDisk
          expr: "smartmon_device_smart_healthy < 1"
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "{{ '$labels.disk' | interp }}"
  roles:
    - prometheus
|
|
#- hosts: prometheus-fleming.adm.auro.re
|
|
# vars:
|
|
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
|
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
|
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
|
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
|
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
|
# snmp_ilo_user: aurore
|
|
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
|
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
|
#
|
|
# prometheus_servers_targets: |
|
|
# {{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }}
|
|
# prometheus_unifi_snmp_targets: |
|
|
# {{ groups['fleming_unifi'] | list | sort }}
|
|
# prometheus_ilo_snmp_targets: |
|
|
# {{ groups['fleming_ilo'] | list | sort }}
|
|
#
|
|
# update_motd:
|
|
# prometheus: >-
|
|
# Prometheus (en configuration fleming) est déployé (/etc/prometheus).
|
|
# roles:
|
|
# - prometheus
|
|
# - update_motd
|
|
#
|
|
#- hosts: prometheus-pacaterie.adm.auro.re
|
|
# vars:
|
|
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
|
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
|
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
|
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
|
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
|
# snmp_ilo_user: aurore
|
|
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
|
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
|
#
|
|
# prometheus_servers_targets: |
|
|
# {{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }}
|
|
# prometheus_unifi_snmp_targets: |
|
|
# {{ groups['pacaterie_unifi'] | list | sort }}
|
|
# prometheus_ups_snmp_targets:
|
|
# - ups-pn-1.ups.auro.re
|
|
# - ups-ps-1.ups.auro.re
|
|
# prometheus_ilo_snmp_targets: |
|
|
# {{ groups['pacaterie_ilo'] | list | sort }}
|
|
#
|
|
# update_motd:
|
|
# prometheus: >-
|
|
# Prometheus (en configuration pacaterie) est déployé (/etc/prometheus).
|
|
# roles:
|
|
# - prometheus
|
|
# - update_motd
|
|
#
|
|
#- hosts: prometheus-edc.adm.auro.re
|
|
# vars:
|
|
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
|
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
|
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
|
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
|
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
|
# snmp_ilo_user: aurore
|
|
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
|
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
|
#
|
|
# prometheus_ups_snmp_targets:
|
|
# - ups-ec-1.ups.auro.re
|
|
# # - ups-ec-2.ups.auro.re
|
|
# - ups-ec-3.ups.auro.re
|
|
# prometheus_servers_targets: |
|
|
# {{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }}
|
|
# prometheus_unifi_snmp_targets: |
|
|
# {{ groups['edc_unifi'] | list | sort }}
|
|
# prometheus_ilo_snmp_targets: |
|
|
# {{ groups['edc_ilo'] | list | sort }}
|
|
#
|
|
# update_motd:
|
|
# prometheus: >-
|
|
# Prometheus (en configuration edc) est déployé (/etc/prometheus).
|
|
# roles:
|
|
# - prometheus
|
|
# - update_motd
|
|
#
|
|
#- hosts: prometheus-gs.adm.auro.re
|
|
# vars:
|
|
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
|
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
|
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
|
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
|
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
|
# snmp_ilo_user: aurore
|
|
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
|
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
|
#
|
|
# prometheus_servers_targets: |
|
|
# {{ groups['gs_pve'] + groups['gs_vm'] | list | sort }}
|
|
# prometheus_unifi_snmp_targets: |
|
|
# {{ groups['gs_unifi'] | list | sort }}
|
|
# prometheus_ups_snmp_targets:
|
|
# - ups-gk-1.ups.auro.re
|
|
# prometheus_apc_pdu_snmp_targets:
|
|
# - pdu-ga-1.ups.auro.re
|
|
# prometheus_ilo_snmp_targets: |
|
|
# {{ groups['gs_ilo'] | list | sort }}
|
|
#
|
|
# update_motd:
|
|
# prometheus: >-
|
|
# Prometheus (en configuration gs) est déployé (/etc/prometheus).
|
|
# roles:
|
|
# - prometheus
|
|
# - update_motd
|
|
#
|
|
#- hosts: prometheus-rives.adm.auro.re
|
|
# vars:
|
|
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
|
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
|
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
|
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
|
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
|
# snmp_ilo_user: aurore
|
|
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
|
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
|
#
|
|
# prometheus_ups_snmp_targets:
|
|
# - ups-r3-1.ups.auro.re
|
|
# - ups-r1-1.ups.auro.re
|
|
# prometheus_servers_targets: |
|
|
# {{ groups['rives_pve'] + groups['rives_vm'] | list | sort }}
|
|
# prometheus_unifi_snmp_targets: |
|
|
# {{ groups['rives_unifi'] | list | sort }}
|
|
# prometheus_ilo_snmp_targets: |
|
|
# {{ groups['rives_ilo'] | list | sort }}
|
|
#
|
|
# update_motd:
|
|
# prometheus: >-
|
|
# Prometheus (en configuration rives) est déployé (/etc/prometheus).
|
|
# roles:
|
|
# - prometheus
|
|
# - update_motd
|
|
#
|
|
#- hosts: prometheus-aurore.adm.auro.re
|
|
# vars:
|
|
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
|
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
|
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
|
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
|
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
|
# snmp_ilo_user: aurore
|
|
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
|
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
|
#
|
|
# prometheus_servers_targets: |
|
|
# {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
|
|
# prometheus_postgresql_targets: |
|
|
# {{ groups['bdd'] + groups['radius'] | list | sort }}
|
|
# prometheus_switch_snmp_targets:
|
|
# - yggdrasil.switch.auro.re
|
|
# - sw-pn-serveurs.switch.auro.re
|
|
# - sw-ec-serveurs.switch.auro.re
|
|
# - sw-gk-serveurs.switch.auro.re
|
|
# - sw-fl-serveurs.switch.auro.re
|
|
# - sw-ff-uplink.switch.auro.re
|
|
# - sw-fl-core.switch.auro.re
|
|
# - sw-fd-vcore.switch.auro.re
|
|
# - sw-fl-vcore.switch.auro.re
|
|
# - sw-ff-vcore.switch.auro.re
|
|
# - sw-pn-core.switch.auro.re
|
|
# - sw-ec-core.switch.auro.re
|
|
# - sw-gk-core.switch.auro.re
|
|
# - sw-r3-core.switch.auro.re
|
|
# prometheus_ilo_snmp_targets: |
|
|
# {{ groups['aurore_ilo'] | list | sort }}
|
|
#
|
|
# update_motd:
|
|
# prometheus: >-
|
|
# Prometheus (en configuration aurore) est déployé (/etc/prometheus).
|
|
# roles:
|
|
# - prometheus
|
|
# - update_motd
|
|
#
|
|
#- hosts: prometheus-ovh.adm.auro.re
|
|
# vars:
|
|
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
|
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
|
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
|
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
|
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
|
# snmp_ilo_user: aurore
|
|
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
|
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
|
#
|
|
# prometheus_servers_targets: |
|
|
# {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
|
|
# prometheus_postgresql_targets:
|
|
# - bdd-ovh.adm.auro.re
|
|
# prometheus_docker_targets:
|
|
# - docker-ovh.adm.auro.re
|
|
#
|
|
# update_motd:
|
|
# prometheus: >-
|
|
# Prometheus (en configuration ovh) est déployé (/etc/prometheus).
|
|
# roles:
|
|
# - prometheus
|
|
# - update_motd
|
|
#
|
|
#- hosts: prometheus-federate.adm.auro.re
|
|
# vars:
|
|
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
|
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
|
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
|
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
|
# snmp_ilo_user: aurore
|
|
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
|
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
|
#
|
|
# prometheus_servers_targets:
|
|
# - prometheus-edc.adm.auro.re
|
|
# - prometheus-gs.adm.auro.re
|
|
# - prometheus-fleming.adm.auro.re
|
|
# - prometheus-pacaterie.adm.auro.re
|
|
# - prometheus-rives.adm.auro.re
|
|
# - prometheus-aurore.adm.auro.re
|
|
# - prometheus-ovh.adm.auro.re
|
|
#
|
|
# update_motd:
|
|
# prometheus_federate: >-
|
|
# Prometheus (en configuration fédération) est déployé (/etc/prometheus).
|
|
# roles:
|
|
# - prometheus_federate
|
|
# - update_motd
|
|
#
|
|
## Postgres Exporters
|
|
#- hosts: bdd,radius
|
|
# roles:
|
|
# - prometheus_postgres
|
|
#
|
|
## Monitor all hosts
|
|
#- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
|
|
# roles:
|
|
# - prometheus_node
|