prometheus: cleanup role (lots of features missing)

2023-04-02 05:08:01 +02:00 · 2023-04-02 05:08:01 +02:00 · 922b6894a7
commit 922b6894a7
parent 7db15d9c63
15 changed files with 484 additions and 1651 deletions
--- a/filter_plugins/prometheus.py
+++ b/filter_plugins/prometheus.py
@ -0,0 +1,55 @@
+from ansible.parsing.yaml.objects import AnsibleUnicode
+
+class FilterModule:
+    """Ansible filter plugin exposing the helpers defined in this file."""
+
+    def filters(self):
+        """Return the mapping of Jinja2 filter names to their callables."""
+        registry = {}
+        registry["prometheus__convert_jobs"] = convert_jobs
+        registry["interp"] = interp
+        return registry
+
+
+def interp(string):
+    """Wrap ``string`` in literal ``{{ ... }}`` delimiters.
+
+    For example ``"$labels.job"`` becomes ``"{{ $labels.job }}"``. The
+    result is returned as an ``AnsibleUnicode`` value.
+    """
+    wrapped = "{{{{ {} }}}}".format(string)
+    return AnsibleUnicode(wrapped)
+
+
+def convert_jobs(config):
+    """Yield one Prometheus scrape-config mapping per entry of ``config``.
+
+    ``config`` maps a job name to a job description with these keys:
+
+    * ``targets`` (required): used as the target list of the single
+      ``static_configs`` entry.
+    * ``params`` (optional): copied to ``params``; defaults to ``{}``.
+    * ``path`` (optional): copied to ``metrics_path``.
+    * ``address`` (optional): adds ``relabel_configs``. A mapping with a
+      ``port`` key rewrites ``__address__`` to ``$1:<port>``; any other
+      value is used verbatim as the ``__address__`` replacement.
+
+    This is a generator: nothing is computed until it is iterated, and a
+    job without ``targets`` raises ``KeyError`` at that point.
+    """
+    for name, job in config.items():
+
+        # Use a dedicated name for the per-job result instead of rebinding
+        # the ``config`` parameter while its items are being iterated.
+        scrape = {
+            "job_name": name,
+            "static_configs": [
+                {
+                    "targets": job["targets"],
+                }
+            ],
+            "params": job.get("params", {}),
+        }
+
+        if "path" in job:
+            scrape["metrics_path"] = job["path"]
+
+        if "address" in job:
+            address = job["address"]
+
+            # ``address`` is either a mapping carrying a ``port`` or a
+            # ready-made replacement value (e.g. a "host:port" string).
+            # Only the errors raised by that shape probe are handled, so
+            # unrelated failures are no longer silently swallowed.
+            try:
+                port = address["port"]
+            except (TypeError, KeyError):
+                replacement = address
+            else:
+                replacement = f"$1:{port}"
+
+            scrape["relabel_configs"] = [
+                {
+                    "source_labels": ["__address__"],
+                    "target_label": "__param_target",
+                },
+                {
+                    "source_labels": ["__param_target"],
+                    "target_label": "instance",
+                },
+                {
+                    "source_labels": ["__param_target"],
+                    "target_label": "__address__",
+                    "replacement": replacement,
+                },
+            ]
+
+        yield scrape
--- a/playbooks/prometheus.yml
+++ b/playbooks/prometheus.yml
@ -1,241 +1,369 @@
 #!/usr/bin/env ansible-playbook
 ---
- hosts: prometheus-fleming.adm.auro.re
-  vars:
-    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
-    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
-    snmp_switch_community: "{{ vault_snmp_switch_community }}"
-    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
-    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
-    snmp_ilo_user: aurore
-    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
-    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
-
-    prometheus_servers_targets: |
-      {{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }}
-    prometheus_unifi_snmp_targets: |
-      {{ groups['fleming_unifi'] | list | sort }}
-    prometheus_ilo_snmp_targets: |
-      {{ groups['fleming_ilo'] | list | sort }}
-
-    update_motd:
-      prometheus: >-
-        Prometheus (en configuration fleming) est déployé (/etc/prometheus).
-  roles:
-    - prometheus
-    - update_motd
-
- hosts: prometheus-pacaterie.adm.auro.re
-  vars:
-    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
-    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
-    snmp_switch_community: "{{ vault_snmp_switch_community }}"
-    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
-    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
-    snmp_ilo_user: aurore
-    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
-    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
-
-    prometheus_servers_targets: |
-      {{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }}
-    prometheus_unifi_snmp_targets: |
-      {{ groups['pacaterie_unifi'] | list | sort }}
-    prometheus_ups_snmp_targets:
-      - ups-pn-1.ups.auro.re
-      - ups-ps-1.ups.auro.re
-    prometheus_ilo_snmp_targets: |
-      {{ groups['pacaterie_ilo'] | list | sort }}
-
-    update_motd:
-      prometheus: >-
-        Prometheus (en configuration pacaterie) est déployé (/etc/prometheus).
-  roles:
-    - prometheus
-    - update_motd
-
- hosts: prometheus-edc.adm.auro.re
-  vars:
-    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
-    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
-    snmp_switch_community: "{{ vault_snmp_switch_community }}"
-    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
-    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
-    snmp_ilo_user: aurore
-    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
-    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
-
-    prometheus_ups_snmp_targets:
-      - ups-ec-1.ups.auro.re
-      # - ups-ec-2.ups.auro.re
-      - ups-ec-3.ups.auro.re
-    prometheus_servers_targets: |
-      {{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }}
-    prometheus_unifi_snmp_targets: |
-      {{ groups['edc_unifi'] | list | sort }}
-    prometheus_ilo_snmp_targets: |
-      {{ groups['edc_ilo'] | list | sort }}
-
-    update_motd:
-      prometheus: >-
-        Prometheus (en configuration edc) est déployé (/etc/prometheus).
-  roles:
-    - prometheus
-    - update_motd
-
- hosts: prometheus-gs.adm.auro.re
-  vars:
-    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
-    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
-    snmp_switch_community: "{{ vault_snmp_switch_community }}"
-    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
-    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
-    snmp_ilo_user: aurore
-    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
-    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
-
-    prometheus_servers_targets: |
-      {{ groups['gs_pve'] + groups['gs_vm'] | list | sort }}
-    prometheus_unifi_snmp_targets: |
-      {{ groups['gs_unifi'] | list | sort }}
-    prometheus_ups_snmp_targets:
-      - ups-gk-1.ups.auro.re
-    prometheus_apc_pdu_snmp_targets:
-      - pdu-ga-1.ups.auro.re
-    prometheus_ilo_snmp_targets: |
-      {{ groups['gs_ilo'] | list | sort }}
-
-    update_motd:
-      prometheus: >-
-        Prometheus (en configuration gs) est déployé (/etc/prometheus).
-  roles:
-    - prometheus
-    - update_motd
-
- hosts: prometheus-rives.adm.auro.re
-  vars:
-    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
-    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
-    snmp_switch_community: "{{ vault_snmp_switch_community }}"
-    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
-    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
-    snmp_ilo_user: aurore
-    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
-    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
-
-    prometheus_ups_snmp_targets:
-      - ups-r3-1.ups.auro.re
-      - ups-r1-1.ups.auro.re
-    prometheus_servers_targets: |
-      {{ groups['rives_pve'] + groups['rives_vm'] | list | sort }}
-    prometheus_unifi_snmp_targets: |
-      {{ groups['rives_unifi'] | list | sort }}
-    prometheus_ilo_snmp_targets: |
-      {{ groups['rives_ilo'] | list | sort }}
-
-    update_motd:
-      prometheus: >-
-        Prometheus (en configuration rives) est déployé (/etc/prometheus).
-  roles:
-    - prometheus
-    - update_motd
-
- hosts: prometheus-aurore.adm.auro.re
-  vars:
-    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
-    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
-    snmp_switch_community: "{{ vault_snmp_switch_community }}"
-    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
-    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
-    snmp_ilo_user: aurore
-    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
-    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
-
-    prometheus_servers_targets: |
-      {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
-    prometheus_postgresql_targets: |
-      {{ groups['bdd'] + groups['radius'] | list | sort }}
-    prometheus_switch_snmp_targets:
-      - yggdrasil.switch.auro.re
-      - sw-pn-serveurs.switch.auro.re
-      - sw-ec-serveurs.switch.auro.re
-      - sw-gk-serveurs.switch.auro.re
-      - sw-fl-serveurs.switch.auro.re
-      - sw-ff-uplink.switch.auro.re
-      - sw-fl-core.switch.auro.re
-      - sw-fd-vcore.switch.auro.re
-      - sw-fl-vcore.switch.auro.re
-      - sw-ff-vcore.switch.auro.re
-      - sw-pn-core.switch.auro.re
-      - sw-ec-core.switch.auro.re
-      - sw-gk-core.switch.auro.re
-      - sw-r3-core.switch.auro.re
-    prometheus_ilo_snmp_targets: |
-      {{ groups['aurore_ilo'] | list | sort }}
-
-    update_motd:
-      prometheus: >-
-        Prometheus (en configuration aurore) est déployé (/etc/prometheus).
-  roles:
-    - prometheus
-    - update_motd
-
- hosts: prometheus-ovh.adm.auro.re
-  vars:
-    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
-    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
-    snmp_switch_community: "{{ vault_snmp_switch_community }}"
-    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
-    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
-    snmp_ilo_user: aurore
-    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
-    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
-
-    prometheus_servers_targets: |
-      {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
-    prometheus_postgresql_targets:
-      - bdd-ovh.adm.auro.re
-    prometheus_docker_targets:
-      - docker-ovh.adm.auro.re
-
-    update_motd:
-      prometheus: >-
-        Prometheus (en configuration ovh) est déployé (/etc/prometheus).
-  roles:
-    - prometheus
-    - update_motd
-
- hosts: prometheus-federate.adm.auro.re
-  vars:
-    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
-    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
-    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
-    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
-    snmp_ilo_user: aurore
-    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
-    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
-
-    prometheus_servers_targets:
-      - prometheus-edc.adm.auro.re
-      - prometheus-gs.adm.auro.re
-      - prometheus-fleming.adm.auro.re
-      - prometheus-pacaterie.adm.auro.re
-      - prometheus-rives.adm.auro.re
-      - prometheus-aurore.adm.auro.re
-      - prometheus-ovh.adm.auro.re
-
-    update_motd:
-      prometheus_federate: >-
-        Prometheus (en configuration fédération) est déployé (/etc/prometheus).
-  roles:
-    - prometheus_federate
-    - update_motd
-
-# Postgres Exporters
- hosts: bdd,radius
-  roles:
-    - prometheus_postgres
-
-# Monitor all hosts
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
+- hosts:
+    - pve_network
+    - vm_network
  roles:
    - prometheus_node
+
+- hosts:
+    - prometheus-1.monit.infra.auro.re
+    - prometheus-2.monit.infra.auro.re
+  vars:
+    prometheus__tsdb_retention_time: 90d
+    prometheus__scraping:
+      node:
+        targets: "{{ ['vm_network', 'pve_network']
+                     | map('extract', groups)
+                     | flatten }}"
+        address:
+          port: 9100
+    prometheus__alert_rules:
+      node:
+        - alert: MachineDown
+          expr: "up == 0"
+          for: 3m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Collecteur {{ '$labels.job' | interp }}"
+        - alert: OutOfMemory
+          expr: "( node_memory_MemFree_bytes
+                   + node_memory_Cached_bytes
+                   + node_memory_Buffers_bytes )
+                 / node_memory_MemTotal_bytes * 100 < 10"
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Mémoire libre à {{ '$value' | interp }}%"
+        - alert: HostSwapIsFillingUp
+          expr: "( 1 - ( node_memory_SwapFree_bytes
+                         / node_memory_SwapTotal_bytes ) )
+                 * 100 >= 50"
+          for: 3m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Swap {{ '$value' | interp }}%"
+        - alert: HostPhysicalComponentTooHot
+          expr: "node_hwmon_temp_celsius > 79"
+          for: 3m
+          labels:
+            severity: critical
+          annotations:
+            summary: "{{ '$value' | interp }}°C :
+                      {{ '$labels.chip' | interp }},
+                      {{ '$labels.sensor' | interp }}"
+        - alert: HostNodeOvertemperatureAlarm
+          expr: "node_hwmon_temp_crit_alarm_celsius == 1"
+          for: 0m
+          labels:
+            severity: critical
+          annotations:
+            summary: "{{ '$labels.chip' | interp }},
+                      {{ '$labels.sensor' | interp }}"
+        - alert: HostOomKillDetected
+          expr: "increase(node_vmstat_oom_kill[1m]) > 0"
+          for: 0m
+          labels:
+            severity: warning
+          annotations:
+            # $value is the number of OOM kills counted over the last
+            # minute by the expression above, not a process ID.
+            summary: "{{ '$value' | interp }} processus tués (OOM)"
+        - alert: HostEdacCorrectableErrorsDetected
+          expr: "increase(node_edac_correctable_errors_total[1m]) > 0"
+          for: 0m
+          labels:
+            severity: warning
+          annotations:
+            summary: "{{ '$value' | interp }} erreurs corrigées"
+        - alert: OutOfDiskSpace
+          expr: "node_filesystem_free_bytes
+                 / node_filesystem_size_bytes * 100 < 10"
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "{{ '$labels.mountpoint' | interp }} :
+                      {{ '$value' | interp }}% libre"
+        - alert: OutOfInodes
+          expr: "node_filesystem_files_free
+                 / node_filesystem_files * 100 < 10"
+          for: 3m
+          labels:
+            severity: warning
+          annotations:
+            summary: "{{ '$labels.mountpoint' | interp }} :
+                      {{ '$value' | interp }}% libre"
+        - alert: CpuUsage
+          expr: '( 100 - avg by (instance)
+                   ( irate(node_cpu_seconds_total{mode="idle"}[5m]) ) 
+                   * 100 ) > 75'
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "{{ '$value' | interp }}%"
+        - alert: SystemdServiceFailed
+          expr: 'node_systemd_unit_state{state="failed"} == 1'
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "{{ '$labels.name' | interp }}"
+        - alert: LoadUsage
+          expr: "node_load1 > 5"
+          for: 2m
+          labels:
+            severity: warning
+          annotations:
+            summary: "{{ '$value' | interp }}"
+        - alert: UnhealthyDisk
+          expr: "smartmon_device_smart_healthy < 1"
+          for: 10m
+          labels:
+            severity: critical
+          annotations:
+            summary: "{{ '$labels.disk' | interp }}"
+  roles:
+    - prometheus
+
+#- hosts: prometheus-fleming.adm.auro.re
+#  vars:
+#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
+#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
+#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
+#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
+#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
+#    snmp_ilo_user: aurore
+#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
+#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
+#
+#    prometheus_servers_targets: |
+#      {{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }}
+#    prometheus_unifi_snmp_targets: |
+#      {{ groups['fleming_unifi'] | list | sort }}
+#    prometheus_ilo_snmp_targets: |
+#      {{ groups['fleming_ilo'] | list | sort }}
+#
+#    update_motd:
+#      prometheus: >-
+#        Prometheus (en configuration fleming) est déployé (/etc/prometheus).
+#  roles:
+#    - prometheus
+#    - update_motd
+#
+#- hosts: prometheus-pacaterie.adm.auro.re
+#  vars:
+#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
+#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
+#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
+#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
+#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
+#    snmp_ilo_user: aurore
+#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
+#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
+#
+#    prometheus_servers_targets: |
+#      {{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }}
+#    prometheus_unifi_snmp_targets: |
+#      {{ groups['pacaterie_unifi'] | list | sort }}
+#    prometheus_ups_snmp_targets:
+#      - ups-pn-1.ups.auro.re
+#      - ups-ps-1.ups.auro.re
+#    prometheus_ilo_snmp_targets: |
+#      {{ groups['pacaterie_ilo'] | list | sort }}
+#
+#    update_motd:
+#      prometheus: >-
+#        Prometheus (en configuration pacaterie) est déployé (/etc/prometheus).
+#  roles:
+#    - prometheus
+#    - update_motd
+#
+#- hosts: prometheus-edc.adm.auro.re
+#  vars:
+#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
+#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
+#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
+#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
+#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
+#    snmp_ilo_user: aurore
+#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
+#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
+#
+#    prometheus_ups_snmp_targets:
+#      - ups-ec-1.ups.auro.re
+#      # - ups-ec-2.ups.auro.re
+#      - ups-ec-3.ups.auro.re
+#    prometheus_servers_targets: |
+#      {{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }}
+#    prometheus_unifi_snmp_targets: |
+#      {{ groups['edc_unifi'] | list | sort }}
+#    prometheus_ilo_snmp_targets: |
+#      {{ groups['edc_ilo'] | list | sort }}
+#
+#    update_motd:
+#      prometheus: >-
+#        Prometheus (en configuration edc) est déployé (/etc/prometheus).
+#  roles:
+#    - prometheus
+#    - update_motd
+#
+#- hosts: prometheus-gs.adm.auro.re
+#  vars:
+#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
+#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
+#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
+#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
+#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
+#    snmp_ilo_user: aurore
+#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
+#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
+#
+#    prometheus_servers_targets: |
+#      {{ groups['gs_pve'] + groups['gs_vm'] | list | sort }}
+#    prometheus_unifi_snmp_targets: |
+#      {{ groups['gs_unifi'] | list | sort }}
+#    prometheus_ups_snmp_targets:
+#      - ups-gk-1.ups.auro.re
+#    prometheus_apc_pdu_snmp_targets:
+#      - pdu-ga-1.ups.auro.re
+#    prometheus_ilo_snmp_targets: |
+#      {{ groups['gs_ilo'] | list | sort }}
+#
+#    update_motd:
+#      prometheus: >-
+#        Prometheus (en configuration gs) est déployé (/etc/prometheus).
+#  roles:
+#    - prometheus
+#    - update_motd
+#
+#- hosts: prometheus-rives.adm.auro.re
+#  vars:
+#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
+#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
+#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
+#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
+#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
+#    snmp_ilo_user: aurore
+#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
+#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
+#
+#    prometheus_ups_snmp_targets:
+#      - ups-r3-1.ups.auro.re
+#      - ups-r1-1.ups.auro.re
+#    prometheus_servers_targets: |
+#      {{ groups['rives_pve'] + groups['rives_vm'] | list | sort }}
+#    prometheus_unifi_snmp_targets: |
+#      {{ groups['rives_unifi'] | list | sort }}
+#    prometheus_ilo_snmp_targets: |
+#      {{ groups['rives_ilo'] | list | sort }}
+#
+#    update_motd:
+#      prometheus: >-
+#        Prometheus (en configuration rives) est déployé (/etc/prometheus).
+#  roles:
+#    - prometheus
+#    - update_motd
+#
+#- hosts: prometheus-aurore.adm.auro.re
+#  vars:
+#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
+#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
+#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
+#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
+#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
+#    snmp_ilo_user: aurore
+#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
+#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
+#
+#    prometheus_servers_targets: |
+#      {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
+#    prometheus_postgresql_targets: |
+#      {{ groups['bdd'] + groups['radius'] | list | sort }}
+#    prometheus_switch_snmp_targets:
+#      - yggdrasil.switch.auro.re
+#      - sw-pn-serveurs.switch.auro.re
+#      - sw-ec-serveurs.switch.auro.re
+#      - sw-gk-serveurs.switch.auro.re
+#      - sw-fl-serveurs.switch.auro.re
+#      - sw-ff-uplink.switch.auro.re
+#      - sw-fl-core.switch.auro.re
+#      - sw-fd-vcore.switch.auro.re
+#      - sw-fl-vcore.switch.auro.re
+#      - sw-ff-vcore.switch.auro.re
+#      - sw-pn-core.switch.auro.re
+#      - sw-ec-core.switch.auro.re
+#      - sw-gk-core.switch.auro.re
+#      - sw-r3-core.switch.auro.re
+#    prometheus_ilo_snmp_targets: |
+#      {{ groups['aurore_ilo'] | list | sort }}
+#
+#    update_motd:
+#      prometheus: >-
+#        Prometheus (en configuration aurore) est déployé (/etc/prometheus).
+#  roles:
+#    - prometheus
+#    - update_motd
+#
+#- hosts: prometheus-ovh.adm.auro.re
+#  vars:
+#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
+#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
+#    snmp_switch_community: "{{ vault_snmp_switch_community }}"
+#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
+#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
+#    snmp_ilo_user: aurore
+#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
+#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
+#
+#    prometheus_servers_targets: |
+#      {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
+#    prometheus_postgresql_targets:
+#      - bdd-ovh.adm.auro.re
+#    prometheus_docker_targets:
+#      - docker-ovh.adm.auro.re
+#
+#    update_motd:
+#      prometheus: >-
+#        Prometheus (en configuration ovh) est déployé (/etc/prometheus).
+#  roles:
+#    - prometheus
+#    - update_motd
+#
+#- hosts: prometheus-federate.adm.auro.re
+#  vars:
+#    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
+#    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
+#    snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
+#    snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
+#    snmp_ilo_user: aurore
+#    snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
+#    snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
+#
+#    prometheus_servers_targets:
+#      - prometheus-edc.adm.auro.re
+#      - prometheus-gs.adm.auro.re
+#      - prometheus-fleming.adm.auro.re
+#      - prometheus-pacaterie.adm.auro.re
+#      - prometheus-rives.adm.auro.re
+#      - prometheus-aurore.adm.auro.re
+#      - prometheus-ovh.adm.auro.re
+#
+#    update_motd:
+#      prometheus_federate: >-
+#        Prometheus (en configuration fédération) est déployé (/etc/prometheus).
+#  roles:
+#    - prometheus_federate
+#    - update_motd
+#
+## Postgres Exporters
+#- hosts: bdd,radius
+#  roles:
+#    - prometheus_postgres
+#
+## Monitor all hosts
+#- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
+#  roles:
+#    - prometheus_node
--- a/roles/prometheus/defaults/main.yml
+++ b/roles/prometheus/defaults/main.yml
@ -0,0 +1,7 @@
+---
+prometheus__alertmanager_targets: []
+prometheus__scraping: {}
+prometheus__alert_rules: {}
+prometheus__tsdb_retention_time: 15d
+prometheus__page_title: "{{ inventory_hostname }}"
+...
--- a/roles/prometheus/handlers/main.yml
+++ b/roles/prometheus/handlers/main.yml
@ -1,11 +1,11 @@
 ---
- name: Restart Prometheus
-  service:
-    name: prometheus
+- name: Restart prometheus
+  systemd:
+    name: prometheus.service
    state: restarted

- name: Restart prometheus-snmp-exporter
-  service:
-    name: prometheus-snmp-exporter
-    state: restarted
+- name: Reload prometheus
+  systemd:
+    name: prometheus.service
+    state: reloaded
 ...
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@ -1,69 +1,55 @@
 ---
- name: Install Prometheus
+- name: Install prometheus
  apt:
-    update_cache: true
    name:
      - prometheus
-      - prometheus-snmp-exporter
-  register: apt_result
-  retries: 3
-  until: apt_result is succeeded

- name: Configure Prometheus
+- name: Configure prometheus
  template:
-    src: "{{ item }}.j2"
-    dest: "/etc/prometheus/{{ item }}"
+    src: prometheus.yml.j2
+    dest: /etc/prometheus/prometheus.yml
    owner: prometheus
    group: prometheus
-    mode: u=r,g=r,o=
-  loop:
-    - prometheus.yml
-  notify: Restart Prometheus
+    mode: u=rw,g=r,o=r
+  vars:
+    prometheus__config:
+      alerting:
+        alertmanagers:
+          - static_configs:
+              - targets: "{{ prometheus__alertmanager_targets }}"
+      rule_files:
+        - /etc/prometheus/rules.yml
+      scrape_configs: "{{ prometheus__scraping
+                          | prometheus__convert_jobs }}"
+  notify:
+    - Restart prometheus

- name: Creates directory for alerts
-  file:
-    path: /etc/prometheus/alerts
-    state: directory
-    owner: prometheus
-    group: prometheus
-    mode: 0755
-
- name: Configure Prometheus alerts
+- name: Configure prometheus default
  template:
-    src: "{{ item }}.j2"
-    dest: "/etc/prometheus/alerts/{{ item }}"
-    owner: prometheus
-    group: prometheus
-    mode: u=r,g=r,o=
-  loop:
-    - server.rules.yml
-    - docker.rules.yml
-    - ups.rules.yml
-    - postgres.rules.yml
-    - environmental.rules.yml
-    - ilo.rules.yml
-  notify: Restart Prometheus
+    src: default.j2
+    dest: /etc/default/prometheus
+    owner: root
+    group: root
+    mode: u=rw,g=r,o=r
+  notify:
+    - Restart prometheus

- name: Make Prometheus snmp-exporter listen on localhost only
-  lineinfile:
-    path: /etc/default/prometheus-snmp-exporter
-    regexp: '^ARGS='
-    line: "ARGS=\"--web.listen-address=127.0.0.1:9116\""
-  notify: Restart prometheus-snmp-exporter
-
-# These files store SNMP OIDs
- name: Configure Prometheus snmp-exporter
+- name: Configure prometheus rules
  template:
-    src: "{{ item }}.j2"
-    dest: "/etc/prometheus/{{ item }}"
+    src: rules.yml.j2
+    dest: /etc/prometheus/rules.yml
    owner: prometheus
    group: prometheus
-    mode: u=r,g=r,o=
-  loop:
-    - snmp.yml
-  notify: Restart prometheus-snmp-exporter
+    mode: u=rw,g=r,o=r
+    validate: "promtool check rules %s"
+  vars:
+    prometheus__rules:
+      groups: "{{ prometheus__alert_rules
+                  | dict2items(key_name='name', value_name='rules') }}"
+  notify:
+    - Reload prometheus

- name: Activate prometheus service
+- name: Enable prometheus
  systemd:
    name: prometheus
    enabled: true
--- a/roles/prometheus/templates/default.j2
+++ b/roles/prometheus/templates/default.j2
@ -0,0 +1,3 @@
+{{ ansible_managed | comment }}
+
+ARGS="--storage.tsdb.retention.time={{ prometheus__tsdb_retention_time | quote }} --web.page-title={{ prometheus__page_title | quote }}"
--- a/roles/prometheus/templates/docker.rules.yml.j2
+++ b/roles/prometheus/templates/docker.rules.yml.j2
@ -1,50 +0,0 @@
---
-{{ ansible_managed | comment }}
-
-{% macro raw(string) -%}
-{{ "{{" }} {{ string }} {{ "}}" }}
-{%- endmacro %}
-
-groups:
-
-  - name: docker.rules
-    rules:
-
-    - alert: ContainerDown
-      expr: docker_container_running_state != 1
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: >-
-          Le container Docker est éteint / tombé 
-          (container {{ raw('$labels.name') }})
-    
-    - alert: ContainerFailed
-      expr: sum(increase(docker_container_restart_count[5m])) > 2
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: >- 
-          Le container Docker redémarre souvent 
-          (container {{ raw('$labels.name') }})
-    
-    - alert: ContainerFailed
-      expr: 
-          (
-            docker_container_cpu_used_total 
-            / 
-            docker_container_cpu_capacity_total
-          ) * 100
-          > 30
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: >-
-          Le container Docker utilise beaucoup de CPU 
-          (container {{ raw('$labels.name') }}, 
-          valeur {{ raw('$value | printf "%.1f"') }})
-        
-...
--- a/roles/prometheus/templates/environmental.rules.yml.j2
+++ b/roles/prometheus/templates/environmental.rules.yml.j2
@ -1,52 +0,0 @@
---
-{{ ansible_managed | comment }}
-
-{% macro raw(string) -%}
-{{ "{{" }} {{ string }} {{ "}}" }}
-{%- endmacro %}
-
-groups:
-
-  - name: environmental.rules
-    rules:
-
-      - alert: EnvironmentalTemperature
-        expr: rPDU2SensorTempHumidityStatusTempC / 10 > 30
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Température environnementale à {{ raw('$value') }}°
-
-      - alert: EnvironmentalTemperature
-        expr: rPDU2SensorTempHumidityStatusTempC / 10 > 40
-        for: 10m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Température environnementale à {{ raw('$value') }}° 
-
-
-      - alert: EnvironmentalTemperature
-        expr: xupsEnvRemoteTemp > 30
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Température environnementale à {{ raw('$value') }}°
-
-      - alert: EnvironmentalTemperature
-        expr: xupsEnvRemoteTemp > 40
-        for: 10m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Température environnementale à {{ raw('$value') }}° 
-
-
-
-...
--- a/roles/prometheus/templates/ilo.rules.yml.j2
+++ b/roles/prometheus/templates/ilo.rules.yml.j2
@ -1,83 +0,0 @@
---
-{{ ansible_managed | comment }}
-
-{% macro raw(string) -%}
-{{ "{{" }} {{ string }} {{ "}}" }}
-{%- endmacro %}
-
-groups:
-
-  - name: ilo.rules
-    rules:
-
-      - alert: IloResilientMemoryDegraded
-        expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
-        for: 3m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            La mémoire vive n'est plus résiliente
-            ({{ raw('$labels.cpqHeResilientMemCondition') }})
-
-      - alert: IloBiosSelfTestDegraded
-        expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
-        for: 3m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Une erreur a été détectée lors du POST du serveur
-            ({{ raw('$labels.cpqHeHWBiosCondition') }})
-
-      - alert: IloBatteryDegraded
-        expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
-        for: 3m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            La batterie est dégradée
-            ({{ raw('$labels.cpqHeSysBatteryCondition') }})
-
-      - alert: IloTemperatureSensorDegraded
-        expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
-        for: 3m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Le capteur de température est dégradé
-            ({{ raw('$labels.cpqHeTemperatureCondition') }})
-
-      - alert: IloFanDegraded
-        expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1
-        for: 3m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Le ventilateur est dégradé
-            ({{ raw('$labels.cpqHeFltTolFanCondition') }})
-
-      - alert: IloPowerSupplyDegraded
-        expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1
-        for: 3m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            L'alimentation est dégradée
-            ({{ raw('$labels.cpqHeFltTolPowerSupplyStatus') }})
-
-      - alert: IloOverrideSwitchState
-        expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1
-        for: 3m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Le switch de réinitialisation n'est pas à l'état d'origine,
-            l'authentification est bypassée
-
-...
--- a/roles/prometheus/templates/postgres.rules.yml.j2
+++ b/roles/prometheus/templates/postgres.rules.yml.j2
@ -1,219 +0,0 @@
---
-{{ ansible_managed | comment }}
-
-{% macro raw(string) -%}
-{{ "{{" }} {{ string }} {{ "}}" }}
-{%- endmacro %}
-
-groups:
-
-  - name: postgres.rules
-    rules:
-      - alert: PostgresqlDown
-        expr: pg_up == 0
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Serveur PostgreSQL down
-
-      - alert: PostgresqlRestarted
-        expr: time() - pg_postmaster_start_time_seconds < 60
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Serveur PostgreSQL redémarré
-
-      - alert: PostgresqlExporterError
-        expr: pg_exporter_last_scrape_error > 0
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: Erreur dans l'exporter PostgreSQL
-
-      - alert: PostgresqlReplicationLag
-        expr:
-          pg_replication_lag > 30
-          and
-          ON(instance) pg_replication_is_replica == 1
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            La réplication PostgreSQL lag ({{ raw('$value') }} > 30s)
-            (base de données {{ raw('$labels.datname') }} )
-
-      - alert: PostgresqlTableNotVaccumed
-        expr:
-          time() - pg_stat_user_tables_last_autovacuum
-          > 60 * 60 * 24
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Le démon autovacuum n'a pas été lancé depuis 24h
-            (base de données {{ raw('$labels.datname') }} )
-
-      - alert: PostgresqlTableNotAnalyzed
-        expr:
-          time() - pg_stat_user_tables_last_autoanalyze
-          > 60 * 60 * 24
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Table non-analysée depuis 24h
-            (base de données {{ raw('$labels.datname') }})
-
-      - alert: PostgresqlTooManyConnections
-        expr:
-          (
-            sum by (datname)
-              (pg_stat_activity_count{datname!~"template.*|postgres"})
-          ) * 100
-          > pg_settings_max_connections * 80
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            PostgreSQL a trop de connexions
-            ({{ raw('$value | printf "%.1f"') }} > 80%)
-            (base de données {{ raw('$labels.datname') }})
-
-      - alert: PostgresqlDeadLocks
-        expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            PostgreSQL a plus de 5 deadlocks.
-            (base de données {{ raw('$labels.datname') }} )
-
-      - alert: PostgresqlSlowQueries
-        expr: pg_slow_queries > 0
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Présence de requêtes lentes (slow-queries)
-            (base de données {{ raw('$labels.datname') }} )
-
-      - alert: PostgresqlHighRollbackRate
-        expr:
-          (
-            rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) /
-            rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
-          ) * 100 
-          > 20
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            PostgreSQL a un taux de retour en arrière (rollback) élevé
-            (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} %)
-
-      - alert: PostgresqlWaleReplicationStopped
-        expr: rate(pg_xlog_position_bytes[1m]) == 0
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Réplication de PostgreSQL WALE stoppée
-            (base de données {{ raw('$labels.datname') }} )
-
-      - alert: PostgresqlHighRateStatementTimeout
-        expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Beaucoup de requêtes PostgreSQL sont timeout
-            (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
-
-      - alert: PostgresqlHighRateDeadlock
-        expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            PostgreSQL a un fort taux de deadlock
-            (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
-
-#      - alert: PostgresqlReplicationLagBytes
-#        expr:
-#          (pg_xlog_position_bytes and pg_replication_is_replica == 0)
-#          - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1)
-#          > 1e+09
-#        for: 0m
-#        labels:
-#          severity: critical
-#        annotations:
-#          summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} )
-
-      - alert: PostgresqlTooManyDeadTuples
-        expr:
-          (
-            (pg_stat_user_tables_n_dead_tup > 10000)
-            / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
-          ) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Les tuples morts PostgreSQL sont trop volumineux
-            (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
-
-      - alert: PostgresqlSplitBrain
-        expr: count(pg_replication_is_replica == 0) != 1
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Split Brain : trop de bases de données PostgreSQL primaires en mode lecture-écriture
-            (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }} )
-
-      - alert: PostgresqlPromotedNode
-        expr:
-          pg_replication_is_replica
-          and
-          changes(pg_replication_is_replica[1m]) > 0
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Le serveur de secours PostgreSQL a été promu comme nœud principal
-            (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }})
-
-      - alert: PostgresqlTooManyLocksAcquired
-        expr:
-          (
-            (sum (pg_locks_count))
-            / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
-          ) * 100 > 20
-        for: 2m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Trop de deadlocks acquis sur la base de données.
-            Si cette alerte se produit fréquemment, nous devrons peut-être augmenter
-            le paramètre postgres max_locks_per_transaction
-            (Valeur = {{ raw('$value | printf "%.1f"') }} )
-
-...
-
--- a/roles/prometheus/templates/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@ -1,6 +1,10 @@
 ---
 {{ ansible_managed | comment }}

+{{ prometheus__config | to_nice_yaml }}
+...
+
+{#
 global:
  # scrape_interval is set to the global default (60s)
  # evaluation_interval is set to the global default (60s)
@ -156,5 +160,5 @@ scrape_configs:
      - target_label: __address__
        replacement: 127.0.0.1:9116
 {% endif %}
-
 ...
+#}
--- a/roles/prometheus/templates/rules.yml.j2
+++ b/roles/prometheus/templates/rules.yml.j2
@ -0,0 +1,5 @@
+---
+{{ ansible_managed | comment }}
+
+{{ prometheus__rules | to_nice_yaml }}
+...
--- a/roles/prometheus/templates/server.rules.yml.j2
+++ b/roles/prometheus/templates/server.rules.yml.j2
@ -1,156 +0,0 @@
---
-{{ ansible_managed | comment }}
-
-{% macro raw(string) -%}
-{{ "{{" }} {{ string }} {{ "}}" }}
-{%- endmacro %}
-
-groups:
-
-  - name: server.rules
-    rules:
-
-      - alert: MachineDown
-        expr: up{instance!~".*.borne.auro.re$"} == 0
-        for: 3m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Le collecteur {{ raw('$labels.job') }} ne marche plus
-
-      - alert: AccessPointDown
-        expr: up{instance=~".*.borne.auro.re$"} == 0
-        for: 3m
-        labels:
-          severity: warning
-
-      - alert: OutOfMemory
-        expr: >-
-          (
-            node_memory_MemFree_bytes
-            + node_memory_Cached_bytes
-            + node_memory_Buffers_bytes
-          ) / node_memory_MemTotal_bytes * 100 < 10
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            {{ raw('$value | printf "%.1f"') }}% de mémoire
-            libre
-
-      - alert: HostSwapIsFillingUp
-        expr: >-
-          (
-            1 - (
-              node_memory_SwapFree_bytes
-              / node_memory_SwapTotal_bytes
-            )
-          ) * 100 >= 50
-        for: 3m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}%
-
-      - alert: HostPhysicalComponentTooHot
-        expr: node_hwmon_temp_celsius > 79
-        for: 3m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            La température de l'hôte est de {{ raw('$value') }}°C
-            ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
-
-      - alert: HostNodeOvertemperatureAlarm
-        expr: node_hwmon_temp_crit_alarm_celsius == 1
-        for: 0m
-        labels: 
-          severity: critical
-        annotations:
-          summary: >-
-            L'alarme de température de l'hôte est active
-            ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
-
-      - alert: HostOomKillDetected
-        expr: increase(node_vmstat_oom_kill[1m]) > 0
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: Le noyau a tué {{ raw('$value') }} processus (OOM killer)
-
-      - alert: HostEdacCorrectableErrorsDetected
-        expr: increase(node_edac_correctable_errors_total[1m]) > 0
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            {{ raw('$value | print "%.1f"') }} erreur(s) ont été
-            corrigée(s) (EDAC)
-
-      - alert: OutOfDiskSpace
-        expr: >-
-          node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            {{ raw('$value | printf "%.1f"') }}% d'espace libre pour
-            {{ raw('$labels.mountpoint') }}
-
-      - alert: OutOfInodes
-        expr: node_filesystem_files_free / node_filesystem_files * 100 < 10
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            {{ raw('$value | printf "%.1f"') }}% d'inodes
-            restants pour {{ raw('$labels.mountpoint') }}
-
-      - alert: CpuUsage
-        expr: >-
-          (
-            100 - avg by (instance) (
-              irate(node_cpu_seconds_total{mode="idle"}[5m])
-            ) * 100
-          ) > 75
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            CPU à {{ raw('$value | printf "%.1f"') }}%
-
-      - alert: SystemdServiceFailed
-        expr: node_systemd_unit_state{state="failed"} == 1
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            {{ raw('$labels.name') }} a échoué
-
-      - alert: LoadUsage
-        expr: node_load1 > 5
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: >
-            Charge à {{ raw('$value') }}
-      
-      - alert: UnhealthyDisk
-        expr: smartmon_device_smart_healthy < 1 
-        for: 10m
-        labels:
-          severity: "critical"
-        annotations:
-          summary: "Le Disque {{ raw('$labels.disk') }} n'est pas en bonne santé !"
-...
--- a/roles/prometheus/templates/snmp.yml.j2
+++ b/roles/prometheus/templates/snmp.yml.j2
@ -1,708 +0,0 @@
---
-{{ ansible_managed | comment }}
-
-# TODOlist :
-# - Faire fonctionner le monitoring des switchs défini ici
-#   * Configurer tous les switchs avec un compte SNMPv3
-#   * Mettre l'inventaire des switchs dans Ansible
-# - Optimiser les règles pour les bornes Unifi,
-#   on pourrait indexer avec les SSID
-
-eatonups:
-  walk:
-  - 1.3.6.1.2.1.33.1.2
-  - 1.3.6.1.2.1.33.1.3
-  - 1.3.6.1.2.1.33.1.4
-  - 1.3.6.1.4.1.534.1.6
-  - 1.3.6.1.4.1.318.1.1.10.2.3.2.1.4
-  get:
-  - 1.3.6.1.2.1.1.3.0
-  metrics:
-  - name: sysUpTime
-    oid: 1.3.6.1.2.1.1.3
-    type: gauge
-    help: The time (in hundredths of a second) since the network management portion
-      of the system was last re-initialized. - 1.3.6.1.2.1.1.3
-  - name: upsBatteryStatus
-    oid: 1.3.6.1.2.1.33.1.2.1
-    type: gauge
-    help: The indication of the capacity remaining in the UPS system's batteries -
-      1.3.6.1.2.1.33.1.2.1
-  - name: upsEstimatedMinutesRemaining
-    oid: 1.3.6.1.2.1.33.1.2.3
-    type: gauge
-    help: An estimate of the time to battery charge depletion under the present load
-      conditions if the utility power is off and remains off, or if it were to be
-      lost and remain off. - 1.3.6.1.2.1.33.1.2.3
-  - name: upsInputVoltage
-    oid: 1.3.6.1.2.1.33.1.3.3.1.3
-    type: gauge
-    help: The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3
-    indexes:
-    - labelname: upsInputLineIndex
-      type: gauge
-  - name: upsOutputSource
-    oid: 1.3.6.1.2.1.33.1.4.1
-    type: gauge
-    help: The present source of output power - 1.3.6.1.2.1.33.1.4.1
-  - name: upsOutputVoltage
-    oid: 1.3.6.1.2.1.33.1.4.4.1.2
-    type: gauge
-    help: The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2
-    indexes:
-    - labelname: upsOutputLineIndex
-      type: gauge
-  - name: upsOutputPower
-    oid: 1.3.6.1.2.1.33.1.4.4.1.4
-    type: gauge
-    help: The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4
-    indexes:
-    - labelname: upsOutputLineIndex
-      type: gauge
-  - name: upsOutputPercentLoad
-    oid: 1.3.6.1.2.1.33.1.4.4.1.5
-    type: gauge
-    help: The percentage of the UPS power capacity presently being used on this output
-      line, i.e., the greater of the percent load of true power capacity and the percent
-      load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5
-    indexes:
-    - labelname: upsOutputLineIndex
-      type: gauge
-  - name: xupsEnvRemoteTemp
-    oid: 1.3.6.1.4.1.318.1.1.10.2.3.2.1.4
-    type: gauge
-    help: The reading of an EMP's temperature sensor (APC MIB) - 1.3.6.1.4.1.318.1.1.10.2.3.2.1.4
-  - name: xupsEnvRemoteTemp
-    oid: 1.3.6.1.4.1.534.1.6.5
-    type: gauge
-    help: The reading of an EMP's temperature sensor (Eaton MIB) - 1.3.6.1.4.1.534.1.6.5
-  - name: xupsEnvRemoteHumidity
-    oid: 1.3.6.1.4.1.534.1.6.6
-    type: gauge
-    help: The reading of an EMP's humidity sensor. - 1.3.6.1.4.1.534.1.6.6
-  version: 1
-  auth:
-    community: public
-
-procurve_switch:
-  walk:
-    - 1.3.6.1.2.1.31.1.1.1.10
-    - 1.3.6.1.2.1.31.1.1.1
-    - 1.3.6.1.2.1.2.2.1.2
-    - 1.3.6.1.2.1.31.1.1.1.18
-  get:
-    - 1.3.6.1.2.1.1.3.0
-    - 1.3.6.1.2.1.1.5.0
-    - 1.3.6.1.2.1.1.6.0
-  metrics:
-    - name: sysUpTime
-      oid: 1.3.6.1.2.1.1.3
-      type: gauge
-      help: The time (in hundredths of a second) since the network management
-        portion of the system was last re-initialized. - 1.3.6.1.2.1.1.3
-    - name: sysName
-      oid: 1.3.6.1.2.1.1.5
-      type: DisplayString
-      help: An administratively-assigned name for this managed node
-        - 1.3.6.1.2.1.1.5
-    - name: sysLocation
-      oid: 1.3.6.1.2.1.1.6
-      type: DisplayString
-      help: The physical location of this node (e.g., 'telephone closet, 3rd
-        floor') - 1.3.6.1.2.1.1.6
-    - name: ifHCOutOctets
-      oid: 1.3.6.1.2.1.31.1.1.1.10
-      type: counter
-      help: The total number of octets transmitted out of the interface,
-        including framing characters - 1.3.6.1.2.1.31.1.1.1.10
-      indexes:
-        - labelname: ifIndex
-          type: gauge
-      lookups:
-        - labels:
-            - ifIndex
-          labelname: ifDescr
-          oid: 1.3.6.1.2.1.2.2.1.2
-          type: DisplayString
-        - labels:
-            - ifIndex
-          labelname: ifName
-          oid: 1.3.6.1.2.1.31.1.1.1.1
-          type: DisplayString
-    - name: ifHCInOctets
-      oid: 1.3.6.1.2.1.31.1.1.1.6
-      type: counter
-      help: The total number of octets received on the interface, including
-        framing characters - 1.3.6.1.2.1.31.1.1.1.6
-      indexes:
-        - labelname: ifIndex
-          type: gauge
-      lookups:
-        - labels:
-            - ifIndex
-          labelname: ifDescr
-          oid: 1.3.6.1.2.1.2.2.1.2
-          type: DisplayString
-        - labels:
-            - ifIndex
-          labelname: ifName
-          oid: 1.3.6.1.2.1.31.1.1.1.1
-          type: DisplayString
-  version: 2
-  auth:
-    community: "{{ snmp_switch_community }}"
-
-ubiquiti_unifi:
-  walk:
-  - 1.3.6.1.4.1.41112.1.6
-  get:
-  - 1.3.6.1.2.1.1.5.0
-  - 1.3.6.1.2.1.1.6.0
-  metrics:
-# Pour faire une WifiMap un jour, on peut entrer la location dans la conf des bornes
-#  - name: sysLocation
-#    oid: 1.3.6.1.2.1.1.6
-#    type: DisplayString
-#    help: The physical location of this node (e.g., 'telephone closet, 3rd floor')
-#      - 1.3.6.1.2.1.1.6
-  - name: unifiVapIndex
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.1
-    type: gauge
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.1'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapChannel
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.4
-    type: gauge
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.4'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapEssId
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.6
-    type: DisplayString
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.6'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapName
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.7
-    type: DisplayString
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.7'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifi_vap_num_stations
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8
-    type: gauge
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8'
-    indexes:
-    - labelname: unifi_vap_index
-      type: gauge
-    lookups:
-    - labels: [unifi_vap_index]
-      labelname: unifi_vap_essid
-      oid: 1.3.6.1.4.1.41112.1.6.1.2.1.6
-      type: DisplayString
-    - labels: [unifi_vap_index]
-      labelname: unifi_vap_radio
-      oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9
-      type: DisplayString
-    - labels: []
-      labelname: unifi_vap_index
-#  - name: unifiVapNumStations
-#    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8
-#    type: gauge
-#    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8'
-#    indexes:
-#    - labelname: unifiVapIndex
-#      type: gauge
-  - name: unifiVapRadio
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9
-    type: DisplayString
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.9'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapRxBytes
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.10
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.10'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapRxCrypts
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.11
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.11'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapRxDropped
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.12
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.12'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapRxErrors
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.13
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.13'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapRxFrags
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.14
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.14'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapRxPackets
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.15
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.15'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapTxBytes
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.16
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.16'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapTxDropped
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.17
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.17'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapTxErrors
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.18
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.18'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapTxPackets
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.19
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.19'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapTxRetries
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.20
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.20'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapTxPower
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.21
-    type: gauge
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.21'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapUp
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.22
-    type: gauge
-    help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.22'
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiVapUsage
-    oid: 1.3.6.1.4.1.41112.1.6.1.2.1.23
-    type: DisplayString
-    help: guest or regular user - 1.3.6.1.4.1.41112.1.6.1.2.1.23
-    indexes:
-    - labelname: unifiVapIndex
-      type: gauge
-  - name: unifiIfIndex
-    oid: 1.3.6.1.4.1.41112.1.6.2.1.1.1
-    type: gauge
-    help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.1'
-    indexes:
-    - labelname: unifiIfIndex
-      type: gauge
-  - name: unifiIfName
-    oid: 1.3.6.1.4.1.41112.1.6.2.1.1.5
-    type: DisplayString
-    help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.5'
-    indexes:
-    - labelname: unifiIfIndex
-      type: gauge
-  - name: unifiIfRxBytes
-    oid: 1.3.6.1.4.1.41112.1.6.2.1.1.6
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.6'
-    indexes:
-    - labelname: unifiIfIndex
-      type: gauge
-  - name: unifiIfRxDropped
-    oid: 1.3.6.1.4.1.41112.1.6.2.1.1.7
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.7'
-    indexes:
-    - labelname: unifiIfIndex
-      type: gauge
-  - name: unifiIfRxError
-    oid: 1.3.6.1.4.1.41112.1.6.2.1.1.8
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.8'
-    indexes:
-    - labelname: unifiIfIndex
-      type: gauge
-  - name: unifiIfRxMulticast
-    oid: 1.3.6.1.4.1.41112.1.6.2.1.1.9
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.9'
-    indexes:
-    - labelname: unifiIfIndex
-      type: gauge
-  - name: unifiIfRxPackets
-    oid: 1.3.6.1.4.1.41112.1.6.2.1.1.10
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.10'
-    indexes:
-    - labelname: unifiIfIndex
-      type: gauge
-  - name: unifiIfTxBytes
-    oid: 1.3.6.1.4.1.41112.1.6.2.1.1.12
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.12'
-    indexes:
-    - labelname: unifiIfIndex
-      type: gauge
-  - name: unifiIfTxDropped
-    oid: 1.3.6.1.4.1.41112.1.6.2.1.1.13
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.13'
-    indexes:
-    - labelname: unifiIfIndex
-      type: gauge
-  - name: unifiIfTxError
-    oid: 1.3.6.1.4.1.41112.1.6.2.1.1.14
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.14'
-    indexes:
-    - labelname: unifiIfIndex
-      type: gauge
-  - name: unifiIfTxPackets
-    oid: 1.3.6.1.4.1.41112.1.6.2.1.1.15
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.15'
-    indexes:
-    - labelname: unifiIfIndex
-      type: gauge
-  - name: unifiApSystemModel
-    oid: 1.3.6.1.4.1.41112.1.6.3.3
-    type: DisplayString
-    help: ' - 1.3.6.1.4.1.41112.1.6.3.3'
-  - name: unifiApSystemUptime
-    oid: 1.3.6.1.4.1.41112.1.6.3.5
-    type: counter
-    help: ' - 1.3.6.1.4.1.41112.1.6.3.5'
-  version: 3
-  auth:
-    security_level: authPriv
-    username: snmp_prometheus
-    password: {{ snmp_unifi_password }}
-    auth_protocol: SHA
-    priv_protocol: AES
-    priv_password: {{ snmp_unifi_password }}
-
-
-apc_pdu:
-  walk:
-  - 1.3.6.1.4.1.318.1.1.26.10.2.2.1.8
-  - 1.3.6.1.4.1.318.1.1.26.4.3.1.4
-  - 1.3.6.1.4.1.318.1.1.26.4.3.1.5
-  - 1.3.6.1.4.1.318.1.1.26.4.3.1.6
-  - 1.3.6.1.4.1.318.1.1.26.6.3.1.9
-  - 1.3.6.1.4.1.318.1.1.26.9.4.3.1.7
-  metrics:
-  - name: rPDU2SensorTempHumidityStatusTempC
-    oid: 1.3.6.1.4.1.318.1.1.26.10.2.2.1.8
-    type: gauge
-    help: Sensor temperature reading in tenths of degrees Celsius - 1.3.6.1.4.1.318.1.1.26.10.2.2.1.8
-    indexes:
-    - labelname: rPDU2SensorTempHumidityStatusIndex
-      type: gauge
-  - name: rPDU2DeviceStatusLoadState
-    oid: 1.3.6.1.4.1.318.1.1.26.4.3.1.4
-    type: gauge
-    help: Indicates the present load status of the Rack PDU - 1.3.6.1.4.1.318.1.1.26.4.3.1.4
-    indexes:
-    - labelname: rPDU2DeviceStatusIndex
-      type: gauge
-  - name: rPDU2DeviceStatusPower
-    oid: 1.3.6.1.4.1.318.1.1.26.4.3.1.5
-    type: gauge
-    help: The power consumption of the Rack PDU load in hundredths of kilowatts -
-      1.3.6.1.4.1.318.1.1.26.4.3.1.5
-    indexes:
-    - labelname: rPDU2DeviceStatusIndex
-      type: gauge
-  - name: rPDU2DeviceStatusPeakPower
-    oid: 1.3.6.1.4.1.318.1.1.26.4.3.1.6
-    type: gauge
-    help: The peak power consumption of the Rack PDU load in hundredths of kilowatts
-      - 1.3.6.1.4.1.318.1.1.26.4.3.1.6
-    indexes:
-    - labelname: rPDU2DeviceStatusIndex
-      type: gauge
-  - name: rPDU2PhaseStatusPowerFactor
-    oid: 1.3.6.1.4.1.318.1.1.26.6.3.1.9
-    type: gauge
-    help: Indicates the load power factor, in hundredths, of the Rack PDU phase being
-      queried - 1.3.6.1.4.1.318.1.1.26.6.3.1.9
-    indexes:
-    - labelname: rPDU2PhaseStatusIndex
-      type: gauge
-  - name: rPDU2OutletMeteredStatusPower
-    oid: 1.3.6.1.4.1.318.1.1.26.9.4.3.1.7
-    type: gauge
-    help: Indicates the power draw of the load on the Rack PDU outlet being queried
-      - 1.3.6.1.4.1.318.1.1.26.9.4.3.1.7
-    indexes:
-    - labelname: rPDU2OutletMeteredStatusIndex
-      type: gauge
-  version: 3
-  auth:
-    security_level: authPriv
-    username: {{ snmp_pdu_user }}
-    password: {{ snmp_pdu_password }}
-    auth_protocol: SHA
-    priv_protocol: AES
-    priv_password: {{ snmp_pdu_password }}
-
-ilo:
-  walk:
-  - 1.3.6.1.4.1.232.6.2.14.4 # Resilient memory
-  - 1.3.6.1.4.1.232.6.2.15.3 # Power meter
-  - 1.3.6.1.4.1.232.6.2.16.1 # POST tests
-  - 1.3.6.1.4.1.232.6.2.17.1 # Battery
-  - 1.3.6.1.4.1.232.6.2.6.8.1.3 # Temperature sensors location
-  - 1.3.6.1.4.1.232.6.2.6.8.1.4 # Temperature sensors value
-  - 1.3.6.1.4.1.232.6.2.6.8.1.5 # Temperature sensors limit
-  - 1.3.6.1.4.1.232.6.2.6.8.1.6 # Temperature sensors condition
-  - 1.3.6.1.4.1.232.6.2.6.7.1.3 # Fans location
-  - 1.3.6.1.4.1.232.6.2.6.7.1.9 # Fans condition
-  - 1.3.6.1.4.1.232.6.2.9.3.1.5 # Power supply
-  - 1.3.6.1.4.1.232.9.2.2 # iLO
-  metrics:
-  - name: cpqHeResilientMemCondition
-    oid: 1.3.6.1.4.1.232.6.2.14.4
-    type: EnumAsStateSet
-    help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4
-    enum_values:
-      1: other
-      2: ok
-      3: degraded
-      4: failed
-  - name: cpqHePowerMeterCurrReading
-    oid: 1.3.6.1.4.1.232.6.2.15.3
-    type: gauge
-    help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
-  - name: cpqHeHWBiosCondition
-    oid: 1.3.6.1.4.1.232.6.2.16.1
-    type: EnumAsStateSet
-    help: This value indicates an error has been detected during Pre-OS Test (POST)
-      or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
-    enum_values:
-      1: other
-      2: ok
-      3: degraded
-      4: failed
-  - name: cpqHeSysBatteryCondition
-    oid: 1.3.6.1.4.1.232.6.2.17.1
-    type: EnumAsStateSet
-    help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1
-    indexes:
-    - labelname: cpqHeSysBatteryChassis
-      type: gauge
-    - labelname: cpqHeSysBatteryIndex
-      type: gauge
-    enum_values:
-      1: other
-      2: ok
-      3: degraded
-      4: failed
-  - name: cpqHeTemperatureLocale
-    oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
-    type: EnumAsInfo
-    help: This specifies the location of the temperature sensor present in the system.
-      - 1.3.6.1.4.1.232.6.2.6.8.1.3
-    indexes:
-    - labelname: cpqHeTemperatureChassis
-      type: gauge
-    - labelname: cpqHeTemperatureIndex
-      type: gauge
-    enum_values:
-      1: other
-      2: unknown
-      3: system
-      4: systemBoard
-      5: ioBoard
-      6: cpu
-      7: memory
-      8: storage
-      9: removableMedia
-      10: powerSupply
-      11: ambient
-      12: chassis
-      13: bridgeCard
-  - name: cpqHeTemperatureCelsius
-    oid: 1.3.6.1.4.1.232.6.2.6.8.1.4
-    type: gauge
-    help: This is the current temperature sensor reading in degrees celsius - 1.3.6.1.4.1.232.6.2.6.8.1.4
-    indexes:
-    - labelname: cpqHeTemperatureChassis
-      type: gauge
-    - labelname: cpqHeTemperatureIndex
-      type: gauge
-  - name: cpqHeTemperatureThreshold
-    oid: 1.3.6.1.4.1.232.6.2.6.8.1.5
-    type: gauge
-    help: This is the shutdown threshold temperature sensor setting in degrees celsius
-      - 1.3.6.1.4.1.232.6.2.6.8.1.5
-    indexes:
-    - labelname: cpqHeTemperatureChassis
-      type: gauge
-    - labelname: cpqHeTemperatureIndex
-      type: gauge
-  - name: cpqHeTemperatureCondition
-    oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
-    type: EnumAsStateSet
-    help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
-    indexes:
-    - labelname: cpqHeTemperatureChassis
-      type: gauge
-    - labelname: cpqHeTemperatureIndex
-      type: gauge
-    enum_values:
-      1: other
-      2: ok
-      3: degraded
-      4: failed
-  - name: cpqHeFltTolFanLocale
-    oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
-    type: EnumAsInfo
-    help: This specifies the location of the fan present in the system.
-      - 1.3.6.1.4.1.232.6.2.6.7.1.3
-    indexes:
-    - labelname: cpqHeFltTolFanChassis
-      type: gauge
-    - labelname: cpqHeFltTolFanIndex
-      type: gauge
-    enum_values:
-      1: other
-      2: unknown
-      3: system
-      4: systemBoard
-      5: ioBoard
-      6: cpu
-      7: memory
-      8: storage
-      9: removableMedia
-      10: powerSupply
-      11: ambient
-      12: chassis
-      13: bridgeCard
-  - name: cpqHeFltTolFanCondition
-    oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
-    type: EnumAsStateSet
-    help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
-    indexes:
-    - labelname: cpqHeFltTolFanChassis
-      type: gauge
-    - labelname: cpqHeFltTolFanIndex
-      type: gauge
-    enum_values:
-      1: other
-      2: ok
-      3: degraded
-      4: failed
-  - name: cpqHeFltTolPowerSupplyStatus
-    oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
-    type: EnumAsStateSet
-    help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
-    indexes:
-    - labelname: cpqHeFltTolPowerSupplyChassis
-      type: gauge
-    - labelname: cpqHeFltTolPowerSupplyBay
-      type: gauge
-    enum_values:
-      1: noError
-      2: generalFailure
-      3: bistFailure
-      4: fanFailure
-      5: tempFailure
-      6: interlockOpen
-      7: epromFailed
-      8: vrefFailed
-      9: dacFailed
-      10: ramTestFailed
-      11: voltageChannelFailed
-      12: orringdiodeFailed
-      13: brownOut
-      14: giveupOnStartup
-      15: nvramInvalid
-      16: calibrationTableInvalid
-      17: noPowerInput
-  - name: cpqSm2CntlrInterfaceStatus
-    oid: 1.3.6.1.4.1.232.9.2.2.17
-    type: EnumAsStateSet
-    help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
-    enum_values:
-      1: other
-      2: ok
-      3: notResponding
-  - name: cpqSm2CntlriLOSecurityOverrideSwitchState
-    oid: 1.3.6.1.4.1.232.9.2.2.27
-    type: EnumAsStateSet
-    help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
-    enum_values:
-      1: notSupported
-      2: set
-      3: notSet
-  - name: cpqSm2CntlrLicenseActive
-    oid: 1.3.6.1.4.1.232.9.2.2.30
-    type: EnumAsStateSet
-    help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
-    enum_values:
-      1: none
-      2: iloAdvanced
-      3: iloLight
-      4: iloAdvancedBlade
-      5: iloStandard
-      6: iloEssentials
-      7: iloScaleOut
-      8: iloAdvancedPremiumSecurity
-  - name: cpqSm2CntlrServerPowerState
-    oid: 1.3.6.1.4.1.232.9.2.2.32
-    type: EnumAsStateSet
-    help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
-    enum_values:
-      1: unknown
-      2: poweredOff
-      3: poweredOn
-      4: insufficientPowerOrPowerOnDenied
-  version: 3
-  # Reduce timeout to retry faster
-  timeout: 1s
-  auth:
-    security_level: authPriv
-    username: {{ snmp_ilo_user }}
-    password: {{ snmp_ilo_auth }}
-    auth_protocol: SHA
-    priv_protocol: AES
-    priv_password: {{ snmp_ilo_priv }}
-
-...
--- a/roles/prometheus/templates/ups.rules.yml.j2
+++ b/roles/prometheus/templates/ups.rules.yml.j2
@ -1,87 +0,0 @@
---
-{{ ansible_managed | comment }}
-
-{# Emit the argument wrapped in literal double curly braces, so the rendered
-   file still contains a template expression instead of Jinja evaluating it.
-   The trailing minus strips the newline after this comment, which keeps the
-   rendered output unchanged. -#}
-{% macro raw(string) -%}
-{{ "{{" }} {{ string }} {{ "}}" }}
-{%- endmacro %}
-
-# Single rule group holding every UPS alert defined in this file.
-groups:
-
-  - name: ups.rules
-    rules:
-
-      # Critical, with no pending delay (for: 0m), whenever upsOutputSource is
-      # anything other than 3.
-      # NOTE(review): 3 is presumably the "normal" source in UPS-MIB (RFC 1628)
-      # - confirm against the MIB used by the exporter.
-      - alert: UpsOutputSourceChanged
-        expr: upsOutputSource != 3
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Source d'alimentation changée
-
-      # Warning as soon as upsBatteryStatus equals 3, which the summary
-      # describes as a low battery state. A second rule with the same alert
-      # name covers value 4 at critical severity.
-      - alert: UpsBatteryStatus
-        expr: upsBatteryStatus == 3
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            État de la batterie faible
-
-      # Critical as soon as upsBatteryStatus equals 4, which the summary
-      # describes as a critical battery state. Shares its alert name with the
-      # warning-level rule for value 3; only the severity label differs.
-      - alert: UpsBatteryStatus
-        expr: upsBatteryStatus == 4
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            État de la batterie critique
-
-      # Critical when upsOutputPercentLoad stays above 70 for 3 minutes.
-      # The summary is produced through the raw() macro so that the value
-      # expression (formatted with printf "%.1f") survives Jinja rendering.
-      - alert: UpsHighLoad
-        expr: upsOutputPercentLoad > 70
-        for: 3m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Charge de {{ raw('$value | printf "%.1f"') }}%
-
-      # Warning when upsInputVoltage stays outside the 210..250 range for
-      # 5 minutes; the summary reports the offending value suffixed with "V".
-      - alert: UpsWrongInputVoltage
-        expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Tension d'entrée de {{ raw('$value') }}V
-
-      # Warning when the output voltage is a statistical outlier for 5 minutes:
-      # it deviates from its own 1-day average by more than three 1-day
-      # standard deviations.
-      # The previous expression compared the *input* voltage and used "<",
-      # so it matched whenever the value was within the normal band.
-      # The leading "upsOutputVoltage and (...)" keeps the measured voltage as
-      # the alert value, so the summary prints volts rather than the deviation.
-      - alert: UpsWrongOutputVoltage
-        expr: >-
-          upsOutputVoltage and (
-            abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d]))
-              > 3 * stddev_over_time(upsOutputVoltage[1d])
-          )
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Tension de sortie de {{ raw('$value') }}V
-
-      # Warning, with no pending delay, when upsEstimatedMinutesRemaining drops
-      # below 8. Below 5 the critical rule with the same alert name matches as
-      # well, so both severities are active at the same time.
-      - alert: UpsTimeRemaining
-        expr: upsEstimatedMinutesRemaining < 8
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Autonomie restante de {{ raw('$value') }} min
-
-      # Critical, with no pending delay, when upsEstimatedMinutesRemaining
-      # drops below 5. Shares its alert name with the warning-level rule
-      # (threshold 8); only the threshold and severity label differ.
-      - alert: UpsTimeRemaining
-        expr: upsEstimatedMinutesRemaining < 5
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Autonomie restante de {{ raw('$value') }} min
-
-...