From 922b6894a7b68d7bd9777d373c2d6818866c2965 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Sun, 2 Apr 2023 05:08:01 +0200 Subject: [PATCH] prometheus: cleanup role (lots of features missing) --- filter_plugins/prometheus.py | 55 ++ playbooks/prometheus.yml | 602 +++++++++------ roles/prometheus/defaults/main.yml | 7 + roles/prometheus/handlers/main.yml | 14 +- roles/prometheus/tasks/main.yml | 88 +-- roles/prometheus/templates/default.j2 | 3 + .../prometheus/templates/docker.rules.yml.j2 | 50 -- .../templates/environmental.rules.yml.j2 | 52 -- roles/prometheus/templates/ilo.rules.yml.j2 | 83 -- .../templates/postgres.rules.yml.j2 | 219 ------ roles/prometheus/templates/prometheus.yml.j2 | 6 +- roles/prometheus/templates/rules.yml.j2 | 5 + .../prometheus/templates/server.rules.yml.j2 | 156 ---- roles/prometheus/templates/snmp.yml.j2 | 708 ------------------ roles/prometheus/templates/ups.rules.yml.j2 | 87 --- 15 files changed, 484 insertions(+), 1651 deletions(-) create mode 100644 filter_plugins/prometheus.py create mode 100644 roles/prometheus/defaults/main.yml create mode 100644 roles/prometheus/templates/default.j2 delete mode 100644 roles/prometheus/templates/docker.rules.yml.j2 delete mode 100644 roles/prometheus/templates/environmental.rules.yml.j2 delete mode 100644 roles/prometheus/templates/ilo.rules.yml.j2 delete mode 100644 roles/prometheus/templates/postgres.rules.yml.j2 create mode 100644 roles/prometheus/templates/rules.yml.j2 delete mode 100644 roles/prometheus/templates/server.rules.yml.j2 delete mode 100644 roles/prometheus/templates/snmp.yml.j2 delete mode 100644 roles/prometheus/templates/ups.rules.yml.j2 diff --git a/filter_plugins/prometheus.py b/filter_plugins/prometheus.py new file mode 100644 index 0000000..2494af5 --- /dev/null +++ b/filter_plugins/prometheus.py @@ -0,0 +1,55 @@ +from ansible.parsing.yaml.objects import AnsibleUnicode + +class FilterModule: + def filters(self): + return { + "prometheus__convert_jobs": convert_jobs, + "interp": interp, + } + + +def interp(string): + return AnsibleUnicode(f"{{{{ {string} }}}}") + + +def convert_jobs(config): + + for name, job in config.items(): + + config = { + "job_name": name, + "static_configs": [ + { + "targets": job["targets"], + } + ], + "params": job.get("params", {}), + } + + if "path" in job: + config["metrics_path"] = job["path"] + + if "address" in job: + + try: + replacement = f"$1:{job['address']['port']}" + except Exception: + replacement = job["address"] + + config["relabel_configs"] = [ + { + "source_labels": ["__address__"], + "target_label": "__param_target", + }, + { + "source_labels": ["__param_target"], + "target_label": "instance", + }, + { + "source_labels": ["__param_target"], + "target_label": "__address__", + "replacement": replacement, + }, + ] + + yield config diff --git a/playbooks/prometheus.yml b/playbooks/prometheus.yml index 6f16471..443f7d4 100755 --- a/playbooks/prometheus.yml +++ b/playbooks/prometheus.yml @@ -1,241 +1,369 @@ #!/usr/bin/env ansible-playbook --- -- hosts: prometheus-fleming.adm.auro.re - vars: - prometheus_alertmanager: docker-ovh.adm.auro.re:9093 - snmp_unifi_password: "{{ vault_snmp_unifi_password }}" - snmp_switch_community: "{{ vault_snmp_switch_community }}" - snmp_pdu_user: "{{ vault_snmp_pdu_user }}" - snmp_pdu_password: "{{ vault_snmp_pdu_password }}" - snmp_ilo_user: aurore - snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" - snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" - - prometheus_servers_targets: | - {{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }} - prometheus_unifi_snmp_targets: | - {{ groups['fleming_unifi'] | list | sort }} - prometheus_ilo_snmp_targets: | - {{ groups['fleming_ilo'] | list | sort }} - - update_motd: - prometheus: >- - Prometheus (en configuration fleming) est déployé (/etc/prometheus). - roles: - - prometheus - - update_motd - -- hosts: prometheus-pacaterie.adm.auro.re - vars: - prometheus_alertmanager: docker-ovh.adm.auro.re:9093 - snmp_unifi_password: "{{ vault_snmp_unifi_password }}" - snmp_switch_community: "{{ vault_snmp_switch_community }}" - snmp_pdu_user: "{{ vault_snmp_pdu_user }}" - snmp_pdu_password: "{{ vault_snmp_pdu_password }}" - snmp_ilo_user: aurore - snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" - snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" - - prometheus_servers_targets: | - {{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }} - prometheus_unifi_snmp_targets: | - {{ groups['pacaterie_unifi'] | list | sort }} - prometheus_ups_snmp_targets: - - ups-pn-1.ups.auro.re - - ups-ps-1.ups.auro.re - prometheus_ilo_snmp_targets: | - {{ groups['pacaterie_ilo'] | list | sort }} - - update_motd: - prometheus: >- - Prometheus (en configuration pacaterie) est déployé (/etc/prometheus). - roles: - - prometheus - - update_motd - -- hosts: prometheus-edc.adm.auro.re - vars: - prometheus_alertmanager: docker-ovh.adm.auro.re:9093 - snmp_unifi_password: "{{ vault_snmp_unifi_password }}" - snmp_switch_community: "{{ vault_snmp_switch_community }}" - snmp_pdu_user: "{{ vault_snmp_pdu_user }}" - snmp_pdu_password: "{{ vault_snmp_pdu_password }}" - snmp_ilo_user: aurore - snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" - snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" - - prometheus_ups_snmp_targets: - - ups-ec-1.ups.auro.re - # - ups-ec-2.ups.auro.re - - ups-ec-3.ups.auro.re - prometheus_servers_targets: | - {{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }} - prometheus_unifi_snmp_targets: | - {{ groups['edc_unifi'] | list | sort }} - prometheus_ilo_snmp_targets: | - {{ groups['edc_ilo'] | list | sort }} - - update_motd: - prometheus: >- - Prometheus (en configuration edc) est déployé (/etc/prometheus). - roles: - - prometheus - - update_motd - -- hosts: prometheus-gs.adm.auro.re - vars: - prometheus_alertmanager: docker-ovh.adm.auro.re:9093 - snmp_unifi_password: "{{ vault_snmp_unifi_password }}" - snmp_switch_community: "{{ vault_snmp_switch_community }}" - snmp_pdu_user: "{{ vault_snmp_pdu_user }}" - snmp_pdu_password: "{{ vault_snmp_pdu_password }}" - snmp_ilo_user: aurore - snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" - snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" - - prometheus_servers_targets: | - {{ groups['gs_pve'] + groups['gs_vm'] | list | sort }} - prometheus_unifi_snmp_targets: | - {{ groups['gs_unifi'] | list | sort }} - prometheus_ups_snmp_targets: - - ups-gk-1.ups.auro.re - prometheus_apc_pdu_snmp_targets: - - pdu-ga-1.ups.auro.re - prometheus_ilo_snmp_targets: | - {{ groups['gs_ilo'] | list | sort }} - - update_motd: - prometheus: >- - Prometheus (en configuration gs) est déployé (/etc/prometheus). - roles: - - prometheus - - update_motd - -- hosts: prometheus-rives.adm.auro.re - vars: - prometheus_alertmanager: docker-ovh.adm.auro.re:9093 - snmp_unifi_password: "{{ vault_snmp_unifi_password }}" - snmp_switch_community: "{{ vault_snmp_switch_community }}" - snmp_pdu_user: "{{ vault_snmp_pdu_user }}" - snmp_pdu_password: "{{ vault_snmp_pdu_password }}" - snmp_ilo_user: aurore - snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" - snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" - - prometheus_ups_snmp_targets: - - ups-r3-1.ups.auro.re - - ups-r1-1.ups.auro.re - prometheus_servers_targets: | - {{ groups['rives_pve'] + groups['rives_vm'] | list | sort }} - prometheus_unifi_snmp_targets: | - {{ groups['rives_unifi'] | list | sort }} - prometheus_ilo_snmp_targets: | - {{ groups['rives_ilo'] | list | sort }} - - update_motd: - prometheus: >- - Prometheus (en configuration rives) est déployé (/etc/prometheus). - roles: - - prometheus - - update_motd - -- hosts: prometheus-aurore.adm.auro.re - vars: - prometheus_alertmanager: docker-ovh.adm.auro.re:9093 - snmp_unifi_password: "{{ vault_snmp_unifi_password }}" - snmp_switch_community: "{{ vault_snmp_switch_community }}" - snmp_pdu_user: "{{ vault_snmp_pdu_user }}" - snmp_pdu_password: "{{ vault_snmp_pdu_password }}" - snmp_ilo_user: aurore - snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" - snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" - - prometheus_servers_targets: | - {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }} - prometheus_postgresql_targets: | - {{ groups['bdd'] + groups['radius'] | list | sort }} - prometheus_switch_snmp_targets: - - yggdrasil.switch.auro.re - - sw-pn-serveurs.switch.auro.re - - sw-ec-serveurs.switch.auro.re - - sw-gk-serveurs.switch.auro.re - - sw-fl-serveurs.switch.auro.re - - sw-ff-uplink.switch.auro.re - - sw-fl-core.switch.auro.re - - sw-fd-vcore.switch.auro.re - - sw-fl-vcore.switch.auro.re - - sw-ff-vcore.switch.auro.re - - sw-pn-core.switch.auro.re - - sw-ec-core.switch.auro.re - - sw-gk-core.switch.auro.re - - sw-r3-core.switch.auro.re - prometheus_ilo_snmp_targets: | - {{ groups['aurore_ilo'] | list | sort }} - - update_motd: - prometheus: >- - Prometheus (en configuration aurore) est déployé (/etc/prometheus). - roles: - - prometheus - - update_motd - -- hosts: prometheus-ovh.adm.auro.re - vars: - prometheus_alertmanager: docker-ovh.adm.auro.re:9093 - snmp_unifi_password: "{{ vault_snmp_unifi_password }}" - snmp_switch_community: "{{ vault_snmp_switch_community }}" - snmp_pdu_user: "{{ vault_snmp_pdu_user }}" - snmp_pdu_password: "{{ vault_snmp_pdu_password }}" - snmp_ilo_user: aurore - snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" - snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" - - prometheus_servers_targets: | - {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} - prometheus_postgresql_targets: - - bdd-ovh.adm.auro.re - prometheus_docker_targets: - - docker-ovh.adm.auro.re - - update_motd: - prometheus: >- - Prometheus (en configuration ovh) est déployé (/etc/prometheus). - roles: - - prometheus - - update_motd - -- hosts: prometheus-federate.adm.auro.re - vars: - prometheus_alertmanager: docker-ovh.adm.auro.re:9093 - snmp_unifi_password: "{{ vault_snmp_unifi_password }}" - snmp_pdu_user: "{{ vault_snmp_pdu_user }}" - snmp_pdu_password: "{{ vault_snmp_pdu_password }}" - snmp_ilo_user: aurore - snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" - snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" - - prometheus_servers_targets: - - prometheus-edc.adm.auro.re - - prometheus-gs.adm.auro.re - - prometheus-fleming.adm.auro.re - - prometheus-pacaterie.adm.auro.re - - prometheus-rives.adm.auro.re - - prometheus-aurore.adm.auro.re - - prometheus-ovh.adm.auro.re - - update_motd: - prometheus_federate: >- - Prometheus (en configuration fédération) est déployé (/etc/prometheus). - roles: - - prometheus_federate - - update_motd - -# Postgres Exporters -- hosts: bdd,radius - roles: - - prometheus_postgres - -# Monitor all hosts -- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container +- hosts: + - pve_network + - vm_network roles: - prometheus_node + +- hosts: + - prometheus-1.monit.infra.auro.re + - prometheus-2.monit.infra.auro.re + vars: + prometheus__tsdb_retention_time: 90d + prometheus__scraping: + node: + targets: "{{ ['vm_network', 'pve_network'] + | map('extract', groups) + | flatten }}" + address: + port: 9100 + prometheus__alert_rules: + node: + - alert: MachineDown + expr: "up == 0" + for: 3m + labels: + severity: critical + annotations: + summary: "Collecteur {{ '$labels.job' | interp }}" + - alert: OutOfMemory + expr: "( node_memory_MemFree_bytes + + node_memory_Cached_bytes + + node_memory_Buffers_bytes ) + / node_memory_MemTotal_bytes * 100 < 10" + for: 5m + labels: + severity: warning + annotations: + summary: "Mémoire libre à {{ '$value' | interp }}%" + - alert: HostSwapIsFillingUp + expr: "( 1 - ( node_memory_SwapFree_bytes + / node_memory_SwapTotal_bytes ) ) + * 100 >= 50" + for: 3m + labels: + severity: critical + annotations: + summary: "Swap {{ '$value' | interp }}%" + - alert: HostPhysicalComponentTooHot + expr: "node_hwmon_temp_celsius > 79" + for: 3m + labels: + severity: critical + annotations: + summary: "{{ '$value' | interp }}°C : + {{ '$labels.chip' | interp }}, + {{ '$labels.sensor' | interp }}" + - alert: HostNodeOvertemperatureAlarm + expr: "node_hwmon_temp_crit_alarm_celsius == 1" + for: 0m + labels: + severity: critical + annotations: + summary: "{{ '$labels.chip' | interp }}, + {{ '$labels.sensor' | interp }}" + - alert: HostOomKillDetected + expr: "increase(node_vmstat_oom_kill[1m]) > 0" + for: 0m + labels: + severity: warning + annotations: + summary: "PID {{ '$value' | interp }}" + - alert: HostEdacCorrectableErrorsDetected + expr: "increase(node_edac_correctable_errors_total[1m]) > 0" + for: 0m + labels: + severity: warning + annotations: + summary: "{{ '$value' | interp }} erreurs corrigées" + - alert: OutOfDiskSpace + expr: "node_filesystem_free_bytes + / node_filesystem_size_bytes * 100 < 10" + for: 5m + labels: + severity: warning + annotations: + summary: "{{ '$labels.mountpoint' | interp }} : + {{ '$value' | interp }}% libre" + - alert: OutOfInodes + expr: "node_filesystem_files_free + / node_filesystem_files * 100 < 10" + for: 3m + labels: + severity: warning + annotations: + summary: "{{ '$labels.mountpoint' | interp }} : + {{ '$value' | interp }}% libre" + - alert: CpuUsage + expr: '( 100 - avg by (instance) + ( irate(node_cpu_seconds_total{mode="idle"}[5m]) ) + * 100 ) > 75' + for: 10m + labels: + severity: warning + annotations: + summary: "{{ '$value' | interp }}%" + - alert: SystemdServiceFailed + expr: 'node_systemd_unit_state{state="failed"} == 1' + for: 10m + labels: + severity: warning + annotations: + summary: "{{ '$labels.name' | interp }}" + - alert: LoadUsage + expr: "node_load1 > 5" + for: 2m + labels: + severity: warning + annotations: + summary: "{{ '$value' | interp }}" + - alert: UnhealthyDisk + expr: "smartmon_device_smart_healthy < 1" + for: 10m + labels: + severity: critical + annotations: + summary: "{{ '$labels.disk' | interp }}" + roles: + - prometheus + +#- hosts: prometheus-fleming.adm.auro.re +# vars: +# prometheus_alertmanager: docker-ovh.adm.auro.re:9093 +# snmp_unifi_password: "{{ vault_snmp_unifi_password }}" +# snmp_switch_community: "{{ vault_snmp_switch_community }}" +# snmp_pdu_user: "{{ vault_snmp_pdu_user }}" +# snmp_pdu_password: "{{ vault_snmp_pdu_password }}" +# snmp_ilo_user: aurore +# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" +# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" +# +# prometheus_servers_targets: | +# {{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }} +# prometheus_unifi_snmp_targets: | +# {{ groups['fleming_unifi'] | list | sort }} +# prometheus_ilo_snmp_targets: | +# {{ groups['fleming_ilo'] | list | sort }} +# +# update_motd: +# prometheus: >- +# Prometheus (en configuration fleming) est déployé (/etc/prometheus). +# roles: +# - prometheus +# - update_motd +# +#- hosts: prometheus-pacaterie.adm.auro.re +# vars: +# prometheus_alertmanager: docker-ovh.adm.auro.re:9093 +# snmp_unifi_password: "{{ vault_snmp_unifi_password }}" +# snmp_switch_community: "{{ vault_snmp_switch_community }}" +# snmp_pdu_user: "{{ vault_snmp_pdu_user }}" +# snmp_pdu_password: "{{ vault_snmp_pdu_password }}" +# snmp_ilo_user: aurore +# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" +# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" +# +# prometheus_servers_targets: | +# {{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }} +# prometheus_unifi_snmp_targets: | +# {{ groups['pacaterie_unifi'] | list | sort }} +# prometheus_ups_snmp_targets: +# - ups-pn-1.ups.auro.re +# - ups-ps-1.ups.auro.re +# prometheus_ilo_snmp_targets: | +# {{ groups['pacaterie_ilo'] | list | sort }} +# +# update_motd: +# prometheus: >- +# Prometheus (en configuration pacaterie) est déployé (/etc/prometheus). +# roles: +# - prometheus +# - update_motd +# +#- hosts: prometheus-edc.adm.auro.re +# vars: +# prometheus_alertmanager: docker-ovh.adm.auro.re:9093 +# snmp_unifi_password: "{{ vault_snmp_unifi_password }}" +# snmp_switch_community: "{{ vault_snmp_switch_community }}" +# snmp_pdu_user: "{{ vault_snmp_pdu_user }}" +# snmp_pdu_password: "{{ vault_snmp_pdu_password }}" +# snmp_ilo_user: aurore +# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" +# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" +# +# prometheus_ups_snmp_targets: +# - ups-ec-1.ups.auro.re +# # - ups-ec-2.ups.auro.re +# - ups-ec-3.ups.auro.re +# prometheus_servers_targets: | +# {{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }} +# prometheus_unifi_snmp_targets: | +# {{ groups['edc_unifi'] | list | sort }} +# prometheus_ilo_snmp_targets: | +# {{ groups['edc_ilo'] | list | sort }} +# +# update_motd: +# prometheus: >- +# Prometheus (en configuration edc) est déployé (/etc/prometheus). +# roles: +# - prometheus +# - update_motd +# +#- hosts: prometheus-gs.adm.auro.re +# vars: +# prometheus_alertmanager: docker-ovh.adm.auro.re:9093 +# snmp_unifi_password: "{{ vault_snmp_unifi_password }}" +# snmp_switch_community: "{{ vault_snmp_switch_community }}" +# snmp_pdu_user: "{{ vault_snmp_pdu_user }}" +# snmp_pdu_password: "{{ vault_snmp_pdu_password }}" +# snmp_ilo_user: aurore +# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" +# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" +# +# prometheus_servers_targets: | +# {{ groups['gs_pve'] + groups['gs_vm'] | list | sort }} +# prometheus_unifi_snmp_targets: | +# {{ groups['gs_unifi'] | list | sort }} +# prometheus_ups_snmp_targets: +# - ups-gk-1.ups.auro.re +# prometheus_apc_pdu_snmp_targets: +# - pdu-ga-1.ups.auro.re +# prometheus_ilo_snmp_targets: | +# {{ groups['gs_ilo'] | list | sort }} +# +# update_motd: +# prometheus: >- +# Prometheus (en configuration gs) est déployé (/etc/prometheus). +# roles: +# - prometheus +# - update_motd +# +#- hosts: prometheus-rives.adm.auro.re +# vars: +# prometheus_alertmanager: docker-ovh.adm.auro.re:9093 +# snmp_unifi_password: "{{ vault_snmp_unifi_password }}" +# snmp_switch_community: "{{ vault_snmp_switch_community }}" +# snmp_pdu_user: "{{ vault_snmp_pdu_user }}" +# snmp_pdu_password: "{{ vault_snmp_pdu_password }}" +# snmp_ilo_user: aurore +# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" +# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" +# +# prometheus_ups_snmp_targets: +# - ups-r3-1.ups.auro.re +# - ups-r1-1.ups.auro.re +# prometheus_servers_targets: | +# {{ groups['rives_pve'] + groups['rives_vm'] | list | sort }} +# prometheus_unifi_snmp_targets: | +# {{ groups['rives_unifi'] | list | sort }} +# prometheus_ilo_snmp_targets: | +# {{ groups['rives_ilo'] | list | sort }} +# +# update_motd: +# prometheus: >- +# Prometheus (en configuration rives) est déployé (/etc/prometheus). +# roles: +# - prometheus +# - update_motd +# +#- hosts: prometheus-aurore.adm.auro.re +# vars: +# prometheus_alertmanager: docker-ovh.adm.auro.re:9093 +# snmp_unifi_password: "{{ vault_snmp_unifi_password }}" +# snmp_switch_community: "{{ vault_snmp_switch_community }}" +# snmp_pdu_user: "{{ vault_snmp_pdu_user }}" +# snmp_pdu_password: "{{ vault_snmp_pdu_password }}" +# snmp_ilo_user: aurore +# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" +# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" +# +# prometheus_servers_targets: | +# {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }} +# prometheus_postgresql_targets: | +# {{ groups['bdd'] + groups['radius'] | list | sort }} +# prometheus_switch_snmp_targets: +# - yggdrasil.switch.auro.re +# - sw-pn-serveurs.switch.auro.re +# - sw-ec-serveurs.switch.auro.re +# - sw-gk-serveurs.switch.auro.re +# - sw-fl-serveurs.switch.auro.re +# - sw-ff-uplink.switch.auro.re +# - sw-fl-core.switch.auro.re +# - sw-fd-vcore.switch.auro.re +# - sw-fl-vcore.switch.auro.re +# - sw-ff-vcore.switch.auro.re +# - sw-pn-core.switch.auro.re +# - sw-ec-core.switch.auro.re +# - sw-gk-core.switch.auro.re +# - sw-r3-core.switch.auro.re +# prometheus_ilo_snmp_targets: | +# {{ groups['aurore_ilo'] | list | sort }} +# +# update_motd: +# prometheus: >- +# Prometheus (en configuration aurore) est déployé (/etc/prometheus). +# roles: +# - prometheus +# - update_motd +# +#- hosts: prometheus-ovh.adm.auro.re +# vars: +# prometheus_alertmanager: docker-ovh.adm.auro.re:9093 +# snmp_unifi_password: "{{ vault_snmp_unifi_password }}" +# snmp_switch_community: "{{ vault_snmp_switch_community }}" +# snmp_pdu_user: "{{ vault_snmp_pdu_user }}" +# snmp_pdu_password: "{{ vault_snmp_pdu_password }}" +# snmp_ilo_user: aurore +# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" +# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" +# +# prometheus_servers_targets: | +# {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} +# prometheus_postgresql_targets: +# - bdd-ovh.adm.auro.re +# prometheus_docker_targets: +# - docker-ovh.adm.auro.re +# +# update_motd: +# prometheus: >- +# Prometheus (en configuration ovh) est déployé (/etc/prometheus). +# roles: +# - prometheus +# - update_motd +# +#- hosts: prometheus-federate.adm.auro.re +# vars: +# prometheus_alertmanager: docker-ovh.adm.auro.re:9093 +# snmp_unifi_password: "{{ vault_snmp_unifi_password }}" +# snmp_pdu_user: "{{ vault_snmp_pdu_user }}" +# snmp_pdu_password: "{{ vault_snmp_pdu_password }}" +# snmp_ilo_user: aurore +# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" +# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" +# +# prometheus_servers_targets: +# - prometheus-edc.adm.auro.re +# - prometheus-gs.adm.auro.re +# - prometheus-fleming.adm.auro.re +# - prometheus-pacaterie.adm.auro.re +# - prometheus-rives.adm.auro.re +# - prometheus-aurore.adm.auro.re +# - prometheus-ovh.adm.auro.re +# +# update_motd: +# prometheus_federate: >- +# Prometheus (en configuration fédération) est déployé (/etc/prometheus). +# roles: +# - prometheus_federate +# - update_motd +# +## Postgres Exporters +#- hosts: bdd,radius +# roles: +# - prometheus_postgres +# +## Monitor all hosts +#- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container +# roles: +# - prometheus_node diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml new file mode 100644 index 0000000..477d06c --- /dev/null +++ b/roles/prometheus/defaults/main.yml @@ -0,0 +1,7 @@ +--- +prometheus__alertmanager_targets: [] +prometheus__scraping: {} +prometheus__alert_rules: {} +prometheus__tsdb_retention_time: 15d +prometheus__page_title: "{{ inventory_hostname }}" +... diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml index d501c14..7297080 100644 --- a/roles/prometheus/handlers/main.yml +++ b/roles/prometheus/handlers/main.yml @@ -1,11 +1,11 @@ --- -- name: Restart Prometheus - service: - name: prometheus +- name: Restart prometheus + systemd: + name: prometheus.service state: restarted -- name: Restart prometheus-snmp-exporter - service: - name: prometheus-snmp-exporter - state: restarted +- name: Reload prometheus + systemd: + name: prometheus.service + state: reloaded ... diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 783bdad..d2f9837 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -1,69 +1,55 @@ --- -- name: Install Prometheus +- name: Install prometheus apt: - update_cache: true name: - prometheus - - prometheus-snmp-exporter - register: apt_result - retries: 3 - until: apt_result is succeeded -- name: Configure Prometheus +- name: Configure prometheus template: - src: "{{ item }}.j2" - dest: "/etc/prometheus/{{ item }}" + src: prometheus.yml.j2 + dest: /etc/prometheus/prometheus.yml owner: prometheus group: prometheus - mode: u=r,g=r,o= - loop: - - prometheus.yml - notify: Restart Prometheus + mode: u=rw,g=r,o=r + vars: + prometheus__config: + alerting: + alertmanagers: + - static_configs: + - targets: "{{ prometheus__alertmanager_targets }}" + rule_files: + - /etc/prometheus/rules.yml + scrape_configs: "{{ prometheus__scraping + | prometheus__convert_jobs }}" + notify: + - Restart prometheus -- name: Creates directory for alerts - file: - path: /etc/prometheus/alerts - state: directory - owner: prometheus - group: prometheus - mode: 0755 - -- name: Configure Prometheus alerts +- name: Configure prometheus default template: - src: "{{ item }}.j2" - dest: "/etc/prometheus/alerts/{{ item }}" - owner: prometheus - group: prometheus - mode: u=r,g=r,o= - loop: - - server.rules.yml - - docker.rules.yml - - ups.rules.yml - - postgres.rules.yml - - environmental.rules.yml - - ilo.rules.yml - notify: Restart Prometheus + src: default.j2 + dest: /etc/default/prometheus + owner: root + group: root + mode: u=rw,g=r,o=r + notify: + - Restart prometheus -- name: Make Prometheus snmp-exporter listen on localhost only - lineinfile: - path: /etc/default/prometheus-snmp-exporter - regexp: '^ARGS=' - line: "ARGS=\"--web.listen-address=127.0.0.1:9116\"" - notify: Restart prometheus-snmp-exporter - -# These files store SNMP OIDs -- name: Configure Prometheus snmp-exporter +- name: Configure prometheus rules template: - src: "{{ item }}.j2" - dest: "/etc/prometheus/{{ item }}" + src: rules.yml.j2 + dest: /etc/prometheus/rules.yml owner: prometheus group: prometheus - mode: u=r,g=r,o= - loop: - - snmp.yml - notify: Restart prometheus-snmp-exporter + mode: u=rw,g=r,o=r + validate: "promtool check rules %s" + vars: + prometheus__rules: + groups: "{{ prometheus__alert_rules + | dict2items(key_name='name', value_name='rules') }}" + notify: + - Reload prometheus -- name: Activate prometheus service +- name: Enable prometheus systemd: name: prometheus enabled: true diff --git a/roles/prometheus/templates/default.j2 b/roles/prometheus/templates/default.j2 new file mode 100644 index 0000000..fbb1e70 --- /dev/null +++ b/roles/prometheus/templates/default.j2 @@ -0,0 +1,3 @@ +{{ ansible_managed | comment }} + +ARGS="--storage.tsdb.retention.time={{ prometheus__tsdb_retention_time | quote }} --web.page-title={{ prometheus__page_title | quote }}" diff --git a/roles/prometheus/templates/docker.rules.yml.j2 b/roles/prometheus/templates/docker.rules.yml.j2 deleted file mode 100644 index d911698..0000000 --- a/roles/prometheus/templates/docker.rules.yml.j2 +++ /dev/null @@ -1,50 +0,0 @@ ---- -{{ ansible_managed | comment }} - -{% macro raw(string) -%} -{{ "{{" }} {{ string }} {{ "}}" }} -{%- endmacro %} - -groups: - - - name: docker.rules - rules: - - - alert: ContainerDown - expr: docker_container_running_state != 1 - for: 0m - labels: - severity: critical - annotations: - summary: >- - Le container Docker est éteint / tombé - (container {{ raw('$labels.name') }}) - - - alert: ContainerFailed - expr: sum(increase(docker_container_restart_count[5m])) > 2 - for: 0m - labels: - severity: critical - annotations: - summary: >- - Le container Docker redémarre souvent - (container {{ raw('$labels.name') }}) - - - alert: ContainerFailed - expr: - ( - docker_container_cpu_used_total - / - docker_container_cpu_capacity_total - ) * 100 - > 30 - for: 0m - labels: - severity: critical - annotations: - summary: >- - Le container Docker utilise beaucoup de CPU - (container {{ raw('$labels.name') }}, - valeur {{ raw('$value | printf "%.1f"') }}) - -... diff --git a/roles/prometheus/templates/environmental.rules.yml.j2 b/roles/prometheus/templates/environmental.rules.yml.j2 deleted file mode 100644 index f371329..0000000 --- a/roles/prometheus/templates/environmental.rules.yml.j2 +++ /dev/null @@ -1,52 +0,0 @@ ---- -{{ ansible_managed | comment }} - -{% macro raw(string) -%} -{{ "{{" }} {{ string }} {{ "}}" }} -{%- endmacro %} - -groups: - - - name: environmental.rules - rules: - - - alert: EnvironmentalTemperature - expr: rPDU2SensorTempHumidityStatusTempC / 10 > 30 - for: 10m - labels: - severity: warning - annotations: - summary: >- - Température environnementale à {{ raw('$value') }}° - - - alert: EnvironmentalTemperature - expr: rPDU2SensorTempHumidityStatusTempC / 10 > 40 - for: 10m - labels: - severity: critical - annotations: - summary: >- - Température environnementale à {{ raw('$value') }}° - - - - alert: EnvironmentalTemperature - expr: xupsEnvRemoteTemp > 30 - for: 10m - labels: - severity: warning - annotations: - summary: >- - Température environnementale à {{ raw('$value') }}° - - - alert: EnvironmentalTemperature - expr: xupsEnvRemoteTemp > 40 - for: 10m - labels: - severity: critical - annotations: - summary: >- - Température environnementale à {{ raw('$value') }}° - - - -... diff --git a/roles/prometheus/templates/ilo.rules.yml.j2 b/roles/prometheus/templates/ilo.rules.yml.j2 deleted file mode 100644 index d6bbe75..0000000 --- a/roles/prometheus/templates/ilo.rules.yml.j2 +++ /dev/null @@ -1,83 +0,0 @@ ---- -{{ ansible_managed | comment }} - -{% macro raw(string) -%} -{{ "{{" }} {{ string }} {{ "}}" }} -{%- endmacro %} - -groups: - - - name: ilo.rules - rules: - - - alert: IloResilientMemoryDegraded - expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1 - for: 3m - labels: - severity: warning - annotations: - summary: >- - La mémoire vive n'est plus résiliente - ({{ raw('$labels.cpqHeResilientMemCondition') }}) - - - alert: IloBiosSelfTestDegraded - expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1 - for: 3m - labels: - severity: critical - annotations: - summary: >- - Une erreur a été détectée lors du POST du serveur - ({{ raw('$labels.cpqHeHWBiosCondition') }}) - - - alert: IloBatteryDegraded - expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1 - for: 3m - labels: - severity: warning - annotations: - summary: >- - La batterie est dégradée - ({{ raw('$labels.cpqHeSysBatteryCondition') }}) - - - alert: IloTemperatureSensorDegraded - expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1 - for: 3m - labels: - severity: critical - annotations: - summary: >- - Le capteur de température est dégradé - ({{ raw('$labels.cpqHeTemperatureCondition') }}) - - - alert: IloFanDegraded - expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1 - for: 3m - labels: - severity: critical - annotations: - summary: >- - Le ventilateur est dégradé - ({{ raw('$labels.cpqHeFltTolFanCondition') }}) - - - alert: IloPowerSupplyDegraded - expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1 - for: 3m - labels: - severity: critical - annotations: - summary: >- - L'alimentation est dégradée - ({{ raw('$labels.cpqHeFltTolPowerSupplyStatus') }}) - - - alert: IloOverrideSwitchState - expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1 - for: 3m - labels: - severity: critical - annotations: - summary: >- - Le switch de réinitialisation n'est pas à l'état d'origine, - l'authentification est bypassée - -... diff --git a/roles/prometheus/templates/postgres.rules.yml.j2 b/roles/prometheus/templates/postgres.rules.yml.j2 deleted file mode 100644 index aa24537..0000000 --- a/roles/prometheus/templates/postgres.rules.yml.j2 +++ /dev/null @@ -1,219 +0,0 @@ ---- -{{ ansible_managed | comment }} - -{% macro raw(string) -%} -{{ "{{" }} {{ string }} {{ "}}" }} -{%- endmacro %} - -groups: - - - name: postgres.rules - rules: - - alert: PostgresqlDown - expr: pg_up == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Serveur PostgreSQL down - - - alert: PostgresqlRestarted - expr: time() - pg_postmaster_start_time_seconds < 60 - for: 0m - labels: - severity: critical - annotations: - summary: Serveur PostgreSQL redémarré - - - alert: PostgresqlExporterError - expr: pg_exporter_last_scrape_error > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Erreur dans l'exporter PostgreSQL - - - alert: PostgresqlReplicationLag - expr: - pg_replication_lag > 30 - and - ON(instance) pg_replication_is_replica == 1 - for: 0m - labels: - severity: critical - annotations: - summary: >- - La réplication PostgreSQL lag ({{ raw('$value') }} > 30s) - (base de données {{ raw('$labels.datname') }} ) - - - alert: PostgresqlTableNotVaccumed - expr: - time() - pg_stat_user_tables_last_autovacuum - > 60 * 60 * 24 - for: 0m - labels: - severity: warning - annotations: - summary: >- - Le démon autovacuum n'a pas été lancé depuis 24h - (base de données {{ raw('$labels.datname') }} ) - - - alert: PostgresqlTableNotAnalyzed - expr: - time() - pg_stat_user_tables_last_autoanalyze - > 60 * 60 * 24 - for: 0m - labels: - severity: warning - annotations: - summary: >- - Table non-analysée depuis 24h - (base de données {{ raw('$labels.datname') }}) - - - alert: PostgresqlTooManyConnections - expr: - ( - sum by (datname) - (pg_stat_activity_count{datname!~"template.*|postgres"}) - ) * 100 - > pg_settings_max_connections * 80 - for: 2m - labels: - severity: warning - annotations: - summary: >- - PostgreSQL a trop de connexions - ({{ raw('$value | printf "%.1f"') }} > 80%) - (base de données {{ raw('$labels.datname') }}) - - - alert: PostgresqlDeadLocks - expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5 - for: 0m - labels: - severity: warning - annotations: - summary: >- - PostgreSQL a plus de 5 deadlocks. - (base de données {{ raw('$labels.datname') }} ) - - - alert: PostgresqlSlowQueries - expr: pg_slow_queries > 0 - for: 2m - labels: - severity: warning - annotations: - summary: >- - Présence de requêtes lentes (slow-queries) - (base de données {{ raw('$labels.datname') }} ) - - - alert: PostgresqlHighRollbackRate - expr: - ( - rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / - rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) - ) * 100 - > 20 - for: 0m - labels: - severity: warning - annotations: - summary: >- - PostgreSQL a un taux de retour en arrière (rollback) élevé - (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} %) - - - alert: PostgresqlWaleReplicationStopped - expr: rate(pg_xlog_position_bytes[1m]) == 0 - for: 0m - labels: - severity: critical - annotations: - summary: >- - Réplication de PostgreSQL WALE stoppée - (base de données {{ raw('$labels.datname') }} ) - - - alert: PostgresqlHighRateStatementTimeout - expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3 - for: 0m - labels: - severity: critical - annotations: - summary: >- - Beaucoup de requêtes PostgreSQL sont timeout - (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} ) - - - alert: PostgresqlHighRateDeadlock - expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1 - for: 0m - labels: - severity: critical - annotations: - summary: >- - PostgreSQL a un fort taux de deadlock - (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} ) - -# - alert: PostgresqlReplicationLagBytes -# expr: -# (pg_xlog_position_bytes and pg_replication_is_replica == 0) -# - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1) -# > 1e+09 -# for: 0m -# labels: -# severity: critical -# annotations: -# summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} ) - - - alert: PostgresqlTooManyDeadTuples - expr: - ( - (pg_stat_user_tables_n_dead_tup > 10000) - / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) - ) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1) - for: 2m - labels: - severity: warning - annotations: - summary: >- - Les tuples morts PostgreSQL sont trop volumineux - (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} ) - - - alert: PostgresqlSplitBrain - expr: count(pg_replication_is_replica == 0) != 1 - for: 0m - labels: - severity: critical - annotations: - summary: >- - Split Brain : trop de bases de données PostgreSQL primaires en mode lecture-écriture - (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }} ) - - - alert: PostgresqlPromotedNode - expr: - pg_replication_is_replica - and - changes(pg_replication_is_replica[1m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: >- - Le serveur de secours PostgreSQL a été promu comme nœud principal - (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }}) - - - alert: PostgresqlTooManyLocksAcquired - expr: - ( - (sum (pg_locks_count)) - / (pg_settings_max_locks_per_transaction * pg_settings_max_connections) - ) * 100 > 20 - for: 2m - labels: - severity: critical - annotations: - summary: >- - Trop de deadlocks acquis sur la base de données. - Si cette alerte se produit fréquemment, nous devrons peut-être augmenter - le paramètre postgres max_locks_per_transaction - (Valeur = {{ raw('$value | printf "%.1f"') }} ) - -... - diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index 9fd0531..1f68d8a 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -1,6 +1,10 @@ --- {{ ansible_managed | comment }} +{{ prometheus__config | to_nice_yaml }} +... + +{# global: # scrape_interval is set to the global default (60s) # evaluation_interval is set to the global default (60s) @@ -156,5 +160,5 @@ scrape_configs: - target_label: __address__ replacement: 127.0.0.1:9116 {% endif %} - ... +#} diff --git a/roles/prometheus/templates/rules.yml.j2 b/roles/prometheus/templates/rules.yml.j2 new file mode 100644 index 0000000..18e258c --- /dev/null +++ b/roles/prometheus/templates/rules.yml.j2 @@ -0,0 +1,5 @@ +--- +{{ ansible_managed | comment }} + +{{ prometheus__rules | to_nice_yaml }} +... diff --git a/roles/prometheus/templates/server.rules.yml.j2 b/roles/prometheus/templates/server.rules.yml.j2 deleted file mode 100644 index c4138fe..0000000 --- a/roles/prometheus/templates/server.rules.yml.j2 +++ /dev/null @@ -1,156 +0,0 @@ ---- -{{ ansible_managed | comment }} - -{% macro raw(string) -%} -{{ "{{" }} {{ string }} {{ "}}" }} -{%- endmacro %} - -groups: - - - name: server.rules - rules: - - - alert: MachineDown - expr: up{instance!~".*.borne.auro.re$"} == 0 - for: 3m - labels: - severity: critical - annotations: - summary: >- - Le collecteur {{ raw('$labels.job') }} ne marche plus - - - alert: AccessPointDown - expr: up{instance=~".*.borne.auro.re$"} == 0 - for: 3m - labels: - severity: warning - - - alert: OutOfMemory - expr: >- - ( - node_memory_MemFree_bytes - + node_memory_Cached_bytes - + node_memory_Buffers_bytes - ) / node_memory_MemTotal_bytes * 100 < 10 - for: 5m - labels: - severity: warning - annotations: - summary: >- - {{ raw('$value | printf "%.1f"') }}% de mémoire - libre - - - alert: HostSwapIsFillingUp - expr: >- - ( - 1 - ( - node_memory_SwapFree_bytes - / node_memory_SwapTotal_bytes - ) - ) * 100 >= 50 - for: 3m - labels: - severity: warning - annotations: - summary: >- - La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}% - - - alert: HostPhysicalComponentTooHot - expr: node_hwmon_temp_celsius > 79 - for: 3m - labels: - severity: critical - annotations: - summary: >- - La température de l'hôte est de {{ raw('$value') }}°C - ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }}) - - - alert: HostNodeOvertemperatureAlarm - expr: node_hwmon_temp_crit_alarm_celsius == 1 - for: 0m - labels: - severity: critical - annotations: - summary: >- - L'alarme de température de l'hôte est active - ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }}) - - - alert: HostOomKillDetected - expr: increase(node_vmstat_oom_kill[1m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Le noyau a tué {{ raw('$value') }} processus (OOM killer) - - - alert: HostEdacCorrectableErrorsDetected - expr: increase(node_edac_correctable_errors_total[1m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: >- - {{ raw('$value | print "%.1f"') }} erreur(s) ont été - corrigée(s) (EDAC) - - - alert: OutOfDiskSpace - expr: >- - node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10 - for: 5m - labels: - severity: warning - annotations: - summary: >- - {{ raw('$value | printf "%.1f"') }}% d'espace libre pour - {{ raw('$labels.mountpoint') }} - - - alert: OutOfInodes - expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 - for: 5m - labels: - severity: warning - annotations: - summary: >- - {{ raw('$value | printf "%.1f"') }}% d'inodes - restants pour {{ raw('$labels.mountpoint') }} - - - alert: CpuUsage - expr: >- - ( - 100 - avg by (instance) ( - irate(node_cpu_seconds_total{mode="idle"}[5m]) - ) * 100 - ) > 75 - for: 10m - labels: - severity: warning - annotations: - summary: >- - CPU à {{ raw('$value | printf "%.1f"') }}% - - - alert: SystemdServiceFailed - expr: node_systemd_unit_state{state="failed"} == 1 - for: 10m - labels: - severity: warning - annotations: - summary: >- - {{ raw('$labels.name') }} a échoué - - - alert: LoadUsage - expr: node_load1 > 5 - for: 2m - labels: - severity: warning - annotations: - summary: > - Charge à {{ raw('$value') }} - - - alert: UnhealthyDisk - expr: smartmon_device_smart_healthy < 1 - for: 10m - labels: - severity: "critical" - annotations: - summary: "Le Disque {{ raw('$labels.disk') }} n'est pas en bonne santé !" -... diff --git a/roles/prometheus/templates/snmp.yml.j2 b/roles/prometheus/templates/snmp.yml.j2 deleted file mode 100644 index 3b9407a..0000000 --- a/roles/prometheus/templates/snmp.yml.j2 +++ /dev/null @@ -1,708 +0,0 @@ ---- -{{ ansible_managed | comment }} - -# TODOlist : -# - Faire fonctionner le monitoring des switchs défini ici -# * Configurer tous les switchs avec un compte SNMPv3 -# * Mettre l'inventaire des switchs dans Ansible -# - Optimiser les règles pour les bornes Unifi, -# on pourrait indexer avec les SSID - -eatonups: - walk: - - 1.3.6.1.2.1.33.1.2 - - 1.3.6.1.2.1.33.1.3 - - 1.3.6.1.2.1.33.1.4 - - 1.3.6.1.4.1.534.1.6 - - 1.3.6.1.4.1.318.1.1.10.2.3.2.1.4 - get: - - 1.3.6.1.2.1.1.3.0 - metrics: - - name: sysUpTime - oid: 1.3.6.1.2.1.1.3 - type: gauge - help: The time (in hundredths of a second) since the network management portion - of the system was last re-initialized. - 1.3.6.1.2.1.1.3 - - name: upsBatteryStatus - oid: 1.3.6.1.2.1.33.1.2.1 - type: gauge - help: The indication of the capacity remaining in the UPS system's batteries - - 1.3.6.1.2.1.33.1.2.1 - - name: upsEstimatedMinutesRemaining - oid: 1.3.6.1.2.1.33.1.2.3 - type: gauge - help: An estimate of the time to battery charge depletion under the present load - conditions if the utility power is off and remains off, or if it were to be - lost and remain off. - 1.3.6.1.2.1.33.1.2.3 - - name: upsInputVoltage - oid: 1.3.6.1.2.1.33.1.3.3.1.3 - type: gauge - help: The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3 - indexes: - - labelname: upsInputLineIndex - type: gauge - - name: upsOutputSource - oid: 1.3.6.1.2.1.33.1.4.1 - type: gauge - help: The present source of output power - 1.3.6.1.2.1.33.1.4.1 - - name: upsOutputVoltage - oid: 1.3.6.1.2.1.33.1.4.4.1.2 - type: gauge - help: The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2 - indexes: - - labelname: upsOutputLineIndex - type: gauge - - name: upsOutputPower - oid: 1.3.6.1.2.1.33.1.4.4.1.4 - type: gauge - help: The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4 - indexes: - - labelname: upsOutputLineIndex - type: gauge - - name: upsOutputPercentLoad - oid: 1.3.6.1.2.1.33.1.4.4.1.5 - type: gauge - help: The percentage of the UPS power capacity presently being used on this output - line, i.e., the greater of the percent load of true power capacity and the percent - load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5 - indexes: - - labelname: upsOutputLineIndex - type: gauge - - name: xupsEnvRemoteTemp - oid: 1.3.6.1.4.1.318.1.1.10.2.3.2.1.4 - type: gauge - help: The reading of an EMP's temperature sensor (APC MIB) - 1.3.6.1.4.1.318.1.1.10.2.3.2.1.4 - - name: xupsEnvRemoteTemp - oid: 1.3.6.1.4.1.534.1.6.5 - type: gauge - help: The reading of an EMP's temperature sensor (Eaton MIB) - 1.3.6.1.4.1.534.1.6.5 - - name: xupsEnvRemoteHumidity - oid: 1.3.6.1.4.1.534.1.6.6 - type: gauge - help: The reading of an EMP's humidity sensor. - 1.3.6.1.4.1.534.1.6.6 - version: 1 - auth: - community: public - -procurve_switch: - walk: - - 1.3.6.1.2.1.31.1.1.1.10 - - 1.3.6.1.2.1.31.1.1.1 - - 1.3.6.1.2.1.2.2.1.2 - - 1.3.6.1.2.1.31.1.1.1.18 - get: - - 1.3.6.1.2.1.1.3.0 - - 1.3.6.1.2.1.1.5.0 - - 1.3.6.1.2.1.1.6.0 - metrics: - - name: sysUpTime - oid: 1.3.6.1.2.1.1.3 - type: gauge - help: The time (in hundredths of a second) since the network management - portion of the system was last re-initialized. - 1.3.6.1.2.1.1.3 - - name: sysName - oid: 1.3.6.1.2.1.1.5 - type: DisplayString - help: An administratively-assigned name for this managed node - - 1.3.6.1.2.1.1.5 - - name: sysLocation - oid: 1.3.6.1.2.1.1.6 - type: DisplayString - help: The physical location of this node (e.g., 'telephone closet, 3rd - floor') - 1.3.6.1.2.1.1.6 - - name: ifHCOutOctets - oid: 1.3.6.1.2.1.31.1.1.1.10 - type: counter - help: The total number of octets transmitted out of the interface, - including framing characters - 1.3.6.1.2.1.31.1.1.1.10 - indexes: - - labelname: ifIndex - type: gauge - lookups: - - labels: - - ifIndex - labelname: ifDescr - oid: 1.3.6.1.2.1.2.2.1.2 - type: DisplayString - - labels: - - ifIndex - labelname: ifName - oid: 1.3.6.1.2.1.31.1.1.1.1 - type: DisplayString - - name: ifHCInOctets - oid: 1.3.6.1.2.1.31.1.1.1.6 - type: counter - help: The total number of octets received on the interface, including - framing characters - 1.3.6.1.2.1.31.1.1.1.6 - indexes: - - labelname: ifIndex - type: gauge - lookups: - - labels: - - ifIndex - labelname: ifDescr - oid: 1.3.6.1.2.1.2.2.1.2 - type: DisplayString - - labels: - - ifIndex - labelname: ifName - oid: 1.3.6.1.2.1.31.1.1.1.1 - type: DisplayString - version: 2 - auth: - community: "{{ snmp_switch_community }}" - -ubiquiti_unifi: - walk: - - 1.3.6.1.4.1.41112.1.6 - get: - - 1.3.6.1.2.1.1.5.0 - - 1.3.6.1.2.1.1.6.0 - metrics: -# Pour faire une WifiMap un jour, on peut entrer la location dans la conf des bornes -# - name: sysLocation -# oid: 1.3.6.1.2.1.1.6 -# type: DisplayString -# help: The physical location of this node (e.g., 'telephone closet, 3rd floor') -# - 1.3.6.1.2.1.1.6 - - name: unifiVapIndex - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.1 - type: gauge - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.1' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapChannel - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.4 - type: gauge - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.4' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapEssId - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.6 - type: DisplayString - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.6' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapName - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.7 - type: DisplayString - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.7' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifi_vap_num_stations - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8 - type: gauge - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8' - indexes: - - labelname: unifi_vap_index - type: gauge - lookups: - - labels: [unifi_vap_index] - labelname: unifi_vap_essid - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.6 - type: DisplayString - - labels: [unifi_vap_index] - labelname: unifi_vap_radio - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9 - type: DisplayString - - labels: [] - labelname: unifi_vap_index -# - name: unifiVapNumStations -# oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8 -# type: gauge -# help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8' -# indexes: -# - labelname: unifiVapIndex -# type: gauge - - name: unifiVapRadio - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9 - type: DisplayString - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.9' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapRxBytes - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.10 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.10' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapRxCrypts - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.11 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.11' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapRxDropped - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.12 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.12' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapRxErrors - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.13 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.13' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapRxFrags - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.14 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.14' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapRxPackets - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.15 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.15' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapTxBytes - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.16 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.16' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapTxDropped - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.17 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.17' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapTxErrors - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.18 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.18' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapTxPackets - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.19 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.19' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapTxRetries - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.20 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.20' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapTxPower - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.21 - type: gauge - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.21' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapUp - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.22 - type: gauge - help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.22' - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiVapUsage - oid: 1.3.6.1.4.1.41112.1.6.1.2.1.23 - type: DisplayString - help: guest or regular user - 1.3.6.1.4.1.41112.1.6.1.2.1.23 - indexes: - - labelname: unifiVapIndex - type: gauge - - name: unifiIfIndex - oid: 1.3.6.1.4.1.41112.1.6.2.1.1.1 - type: gauge - help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.1' - indexes: - - labelname: unifiIfIndex - type: gauge - - name: unifiIfName - oid: 1.3.6.1.4.1.41112.1.6.2.1.1.5 - type: DisplayString - help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.5' - indexes: - - labelname: unifiIfIndex - type: gauge - - name: unifiIfRxBytes - oid: 1.3.6.1.4.1.41112.1.6.2.1.1.6 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.6' - indexes: - - labelname: unifiIfIndex - type: gauge - - name: unifiIfRxDropped - oid: 1.3.6.1.4.1.41112.1.6.2.1.1.7 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.7' - indexes: - - labelname: unifiIfIndex - type: gauge - - name: unifiIfRxError - oid: 1.3.6.1.4.1.41112.1.6.2.1.1.8 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.8' - indexes: - - labelname: unifiIfIndex - type: gauge - - name: unifiIfRxMulticast - oid: 1.3.6.1.4.1.41112.1.6.2.1.1.9 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.9' - indexes: - - labelname: unifiIfIndex - type: gauge - - name: unifiIfRxPackets - oid: 1.3.6.1.4.1.41112.1.6.2.1.1.10 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.10' - indexes: - - labelname: unifiIfIndex - type: gauge - - name: unifiIfTxBytes - oid: 1.3.6.1.4.1.41112.1.6.2.1.1.12 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.12' - indexes: - - labelname: unifiIfIndex - type: gauge - - name: unifiIfTxDropped - oid: 1.3.6.1.4.1.41112.1.6.2.1.1.13 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.13' - indexes: - - labelname: unifiIfIndex - type: gauge - - name: unifiIfTxError - oid: 1.3.6.1.4.1.41112.1.6.2.1.1.14 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.14' - indexes: - - labelname: unifiIfIndex - type: gauge - - name: unifiIfTxPackets - oid: 1.3.6.1.4.1.41112.1.6.2.1.1.15 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.15' - indexes: - - labelname: unifiIfIndex - type: gauge - - name: unifiApSystemModel - oid: 1.3.6.1.4.1.41112.1.6.3.3 - type: DisplayString - help: ' - 1.3.6.1.4.1.41112.1.6.3.3' - - name: unifiApSystemUptime - oid: 1.3.6.1.4.1.41112.1.6.3.5 - type: counter - help: ' - 1.3.6.1.4.1.41112.1.6.3.5' - version: 3 - auth: - security_level: authPriv - username: snmp_prometheus - password: {{ snmp_unifi_password }} - auth_protocol: SHA - priv_protocol: AES - priv_password: {{ snmp_unifi_password }} - - -apc_pdu: - walk: - - 1.3.6.1.4.1.318.1.1.26.10.2.2.1.8 - - 1.3.6.1.4.1.318.1.1.26.4.3.1.4 - - 1.3.6.1.4.1.318.1.1.26.4.3.1.5 - - 1.3.6.1.4.1.318.1.1.26.4.3.1.6 - - 1.3.6.1.4.1.318.1.1.26.6.3.1.9 - - 1.3.6.1.4.1.318.1.1.26.9.4.3.1.7 - metrics: - - name: rPDU2SensorTempHumidityStatusTempC - oid: 1.3.6.1.4.1.318.1.1.26.10.2.2.1.8 - type: gauge - help: Sensor temperature reading in tenths of degrees Celsius - 1.3.6.1.4.1.318.1.1.26.10.2.2.1.8 - indexes: - - labelname: rPDU2SensorTempHumidityStatusIndex - type: gauge - - name: rPDU2DeviceStatusLoadState - oid: 1.3.6.1.4.1.318.1.1.26.4.3.1.4 - type: gauge - help: Indicates the present load status of the Rack PDU - 1.3.6.1.4.1.318.1.1.26.4.3.1.4 - indexes: - - labelname: rPDU2DeviceStatusIndex - type: gauge - - name: rPDU2DeviceStatusPower - oid: 1.3.6.1.4.1.318.1.1.26.4.3.1.5 - type: gauge - help: The power consumption of the Rack PDU load in hundredths of kilowatts - - 1.3.6.1.4.1.318.1.1.26.4.3.1.5 - indexes: - - labelname: rPDU2DeviceStatusIndex - type: gauge - - name: rPDU2DeviceStatusPeakPower - oid: 1.3.6.1.4.1.318.1.1.26.4.3.1.6 - type: gauge - help: The peak power consumption of the Rack PDU load in hundredths of kilowatts - - 1.3.6.1.4.1.318.1.1.26.4.3.1.6 - indexes: - - labelname: rPDU2DeviceStatusIndex - type: gauge - - name: rPDU2PhaseStatusPowerFactor - oid: 1.3.6.1.4.1.318.1.1.26.6.3.1.9 - type: gauge - help: Indicates the load power factor, in hundredths, of the Rack PDU phase being - queried - 1.3.6.1.4.1.318.1.1.26.6.3.1.9 - indexes: - - labelname: rPDU2PhaseStatusIndex - type: gauge - - name: rPDU2OutletMeteredStatusPower - oid: 1.3.6.1.4.1.318.1.1.26.9.4.3.1.7 - type: gauge - help: Indicates the power draw of the load on the Rack PDU outlet being queried - - 1.3.6.1.4.1.318.1.1.26.9.4.3.1.7 - indexes: - - labelname: rPDU2OutletMeteredStatusIndex - type: gauge - version: 3 - auth: - security_level: authPriv - username: {{ snmp_pdu_user }} - password: {{ snmp_pdu_password }} - auth_protocol: SHA - priv_protocol: AES - priv_password: {{ snmp_pdu_password }} - -ilo: - walk: - - 1.3.6.1.4.1.232.6.2.14.4 # Resilient memory - - 1.3.6.1.4.1.232.6.2.15.3 # Power meter - - 1.3.6.1.4.1.232.6.2.16.1 # POST tests - - 1.3.6.1.4.1.232.6.2.17.1 # Battery - - 1.3.6.1.4.1.232.6.2.6.8.1.3 # Temperature sensors location - - 1.3.6.1.4.1.232.6.2.6.8.1.4 # Temperature sensors value - - 1.3.6.1.4.1.232.6.2.6.8.1.5 # Temperature sensors limit - - 1.3.6.1.4.1.232.6.2.6.8.1.6 # Temperature sensors condition - - 1.3.6.1.4.1.232.6.2.6.7.1.3 # Fans location - - 1.3.6.1.4.1.232.6.2.6.7.1.9 # Fans condition - - 1.3.6.1.4.1.232.6.2.9.3.1.5 # Power supply - - 1.3.6.1.4.1.232.9.2.2 # iLO - metrics: - - name: cpqHeResilientMemCondition - oid: 1.3.6.1.4.1.232.6.2.14.4 - type: EnumAsStateSet - help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4 - enum_values: - 1: other - 2: ok - 3: degraded - 4: failed - - name: cpqHePowerMeterCurrReading - oid: 1.3.6.1.4.1.232.6.2.15.3 - type: gauge - help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3 - - name: cpqHeHWBiosCondition - oid: 1.3.6.1.4.1.232.6.2.16.1 - type: EnumAsStateSet - help: This value indicates an error has been detected during Pre-OS Test (POST) - or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1 - enum_values: - 1: other - 2: ok - 3: degraded - 4: failed - - name: cpqHeSysBatteryCondition - oid: 1.3.6.1.4.1.232.6.2.17.1 - type: EnumAsStateSet - help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1 - indexes: - - labelname: cpqHeSysBatteryChassis - type: gauge - - labelname: cpqHeSysBatteryIndex - type: gauge - enum_values: - 1: other - 2: ok - 3: degraded - 4: failed - - name: cpqHeTemperatureLocale - oid: 1.3.6.1.4.1.232.6.2.6.8.1.3 - type: EnumAsInfo - help: This specifies the location of the temperature sensor present in the system. - - 1.3.6.1.4.1.232.6.2.6.8.1.3 - indexes: - - labelname: cpqHeTemperatureChassis - type: gauge - - labelname: cpqHeTemperatureIndex - type: gauge - enum_values: - 1: other - 2: unknown - 3: system - 4: systemBoard - 5: ioBoard - 6: cpu - 7: memory - 8: storage - 9: removableMedia - 10: powerSupply - 11: ambient - 12: chassis - 13: bridgeCard - - name: cpqHeTemperatureCelsius - oid: 1.3.6.1.4.1.232.6.2.6.8.1.4 - type: gauge - help: This is the current temperature sensor reading in degrees celsius - 1.3.6.1.4.1.232.6.2.6.8.1.4 - indexes: - - labelname: cpqHeTemperatureChassis - type: gauge - - labelname: cpqHeTemperatureIndex - type: gauge - - name: cpqHeTemperatureThreshold - oid: 1.3.6.1.4.1.232.6.2.6.8.1.5 - type: gauge - help: This is the shutdown threshold temperature sensor setting in degrees celsius - - 1.3.6.1.4.1.232.6.2.6.8.1.5 - indexes: - - labelname: cpqHeTemperatureChassis - type: gauge - - labelname: cpqHeTemperatureIndex - type: gauge - - name: cpqHeTemperatureCondition - oid: 1.3.6.1.4.1.232.6.2.6.8.1.6 - type: EnumAsStateSet - help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6 - indexes: - - labelname: cpqHeTemperatureChassis - type: gauge - - labelname: cpqHeTemperatureIndex - type: gauge - enum_values: - 1: other - 2: ok - 3: degraded - 4: failed - - name: cpqHeFltTolFanLocale - oid: 1.3.6.1.4.1.232.6.2.6.7.1.3 - type: EnumAsInfo - help: This specifies the location of the fan present in the system. - - 1.3.6.1.4.1.232.6.2.6.7.1.3 - indexes: - - labelname: cpqHeFltTolFanChassis - type: gauge - - labelname: cpqHeFltTolFanIndex - type: gauge - enum_values: - 1: other - 2: unknown - 3: system - 4: systemBoard - 5: ioBoard - 6: cpu - 7: memory - 8: storage - 9: removableMedia - 10: powerSupply - 11: ambient - 12: chassis - 13: bridgeCard - - name: cpqHeFltTolFanCondition - oid: 1.3.6.1.4.1.232.6.2.6.7.1.9 - type: EnumAsStateSet - help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9 - indexes: - - labelname: cpqHeFltTolFanChassis - type: gauge - - labelname: cpqHeFltTolFanIndex - type: gauge - enum_values: - 1: other - 2: ok - 3: degraded - 4: failed - - name: cpqHeFltTolPowerSupplyStatus - oid: 1.3.6.1.4.1.232.6.2.9.3.1.5 - type: EnumAsStateSet - help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5 - indexes: - - labelname: cpqHeFltTolPowerSupplyChassis - type: gauge - - labelname: cpqHeFltTolPowerSupplyBay - type: gauge - enum_values: - 1: noError - 2: generalFailure - 3: bistFailure - 4: fanFailure - 5: tempFailure - 6: interlockOpen - 7: epromFailed - 8: vrefFailed - 9: dacFailed - 10: ramTestFailed - 11: voltageChannelFailed - 12: orringdiodeFailed - 13: brownOut - 14: giveupOnStartup - 15: nvramInvalid - 16: calibrationTableInvalid - 17: noPowerInput - - name: cpqSm2CntlrInterfaceStatus - oid: 1.3.6.1.4.1.232.9.2.2.17 - type: EnumAsStateSet - help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17 - enum_values: - 1: other - 2: ok - 3: notResponding - - name: cpqSm2CntlriLOSecurityOverrideSwitchState - oid: 1.3.6.1.4.1.232.9.2.2.27 - type: EnumAsStateSet - help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27 - enum_values: - 1: notSupported - 2: set - 3: notSet - - name: cpqSm2CntlrLicenseActive - oid: 1.3.6.1.4.1.232.9.2.2.30 - type: EnumAsStateSet - help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30 - enum_values: - 1: none - 2: iloAdvanced - 3: iloLight - 4: iloAdvancedBlade - 5: iloStandard - 6: iloEssentials - 7: iloScaleOut - 8: iloAdvancedPremiumSecurity - - name: cpqSm2CntlrServerPowerState - oid: 1.3.6.1.4.1.232.9.2.2.32 - type: EnumAsStateSet - help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32 - enum_values: - 1: unknown - 2: poweredOff - 3: poweredOn - 4: insufficientPowerOrPowerOnDenied - version: 3 - # Reduce timeout to retry faster - timeout: 1s - auth: - security_level: authPriv - username: {{ snmp_ilo_user }} - password: {{ snmp_ilo_auth }} - auth_protocol: SHA - priv_protocol: AES - priv_password: {{ snmp_ilo_priv }} - -... diff --git a/roles/prometheus/templates/ups.rules.yml.j2 b/roles/prometheus/templates/ups.rules.yml.j2 deleted file mode 100644 index eafdee3..0000000 --- a/roles/prometheus/templates/ups.rules.yml.j2 +++ /dev/null @@ -1,87 +0,0 @@ ---- -{{ ansible_managed | comment }} - -{% macro raw(string) -%} -{{ "{{" }} {{ string }} {{ "}}" }} -{%- endmacro %} - -groups: - - - name: ups.rules - rules: - - - alert: UpsOutputSourceChanged - expr: upsOutputSource != 3 - for: 0m - labels: - severity: critical - annotations: - summary: >- - Source d'alimentation changée - - - alert: UpsBatteryStatus - expr: upsBatteryStatus == 3 - for: 0m - labels: - severity: warning - annotations: - summary: >- - État de la batterie faible - - - alert: UpsBatteryStatus - expr: upsBatteryStatus == 4 - for: 0m - labels: - severity: critical - annotations: - summary: >- - État de la batterie critique - - - alert: UpsHighLoad - expr: upsOutputPercentLoad > 70 - for: 3m - labels: - severity: critical - annotations: - summary: >- - Charge de {{ raw('$value | printf "%.1f"') }}% - - - alert: UpsWrongInputVoltage - expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) - for: 5m - labels: - severity: warning - annotations: - summary: >- - Tension d'entrée de {{ raw('$value') }}V - - - alert: UpsWrongOutputVoltage - expr: >- - abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d])) - < 3 * stddev_over_time(upsOutputVoltage[1d]) - for: 5m - labels: - severity: warning - annotations: - summary: >- - Tension de sortie de {{ raw('$value') }}V - - - alert: UpsTimeRemaining - expr: upsEstimatedMinutesRemaining < 8 - for: 0m - labels: - severity: warning - annotations: - summary: >- - Autonomie restante de {{ raw('$value') }} min - - - alert: UpsTimeRemaining - expr: upsEstimatedMinutesRemaining < 5 - for: 0m - labels: - severity: critical - annotations: - summary: >- - Autonomie restante de {{ raw('$value') }} min - -...