prometheus: cleanup role (lots of features missing)

This commit is contained in:
jeltz 2023-04-02 05:08:01 +02:00
parent 7db15d9c63
commit 922b6894a7
Signed by: jeltz
GPG key ID: 800882B66C0C3326
15 changed files with 484 additions and 1651 deletions

View file

@ -0,0 +1,55 @@
from ansible.parsing.yaml.objects import AnsibleUnicode
class FilterModule:
def filters(self):
return {
"prometheus__convert_jobs": convert_jobs,
"interp": interp,
}
def interp(string):
return AnsibleUnicode(f"{{{{ {string} }}}}")
def convert_jobs(config):
for name, job in config.items():
config = {
"job_name": name,
"static_configs": [
{
"targets": job["targets"],
}
],
"params": job.get("params", {}),
}
if "path" in job:
config["metrics_path"] = job["path"]
if "address" in job:
try:
replacement = f"$1:{job['address']['port']}"
except Exception:
replacement = job["address"]
config["relabel_configs"] = [
{
"source_labels": ["__address__"],
"target_label": "__param_target",
},
{
"source_labels": ["__param_target"],
"target_label": "instance",
},
{
"source_labels": ["__param_target"],
"target_label": "__address__",
"replacement": replacement,
},
]
yield config

View file

@ -1,241 +1,369 @@
#!/usr/bin/env ansible-playbook #!/usr/bin/env ansible-playbook
--- ---
- hosts: prometheus-fleming.adm.auro.re - hosts:
vars: - pve_network
prometheus_alertmanager: docker-ovh.adm.auro.re:9093 - vm_network
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
snmp_switch_community: "{{ vault_snmp_switch_community }}"
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
snmp_ilo_user: aurore
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
prometheus_servers_targets: |
{{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }}
prometheus_unifi_snmp_targets: |
{{ groups['fleming_unifi'] | list | sort }}
prometheus_ilo_snmp_targets: |
{{ groups['fleming_ilo'] | list | sort }}
update_motd:
prometheus: >-
Prometheus (en configuration fleming) est déployé (/etc/prometheus).
roles:
- prometheus
- update_motd
- hosts: prometheus-pacaterie.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
snmp_switch_community: "{{ vault_snmp_switch_community }}"
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
snmp_ilo_user: aurore
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
prometheus_servers_targets: |
{{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }}
prometheus_unifi_snmp_targets: |
{{ groups['pacaterie_unifi'] | list | sort }}
prometheus_ups_snmp_targets:
- ups-pn-1.ups.auro.re
- ups-ps-1.ups.auro.re
prometheus_ilo_snmp_targets: |
{{ groups['pacaterie_ilo'] | list | sort }}
update_motd:
prometheus: >-
Prometheus (en configuration pacaterie) est déployé (/etc/prometheus).
roles:
- prometheus
- update_motd
- hosts: prometheus-edc.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
snmp_switch_community: "{{ vault_snmp_switch_community }}"
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
snmp_ilo_user: aurore
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
prometheus_ups_snmp_targets:
- ups-ec-1.ups.auro.re
# - ups-ec-2.ups.auro.re
- ups-ec-3.ups.auro.re
prometheus_servers_targets: |
{{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }}
prometheus_unifi_snmp_targets: |
{{ groups['edc_unifi'] | list | sort }}
prometheus_ilo_snmp_targets: |
{{ groups['edc_ilo'] | list | sort }}
update_motd:
prometheus: >-
Prometheus (en configuration edc) est déployé (/etc/prometheus).
roles:
- prometheus
- update_motd
- hosts: prometheus-gs.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
snmp_switch_community: "{{ vault_snmp_switch_community }}"
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
snmp_ilo_user: aurore
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
prometheus_servers_targets: |
{{ groups['gs_pve'] + groups['gs_vm'] | list | sort }}
prometheus_unifi_snmp_targets: |
{{ groups['gs_unifi'] | list | sort }}
prometheus_ups_snmp_targets:
- ups-gk-1.ups.auro.re
prometheus_apc_pdu_snmp_targets:
- pdu-ga-1.ups.auro.re
prometheus_ilo_snmp_targets: |
{{ groups['gs_ilo'] | list | sort }}
update_motd:
prometheus: >-
Prometheus (en configuration gs) est déployé (/etc/prometheus).
roles:
- prometheus
- update_motd
- hosts: prometheus-rives.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
snmp_switch_community: "{{ vault_snmp_switch_community }}"
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
snmp_ilo_user: aurore
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
prometheus_ups_snmp_targets:
- ups-r3-1.ups.auro.re
- ups-r1-1.ups.auro.re
prometheus_servers_targets: |
{{ groups['rives_pve'] + groups['rives_vm'] | list | sort }}
prometheus_unifi_snmp_targets: |
{{ groups['rives_unifi'] | list | sort }}
prometheus_ilo_snmp_targets: |
{{ groups['rives_ilo'] | list | sort }}
update_motd:
prometheus: >-
Prometheus (en configuration rives) est déployé (/etc/prometheus).
roles:
- prometheus
- update_motd
- hosts: prometheus-aurore.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
snmp_switch_community: "{{ vault_snmp_switch_community }}"
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
snmp_ilo_user: aurore
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
prometheus_servers_targets: |
{{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
prometheus_postgresql_targets: |
{{ groups['bdd'] + groups['radius'] | list | sort }}
prometheus_switch_snmp_targets:
- yggdrasil.switch.auro.re
- sw-pn-serveurs.switch.auro.re
- sw-ec-serveurs.switch.auro.re
- sw-gk-serveurs.switch.auro.re
- sw-fl-serveurs.switch.auro.re
- sw-ff-uplink.switch.auro.re
- sw-fl-core.switch.auro.re
- sw-fd-vcore.switch.auro.re
- sw-fl-vcore.switch.auro.re
- sw-ff-vcore.switch.auro.re
- sw-pn-core.switch.auro.re
- sw-ec-core.switch.auro.re
- sw-gk-core.switch.auro.re
- sw-r3-core.switch.auro.re
prometheus_ilo_snmp_targets: |
{{ groups['aurore_ilo'] | list | sort }}
update_motd:
prometheus: >-
Prometheus (en configuration aurore) est déployé (/etc/prometheus).
roles:
- prometheus
- update_motd
- hosts: prometheus-ovh.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
snmp_switch_community: "{{ vault_snmp_switch_community }}"
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
snmp_ilo_user: aurore
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
prometheus_servers_targets: |
{{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
prometheus_postgresql_targets:
- bdd-ovh.adm.auro.re
prometheus_docker_targets:
- docker-ovh.adm.auro.re
update_motd:
prometheus: >-
Prometheus (en configuration ovh) est déployé (/etc/prometheus).
roles:
- prometheus
- update_motd
- hosts: prometheus-federate.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
snmp_ilo_user: aurore
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
prometheus_servers_targets:
- prometheus-edc.adm.auro.re
- prometheus-gs.adm.auro.re
- prometheus-fleming.adm.auro.re
- prometheus-pacaterie.adm.auro.re
- prometheus-rives.adm.auro.re
- prometheus-aurore.adm.auro.re
- prometheus-ovh.adm.auro.re
update_motd:
prometheus_federate: >-
Prometheus (en configuration fédération) est déployé (/etc/prometheus).
roles:
- prometheus_federate
- update_motd
# Postgres Exporters
- hosts: bdd,radius
roles:
- prometheus_postgres
# Monitor all hosts
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
roles: roles:
- prometheus_node - prometheus_node
- hosts:
- prometheus-1.monit.infra.auro.re
- prometheus-2.monit.infra.auro.re
vars:
prometheus__tsdb_retention_time: 90d
prometheus__scraping:
node:
targets: "{{ ['vm_network', 'pve_network']
| map('extract', groups)
| flatten }}"
address:
port: 9100
prometheus__alert_rules:
node:
- alert: MachineDown
expr: "up == 0"
for: 3m
labels:
severity: critical
annotations:
summary: "Collecteur {{ '$labels.job' | interp }}"
- alert: OutOfMemory
expr: "( node_memory_MemFree_bytes
+ node_memory_Cached_bytes
+ node_memory_Buffers_bytes )
/ node_memory_MemTotal_bytes * 100 < 10"
for: 5m
labels:
severity: warning
annotations:
summary: "Mémoire libre à {{ '$value' | interp }}%"
- alert: HostSwapIsFillingUp
expr: "( 1 - ( node_memory_SwapFree_bytes
/ node_memory_SwapTotal_bytes ) )
* 100 >= 50"
for: 3m
labels:
severity: critical
annotations:
summary: "Swap {{ '$value' | interp }}%"
- alert: HostPhysicalComponentTooHot
expr: "node_hwmon_temp_celsius > 79"
for: 3m
labels:
severity: critical
annotations:
summary: "{{ '$value' | interp }}°C :
{{ '$labels.chip' | interp }},
{{ '$labels.sensor' | interp }}"
- alert: HostNodeOvertemperatureAlarm
expr: "node_hwmon_temp_crit_alarm_celsius == 1"
for: 0m
labels:
severity: critical
annotations:
summary: "{{ '$labels.chip' | interp }},
{{ '$labels.sensor' | interp }}"
- alert: HostOomKillDetected
expr: "increase(node_vmstat_oom_kill[1m]) > 0"
for: 0m
labels:
severity: warning
annotations:
summary: "PID {{ '$value' | interp }}"
- alert: HostEdacCorrectableErrorsDetected
expr: "increase(node_edac_correctable_errors_total[1m]) > 0"
for: 0m
labels:
severity: warning
annotations:
summary: "{{ '$value' | interp }} erreurs corrigées"
- alert: OutOfDiskSpace
expr: "node_filesystem_free_bytes
/ node_filesystem_size_bytes * 100 < 10"
for: 5m
labels:
severity: warning
annotations:
summary: "{{ '$labels.mountpoint' | interp }} :
{{ '$value' | interp }}% libre"
- alert: OutOfInodes
expr: "node_filesystem_files_free
/ node_filesystem_files * 100 < 10"
for: 3m
labels:
severity: warning
annotations:
summary: "{{ '$labels.mountpoint' | interp }} :
{{ '$value' | interp }}% libre"
- alert: CpuUsage
expr: '( 100 - avg by (instance)
( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
* 100 ) > 75'
for: 10m
labels:
severity: warning
annotations:
summary: "{{ '$value' | interp }}%"
- alert: SystemdServiceFailed
expr: 'node_systemd_unit_state{state="failed"} == 1'
for: 10m
labels:
severity: warning
annotations:
summary: "{{ '$labels.name' | interp }}"
- alert: LoadUsage
expr: "node_load1 > 5"
for: 2m
labels:
severity: warning
annotations:
summary: "{{ '$value' | interp }}"
- alert: UnhealthyDisk
expr: "smartmon_device_smart_healthy < 1"
for: 10m
labels:
severity: critical
annotations:
summary: "{{ '$labels.disk' | interp }}"
roles:
- prometheus
#- hosts: prometheus-fleming.adm.auro.re
# vars:
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
# snmp_ilo_user: aurore
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
# prometheus_servers_targets: |
# {{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }}
# prometheus_unifi_snmp_targets: |
# {{ groups['fleming_unifi'] | list | sort }}
# prometheus_ilo_snmp_targets: |
# {{ groups['fleming_ilo'] | list | sort }}
#
# update_motd:
# prometheus: >-
# Prometheus (en configuration fleming) est déployé (/etc/prometheus).
# roles:
# - prometheus
# - update_motd
#
#- hosts: prometheus-pacaterie.adm.auro.re
# vars:
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
# snmp_ilo_user: aurore
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
# prometheus_servers_targets: |
# {{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }}
# prometheus_unifi_snmp_targets: |
# {{ groups['pacaterie_unifi'] | list | sort }}
# prometheus_ups_snmp_targets:
# - ups-pn-1.ups.auro.re
# - ups-ps-1.ups.auro.re
# prometheus_ilo_snmp_targets: |
# {{ groups['pacaterie_ilo'] | list | sort }}
#
# update_motd:
# prometheus: >-
# Prometheus (en configuration pacaterie) est déployé (/etc/prometheus).
# roles:
# - prometheus
# - update_motd
#
#- hosts: prometheus-edc.adm.auro.re
# vars:
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
# snmp_ilo_user: aurore
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
# prometheus_ups_snmp_targets:
# - ups-ec-1.ups.auro.re
# # - ups-ec-2.ups.auro.re
# - ups-ec-3.ups.auro.re
# prometheus_servers_targets: |
# {{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }}
# prometheus_unifi_snmp_targets: |
# {{ groups['edc_unifi'] | list | sort }}
# prometheus_ilo_snmp_targets: |
# {{ groups['edc_ilo'] | list | sort }}
#
# update_motd:
# prometheus: >-
# Prometheus (en configuration edc) est déployé (/etc/prometheus).
# roles:
# - prometheus
# - update_motd
#
#- hosts: prometheus-gs.adm.auro.re
# vars:
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
# snmp_ilo_user: aurore
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
# prometheus_servers_targets: |
# {{ groups['gs_pve'] + groups['gs_vm'] | list | sort }}
# prometheus_unifi_snmp_targets: |
# {{ groups['gs_unifi'] | list | sort }}
# prometheus_ups_snmp_targets:
# - ups-gk-1.ups.auro.re
# prometheus_apc_pdu_snmp_targets:
# - pdu-ga-1.ups.auro.re
# prometheus_ilo_snmp_targets: |
# {{ groups['gs_ilo'] | list | sort }}
#
# update_motd:
# prometheus: >-
# Prometheus (en configuration gs) est déployé (/etc/prometheus).
# roles:
# - prometheus
# - update_motd
#
#- hosts: prometheus-rives.adm.auro.re
# vars:
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
# snmp_ilo_user: aurore
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
# prometheus_ups_snmp_targets:
# - ups-r3-1.ups.auro.re
# - ups-r1-1.ups.auro.re
# prometheus_servers_targets: |
# {{ groups['rives_pve'] + groups['rives_vm'] | list | sort }}
# prometheus_unifi_snmp_targets: |
# {{ groups['rives_unifi'] | list | sort }}
# prometheus_ilo_snmp_targets: |
# {{ groups['rives_ilo'] | list | sort }}
#
# update_motd:
# prometheus: >-
# Prometheus (en configuration rives) est déployé (/etc/prometheus).
# roles:
# - prometheus
# - update_motd
#
#- hosts: prometheus-aurore.adm.auro.re
# vars:
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
# snmp_ilo_user: aurore
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
# prometheus_servers_targets: |
# {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
# prometheus_postgresql_targets: |
# {{ groups['bdd'] + groups['radius'] | list | sort }}
# prometheus_switch_snmp_targets:
# - yggdrasil.switch.auro.re
# - sw-pn-serveurs.switch.auro.re
# - sw-ec-serveurs.switch.auro.re
# - sw-gk-serveurs.switch.auro.re
# - sw-fl-serveurs.switch.auro.re
# - sw-ff-uplink.switch.auro.re
# - sw-fl-core.switch.auro.re
# - sw-fd-vcore.switch.auro.re
# - sw-fl-vcore.switch.auro.re
# - sw-ff-vcore.switch.auro.re
# - sw-pn-core.switch.auro.re
# - sw-ec-core.switch.auro.re
# - sw-gk-core.switch.auro.re
# - sw-r3-core.switch.auro.re
# prometheus_ilo_snmp_targets: |
# {{ groups['aurore_ilo'] | list | sort }}
#
# update_motd:
# prometheus: >-
# Prometheus (en configuration aurore) est déployé (/etc/prometheus).
# roles:
# - prometheus
# - update_motd
#
#- hosts: prometheus-ovh.adm.auro.re
# vars:
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# snmp_switch_community: "{{ vault_snmp_switch_community }}"
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
# snmp_ilo_user: aurore
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
# prometheus_servers_targets: |
# {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
# prometheus_postgresql_targets:
# - bdd-ovh.adm.auro.re
# prometheus_docker_targets:
# - docker-ovh.adm.auro.re
#
# update_motd:
# prometheus: >-
# Prometheus (en configuration ovh) est déployé (/etc/prometheus).
# roles:
# - prometheus
# - update_motd
#
#- hosts: prometheus-federate.adm.auro.re
# vars:
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
# snmp_ilo_user: aurore
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
# prometheus_servers_targets:
# - prometheus-edc.adm.auro.re
# - prometheus-gs.adm.auro.re
# - prometheus-fleming.adm.auro.re
# - prometheus-pacaterie.adm.auro.re
# - prometheus-rives.adm.auro.re
# - prometheus-aurore.adm.auro.re
# - prometheus-ovh.adm.auro.re
#
# update_motd:
# prometheus_federate: >-
# Prometheus (en configuration fédération) est déployé (/etc/prometheus).
# roles:
# - prometheus_federate
# - update_motd
#
## Postgres Exporters
#- hosts: bdd,radius
# roles:
# - prometheus_postgres
#
## Monitor all hosts
#- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
# roles:
# - prometheus_node

View file

@ -0,0 +1,7 @@
---
prometheus__alertmanager_targets: []
prometheus__scraping: {}
prometheus__alert_rules: {}
prometheus__tsdb_retention_time: 15d
prometheus__page_title: "{{ inventory_hostname }}"
...

View file

@ -1,11 +1,11 @@
--- ---
- name: Restart Prometheus - name: Restart prometheus
service: systemd:
name: prometheus name: prometheus.service
state: restarted state: restarted
- name: Restart prometheus-snmp-exporter - name: Reload prometheus
service: systemd:
name: prometheus-snmp-exporter name: prometheus.service
state: restarted state: reloaded
... ...

View file

@ -1,69 +1,55 @@
--- ---
- name: Install Prometheus - name: Install prometheus
apt: apt:
update_cache: true
name: name:
- prometheus - prometheus
- prometheus-snmp-exporter
register: apt_result
retries: 3
until: apt_result is succeeded
- name: Configure Prometheus - name: Configure prometheus
template: template:
src: "{{ item }}.j2" src: prometheus.yml.j2
dest: "/etc/prometheus/{{ item }}" dest: /etc/prometheus/prometheus.yml
owner: prometheus owner: prometheus
group: prometheus group: prometheus
mode: u=r,g=r,o= mode: u=rw,g=r,o=r
loop: vars:
- prometheus.yml prometheus__config:
notify: Restart Prometheus alerting:
alertmanagers:
- static_configs:
- targets: "{{ prometheus__alertmanager_targets }}"
rule_files:
- /etc/prometheus/rules.yml
scrape_configs: "{{ prometheus__scraping
| prometheus__convert_jobs }}"
notify:
- Restart prometheus
- name: Creates directory for alerts - name: Configure prometheus default
file:
path: /etc/prometheus/alerts
state: directory
owner: prometheus
group: prometheus
mode: 0755
- name: Configure Prometheus alerts
template: template:
src: "{{ item }}.j2" src: default.j2
dest: "/etc/prometheus/alerts/{{ item }}" dest: /etc/default/prometheus
owner: prometheus owner: root
group: prometheus group: root
mode: u=r,g=r,o= mode: u=rw,g=r,o=r
loop: notify:
- server.rules.yml - Restart prometheus
- docker.rules.yml
- ups.rules.yml
- postgres.rules.yml
- environmental.rules.yml
- ilo.rules.yml
notify: Restart Prometheus
- name: Make Prometheus snmp-exporter listen on localhost only - name: Configure prometheus rules
lineinfile:
path: /etc/default/prometheus-snmp-exporter
regexp: '^ARGS='
line: "ARGS=\"--web.listen-address=127.0.0.1:9116\""
notify: Restart prometheus-snmp-exporter
# These files store SNMP OIDs
- name: Configure Prometheus snmp-exporter
template: template:
src: "{{ item }}.j2" src: rules.yml.j2
dest: "/etc/prometheus/{{ item }}" dest: /etc/prometheus/rules.yml
owner: prometheus owner: prometheus
group: prometheus group: prometheus
mode: u=r,g=r,o= mode: u=rw,g=r,o=r
loop: validate: "promtool check rules %s"
- snmp.yml vars:
notify: Restart prometheus-snmp-exporter prometheus__rules:
groups: "{{ prometheus__alert_rules
| dict2items(key_name='name', value_name='rules') }}"
notify:
- Reload prometheus
- name: Activate prometheus service - name: Enable prometheus
systemd: systemd:
name: prometheus name: prometheus
enabled: true enabled: true

View file

@ -0,0 +1,3 @@
{{ ansible_managed | comment }}
ARGS="--storage.tsdb.retention.time={{ prometheus__tsdb_retention_time | quote }} --web.page-title={{ prometheus__page_title | quote }}"

View file

@ -1,50 +0,0 @@
---
{{ ansible_managed | comment }}
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: docker.rules
rules:
- alert: ContainerDown
expr: docker_container_running_state != 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
Le container Docker est éteint / tombé
(container {{ raw('$labels.name') }})
- alert: ContainerFailed
expr: sum(increase(docker_container_restart_count[5m])) > 2
for: 0m
labels:
severity: critical
annotations:
summary: >-
Le container Docker redémarre souvent
(container {{ raw('$labels.name') }})
- alert: ContainerFailed
expr:
(
docker_container_cpu_used_total
/
docker_container_cpu_capacity_total
) * 100
> 30
for: 0m
labels:
severity: critical
annotations:
summary: >-
Le container Docker utilise beaucoup de CPU
(container {{ raw('$labels.name') }},
valeur {{ raw('$value | printf "%.1f"') }})
...

View file

@ -1,52 +0,0 @@
---
{{ ansible_managed | comment }}
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: environmental.rules
rules:
- alert: EnvironmentalTemperature
expr: rPDU2SensorTempHumidityStatusTempC / 10 > 30
for: 10m
labels:
severity: warning
annotations:
summary: >-
Température environnementale à {{ raw('$value') }}°
- alert: EnvironmentalTemperature
expr: rPDU2SensorTempHumidityStatusTempC / 10 > 40
for: 10m
labels:
severity: critical
annotations:
summary: >-
Température environnementale à {{ raw('$value') }}°
- alert: EnvironmentalTemperature
expr: xupsEnvRemoteTemp > 30
for: 10m
labels:
severity: warning
annotations:
summary: >-
Température environnementale à {{ raw('$value') }}°
- alert: EnvironmentalTemperature
expr: xupsEnvRemoteTemp > 40
for: 10m
labels:
severity: critical
annotations:
summary: >-
Température environnementale à {{ raw('$value') }}°
...

View file

@ -1,83 +0,0 @@
---
{{ ansible_managed | comment }}
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: ilo.rules
rules:
- alert: IloResilientMemoryDegraded
expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
for: 3m
labels:
severity: warning
annotations:
summary: >-
La mémoire vive n'est plus résiliente
({{ raw('$labels.cpqHeResilientMemCondition') }})
- alert: IloBiosSelfTestDegraded
expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
for: 3m
labels:
severity: critical
annotations:
summary: >-
Une erreur a été détectée lors du POST du serveur
({{ raw('$labels.cpqHeHWBiosCondition') }})
- alert: IloBatteryDegraded
expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
for: 3m
labels:
severity: warning
annotations:
summary: >-
La batterie est dégradée
({{ raw('$labels.cpqHeSysBatteryCondition') }})
- alert: IloTemperatureSensorDegraded
expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
for: 3m
labels:
severity: critical
annotations:
summary: >-
Le capteur de température est dégradé
({{ raw('$labels.cpqHeTemperatureCondition') }})
- alert: IloFanDegraded
expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1
for: 3m
labels:
severity: critical
annotations:
summary: >-
Le ventilateur est dégradé
({{ raw('$labels.cpqHeFltTolFanCondition') }})
- alert: IloPowerSupplyDegraded
expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1
for: 3m
labels:
severity: critical
annotations:
summary: >-
L'alimentation est dégradée
({{ raw('$labels.cpqHeFltTolPowerSupplyStatus') }})
- alert: IloOverrideSwitchState
expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1
for: 3m
labels:
severity: critical
annotations:
summary: >-
Le switch de réinitialisation n'est pas à l'état d'origine,
l'authentification est bypassée
...

View file

@ -1,219 +0,0 @@
---
{{ ansible_managed | comment }}
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: postgres.rules
rules:
- alert: PostgresqlDown
expr: pg_up == 0
for: 0m
labels:
severity: critical
annotations:
summary: Serveur PostgreSQL down
- alert: PostgresqlRestarted
expr: time() - pg_postmaster_start_time_seconds < 60
for: 0m
labels:
severity: critical
annotations:
summary: Serveur PostgreSQL redémarré
- alert: PostgresqlExporterError
expr: pg_exporter_last_scrape_error > 0
for: 0m
labels:
severity: critical
annotations:
summary: Erreur dans l'exporter PostgreSQL
- alert: PostgresqlReplicationLag
expr:
pg_replication_lag > 30
and
ON(instance) pg_replication_is_replica == 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
La réplication PostgreSQL lag ({{ raw('$value') }} > 30s)
(base de données {{ raw('$labels.datname') }} )
- alert: PostgresqlTableNotVaccumed
expr:
time() - pg_stat_user_tables_last_autovacuum
> 60 * 60 * 24
for: 0m
labels:
severity: warning
annotations:
summary: >-
Le démon autovacuum n'a pas été lancé depuis 24h
(base de données {{ raw('$labels.datname') }} )
- alert: PostgresqlTableNotAnalyzed
expr:
time() - pg_stat_user_tables_last_autoanalyze
> 60 * 60 * 24
for: 0m
labels:
severity: warning
annotations:
summary: >-
Table non-analysée depuis 24h
(base de données {{ raw('$labels.datname') }})
- alert: PostgresqlTooManyConnections
expr:
(
sum by (datname)
(pg_stat_activity_count{datname!~"template.*|postgres"})
) * 100
> pg_settings_max_connections * 80
for: 2m
labels:
severity: warning
annotations:
summary: >-
PostgreSQL a trop de connexions
({{ raw('$value | printf "%.1f"') }} > 80%)
(base de données {{ raw('$labels.datname') }})
- alert: PostgresqlDeadLocks
expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
for: 0m
labels:
severity: warning
annotations:
summary: >-
PostgreSQL a plus de 5 deadlocks.
(base de données {{ raw('$labels.datname') }} )
- alert: PostgresqlSlowQueries
expr: pg_slow_queries > 0
for: 2m
labels:
severity: warning
annotations:
summary: >-
Présence de requêtes lentes (slow-queries)
(base de données {{ raw('$labels.datname') }} )
- alert: PostgresqlHighRollbackRate
expr:
(
rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) /
rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
) * 100
> 20
for: 0m
labels:
severity: warning
annotations:
summary: >-
PostgreSQL a un taux de retour en arrière (rollback) élevé
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} %)
- alert: PostgresqlWaleReplicationStopped
expr: rate(pg_xlog_position_bytes[1m]) == 0
for: 0m
labels:
severity: critical
annotations:
summary: >-
Réplication de PostgreSQL WALE stoppée
(base de données {{ raw('$labels.datname') }} )
- alert: PostgresqlHighRateStatementTimeout
expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
for: 0m
labels:
severity: critical
annotations:
summary: >-
Beaucoup de requêtes PostgreSQL sont timeout
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
- alert: PostgresqlHighRateDeadlock
expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
PostgreSQL a un fort taux de deadlock
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
# - alert: PostgresqlReplicationLagBytes
# expr:
# (pg_xlog_position_bytes and pg_replication_is_replica == 0)
# - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1)
# > 1e+09
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} )
- alert: PostgresqlTooManyDeadTuples
expr:
(
(pg_stat_user_tables_n_dead_tup > 10000)
/ (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
for: 2m
labels:
severity: warning
annotations:
summary: >-
Les tuples morts PostgreSQL sont trop volumineux
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
- alert: PostgresqlSplitBrain
expr: count(pg_replication_is_replica == 0) != 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
Split Brain : trop de bases de données PostgreSQL primaires en mode lecture-écriture
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }} )
- alert: PostgresqlPromotedNode
expr:
pg_replication_is_replica
and
changes(pg_replication_is_replica[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: >-
Le serveur de secours PostgreSQL a été promu comme nœud principal
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }})
- alert: PostgresqlTooManyLocksAcquired
expr:
(
(sum (pg_locks_count))
/ (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
) * 100 > 20
for: 2m
labels:
severity: critical
annotations:
summary: >-
Trop de deadlocks acquis sur la base de données.
Si cette alerte se produit fréquemment, nous devrons peut-être augmenter
le paramètre postgres max_locks_per_transaction
(Valeur = {{ raw('$value | printf "%.1f"') }} )
...

View file

@ -1,6 +1,10 @@
--- ---
{{ ansible_managed | comment }} {{ ansible_managed | comment }}
{{ prometheus__config | to_nice_yaml }}
...
{#
global: global:
# scrape_interval is set to the global default (60s) # scrape_interval is set to the global default (60s)
# evaluation_interval is set to the global default (60s) # evaluation_interval is set to the global default (60s)
@ -156,5 +160,5 @@ scrape_configs:
- target_label: __address__ - target_label: __address__
replacement: 127.0.0.1:9116 replacement: 127.0.0.1:9116
{% endif %} {% endif %}
... ...
#}

View file

@ -0,0 +1,5 @@
---
{{ ansible_managed | comment }}
{{ prometheus__rules | to_nice_yaml }}
...

View file

@ -1,156 +0,0 @@
---
{{ ansible_managed | comment }}
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: server.rules
rules:
- alert: MachineDown
expr: up{instance!~".*.borne.auro.re$"} == 0
for: 3m
labels:
severity: critical
annotations:
summary: >-
Le collecteur {{ raw('$labels.job') }} ne marche plus
- alert: AccessPointDown
expr: up{instance=~".*.borne.auro.re$"} == 0
for: 3m
labels:
severity: warning
- alert: OutOfMemory
expr: >-
(
node_memory_MemFree_bytes
+ node_memory_Cached_bytes
+ node_memory_Buffers_bytes
) / node_memory_MemTotal_bytes * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: >-
{{ raw('$value | printf "%.1f"') }}% de mémoire
libre
- alert: HostSwapIsFillingUp
expr: >-
(
1 - (
node_memory_SwapFree_bytes
/ node_memory_SwapTotal_bytes
)
) * 100 >= 50
for: 3m
labels:
severity: warning
annotations:
summary: >-
La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}%
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 79
for: 3m
labels:
severity: critical
annotations:
summary: >-
La température de l'hôte est de {{ raw('$value') }}°C
({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
- alert: HostNodeOvertemperatureAlarm
expr: node_hwmon_temp_crit_alarm_celsius == 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
L'alarme de température de l'hôte est active
({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Le noyau a tué {{ raw('$value') }} processus (OOM killer)
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: >-
{{ raw('$value | print "%.1f"') }} erreur(s) ont été
corrigée(s) (EDAC)
- alert: OutOfDiskSpace
expr: >-
node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: >-
{{ raw('$value | printf "%.1f"') }}% d'espace libre pour
{{ raw('$labels.mountpoint') }}
- alert: OutOfInodes
expr: node_filesystem_files_free / node_filesystem_files * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: >-
{{ raw('$value | printf "%.1f"') }}% d'inodes
restants pour {{ raw('$labels.mountpoint') }}
- alert: CpuUsage
expr: >-
(
100 - avg by (instance) (
irate(node_cpu_seconds_total{mode="idle"}[5m])
) * 100
) > 75
for: 10m
labels:
severity: warning
annotations:
summary: >-
CPU à {{ raw('$value | printf "%.1f"') }}%
- alert: SystemdServiceFailed
expr: node_systemd_unit_state{state="failed"} == 1
for: 10m
labels:
severity: warning
annotations:
summary: >-
{{ raw('$labels.name') }} a échoué
- alert: LoadUsage
expr: node_load1 > 5
for: 2m
labels:
severity: warning
annotations:
summary: >
Charge à {{ raw('$value') }}
- alert: UnhealthyDisk
expr: smartmon_device_smart_healthy < 1
for: 10m
labels:
severity: "critical"
annotations:
summary: "Le Disque {{ raw('$labels.disk') }} n'est pas en bonne santé !"
...

View file

@ -1,708 +0,0 @@
---
{{ ansible_managed | comment }}
# TODOlist :
# - Faire fonctionner le monitoring des switchs défini ici
# * Configurer tous les switchs avec un compte SNMPv3
# * Mettre l'inventaire des switchs dans Ansible
# - Optimiser les règles pour les bornes Unifi,
# on pourrait indexer avec les SSID
eatonups:
walk:
- 1.3.6.1.2.1.33.1.2
- 1.3.6.1.2.1.33.1.3
- 1.3.6.1.2.1.33.1.4
- 1.3.6.1.4.1.534.1.6
- 1.3.6.1.4.1.318.1.1.10.2.3.2.1.4
get:
- 1.3.6.1.2.1.1.3.0
metrics:
- name: sysUpTime
oid: 1.3.6.1.2.1.1.3
type: gauge
help: The time (in hundredths of a second) since the network management portion
of the system was last re-initialized. - 1.3.6.1.2.1.1.3
- name: upsBatteryStatus
oid: 1.3.6.1.2.1.33.1.2.1
type: gauge
help: The indication of the capacity remaining in the UPS system's batteries -
1.3.6.1.2.1.33.1.2.1
- name: upsEstimatedMinutesRemaining
oid: 1.3.6.1.2.1.33.1.2.3
type: gauge
help: An estimate of the time to battery charge depletion under the present load
conditions if the utility power is off and remains off, or if it were to be
lost and remain off. - 1.3.6.1.2.1.33.1.2.3
- name: upsInputVoltage
oid: 1.3.6.1.2.1.33.1.3.3.1.3
type: gauge
help: The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3
indexes:
- labelname: upsInputLineIndex
type: gauge
- name: upsOutputSource
oid: 1.3.6.1.2.1.33.1.4.1
type: gauge
help: The present source of output power - 1.3.6.1.2.1.33.1.4.1
- name: upsOutputVoltage
oid: 1.3.6.1.2.1.33.1.4.4.1.2
type: gauge
help: The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2
indexes:
- labelname: upsOutputLineIndex
type: gauge
- name: upsOutputPower
oid: 1.3.6.1.2.1.33.1.4.4.1.4
type: gauge
help: The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4
indexes:
- labelname: upsOutputLineIndex
type: gauge
- name: upsOutputPercentLoad
oid: 1.3.6.1.2.1.33.1.4.4.1.5
type: gauge
help: The percentage of the UPS power capacity presently being used on this output
line, i.e., the greater of the percent load of true power capacity and the percent
load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5
indexes:
- labelname: upsOutputLineIndex
type: gauge
- name: xupsEnvRemoteTemp
oid: 1.3.6.1.4.1.318.1.1.10.2.3.2.1.4
type: gauge
help: The reading of an EMP's temperature sensor (APC MIB) - 1.3.6.1.4.1.318.1.1.10.2.3.2.1.4
- name: xupsEnvRemoteTemp
oid: 1.3.6.1.4.1.534.1.6.5
type: gauge
help: The reading of an EMP's temperature sensor (Eaton MIB) - 1.3.6.1.4.1.534.1.6.5
- name: xupsEnvRemoteHumidity
oid: 1.3.6.1.4.1.534.1.6.6
type: gauge
help: The reading of an EMP's humidity sensor. - 1.3.6.1.4.1.534.1.6.6
version: 1
auth:
community: public
procurve_switch:
walk:
- 1.3.6.1.2.1.31.1.1.1.10
- 1.3.6.1.2.1.31.1.1.1
- 1.3.6.1.2.1.2.2.1.2
- 1.3.6.1.2.1.31.1.1.1.18
get:
- 1.3.6.1.2.1.1.3.0
- 1.3.6.1.2.1.1.5.0
- 1.3.6.1.2.1.1.6.0
metrics:
- name: sysUpTime
oid: 1.3.6.1.2.1.1.3
type: gauge
help: The time (in hundredths of a second) since the network management
portion of the system was last re-initialized. - 1.3.6.1.2.1.1.3
- name: sysName
oid: 1.3.6.1.2.1.1.5
type: DisplayString
help: An administratively-assigned name for this managed node
- 1.3.6.1.2.1.1.5
- name: sysLocation
oid: 1.3.6.1.2.1.1.6
type: DisplayString
help: The physical location of this node (e.g., 'telephone closet, 3rd
floor') - 1.3.6.1.2.1.1.6
- name: ifHCOutOctets
oid: 1.3.6.1.2.1.31.1.1.1.10
type: counter
help: The total number of octets transmitted out of the interface,
including framing characters - 1.3.6.1.2.1.31.1.1.1.10
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifDescr
oid: 1.3.6.1.2.1.2.2.1.2
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
- name: ifHCInOctets
oid: 1.3.6.1.2.1.31.1.1.1.6
type: counter
help: The total number of octets received on the interface, including
framing characters - 1.3.6.1.2.1.31.1.1.1.6
indexes:
- labelname: ifIndex
type: gauge
lookups:
- labels:
- ifIndex
labelname: ifDescr
oid: 1.3.6.1.2.1.2.2.1.2
type: DisplayString
- labels:
- ifIndex
labelname: ifName
oid: 1.3.6.1.2.1.31.1.1.1.1
type: DisplayString
version: 2
auth:
community: "{{ snmp_switch_community }}"
ubiquiti_unifi:
walk:
- 1.3.6.1.4.1.41112.1.6
get:
- 1.3.6.1.2.1.1.5.0
- 1.3.6.1.2.1.1.6.0
metrics:
# Pour faire une WifiMap un jour, on peut entrer la location dans la conf des bornes
# - name: sysLocation
# oid: 1.3.6.1.2.1.1.6
# type: DisplayString
# help: The physical location of this node (e.g., 'telephone closet, 3rd floor')
# - 1.3.6.1.2.1.1.6
- name: unifiVapIndex
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.1
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.1'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapChannel
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.4
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.4'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapEssId
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.6
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.6'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapName
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.7
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.7'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifi_vap_num_stations
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8'
indexes:
- labelname: unifi_vap_index
type: gauge
lookups:
- labels: [unifi_vap_index]
labelname: unifi_vap_essid
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.6
type: DisplayString
- labels: [unifi_vap_index]
labelname: unifi_vap_radio
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9
type: DisplayString
- labels: []
labelname: unifi_vap_index
# - name: unifiVapNumStations
# oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8
# type: gauge
# help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8'
# indexes:
# - labelname: unifiVapIndex
# type: gauge
- name: unifiVapRadio
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.9'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxBytes
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.10
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.10'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxCrypts
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.11
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.11'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxDropped
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.12
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.12'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxErrors
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.13
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.13'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxFrags
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.14
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.14'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxPackets
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.15
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.15'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxBytes
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.16
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.16'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxDropped
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.17
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.17'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxErrors
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.18
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.18'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxPackets
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.19
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.19'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxRetries
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.20
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.20'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxPower
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.21
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.21'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapUp
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.22
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.22'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapUsage
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.23
type: DisplayString
help: guest or regular user - 1.3.6.1.4.1.41112.1.6.1.2.1.23
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiIfIndex
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.1
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.1'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfName
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.5
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.5'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxBytes
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.6
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.6'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxDropped
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.7
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.7'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxError
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.8
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.8'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxMulticast
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.9
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.9'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxPackets
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.10
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.10'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfTxBytes
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.12
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.12'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfTxDropped
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.13
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.13'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfTxError
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.14
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.14'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfTxPackets
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.15
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.15'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiApSystemModel
oid: 1.3.6.1.4.1.41112.1.6.3.3
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.3.3'
- name: unifiApSystemUptime
oid: 1.3.6.1.4.1.41112.1.6.3.5
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.3.5'
version: 3
auth:
security_level: authPriv
username: snmp_prometheus
password: {{ snmp_unifi_password }}
auth_protocol: SHA
priv_protocol: AES
priv_password: {{ snmp_unifi_password }}
apc_pdu:
walk:
- 1.3.6.1.4.1.318.1.1.26.10.2.2.1.8
- 1.3.6.1.4.1.318.1.1.26.4.3.1.4
- 1.3.6.1.4.1.318.1.1.26.4.3.1.5
- 1.3.6.1.4.1.318.1.1.26.4.3.1.6
- 1.3.6.1.4.1.318.1.1.26.6.3.1.9
- 1.3.6.1.4.1.318.1.1.26.9.4.3.1.7
metrics:
- name: rPDU2SensorTempHumidityStatusTempC
oid: 1.3.6.1.4.1.318.1.1.26.10.2.2.1.8
type: gauge
help: Sensor temperature reading in tenths of degrees Celsius - 1.3.6.1.4.1.318.1.1.26.10.2.2.1.8
indexes:
- labelname: rPDU2SensorTempHumidityStatusIndex
type: gauge
- name: rPDU2DeviceStatusLoadState
oid: 1.3.6.1.4.1.318.1.1.26.4.3.1.4
type: gauge
help: Indicates the present load status of the Rack PDU - 1.3.6.1.4.1.318.1.1.26.4.3.1.4
indexes:
- labelname: rPDU2DeviceStatusIndex
type: gauge
- name: rPDU2DeviceStatusPower
oid: 1.3.6.1.4.1.318.1.1.26.4.3.1.5
type: gauge
help: The power consumption of the Rack PDU load in hundredths of kilowatts -
1.3.6.1.4.1.318.1.1.26.4.3.1.5
indexes:
- labelname: rPDU2DeviceStatusIndex
type: gauge
- name: rPDU2DeviceStatusPeakPower
oid: 1.3.6.1.4.1.318.1.1.26.4.3.1.6
type: gauge
help: The peak power consumption of the Rack PDU load in hundredths of kilowatts
- 1.3.6.1.4.1.318.1.1.26.4.3.1.6
indexes:
- labelname: rPDU2DeviceStatusIndex
type: gauge
- name: rPDU2PhaseStatusPowerFactor
oid: 1.3.6.1.4.1.318.1.1.26.6.3.1.9
type: gauge
help: Indicates the load power factor, in hundredths, of the Rack PDU phase being
queried - 1.3.6.1.4.1.318.1.1.26.6.3.1.9
indexes:
- labelname: rPDU2PhaseStatusIndex
type: gauge
- name: rPDU2OutletMeteredStatusPower
oid: 1.3.6.1.4.1.318.1.1.26.9.4.3.1.7
type: gauge
help: Indicates the power draw of the load on the Rack PDU outlet being queried
- 1.3.6.1.4.1.318.1.1.26.9.4.3.1.7
indexes:
- labelname: rPDU2OutletMeteredStatusIndex
type: gauge
version: 3
auth:
security_level: authPriv
username: {{ snmp_pdu_user }}
password: {{ snmp_pdu_password }}
auth_protocol: SHA
priv_protocol: AES
priv_password: {{ snmp_pdu_password }}
ilo:
walk:
- 1.3.6.1.4.1.232.6.2.14.4 # Resilient memory
- 1.3.6.1.4.1.232.6.2.15.3 # Power meter
- 1.3.6.1.4.1.232.6.2.16.1 # POST tests
- 1.3.6.1.4.1.232.6.2.17.1 # Battery
- 1.3.6.1.4.1.232.6.2.6.8.1.3 # Temperature sensors location
- 1.3.6.1.4.1.232.6.2.6.8.1.4 # Temperature sensors value
- 1.3.6.1.4.1.232.6.2.6.8.1.5 # Temperature sensors limit
- 1.3.6.1.4.1.232.6.2.6.8.1.6 # Temperature sensors condition
- 1.3.6.1.4.1.232.6.2.6.7.1.3 # Fans location
- 1.3.6.1.4.1.232.6.2.6.7.1.9 # Fans condition
- 1.3.6.1.4.1.232.6.2.9.3.1.5 # Power supply
- 1.3.6.1.4.1.232.9.2.2 # iLO
metrics:
- name: cpqHeResilientMemCondition
oid: 1.3.6.1.4.1.232.6.2.14.4
type: EnumAsStateSet
help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4
enum_values:
1: other
2: ok
3: degraded
4: failed
- name: cpqHePowerMeterCurrReading
oid: 1.3.6.1.4.1.232.6.2.15.3
type: gauge
help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
- name: cpqHeHWBiosCondition
oid: 1.3.6.1.4.1.232.6.2.16.1
type: EnumAsStateSet
help: This value indicates an error has been detected during Pre-OS Test (POST)
or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
enum_values:
1: other
2: ok
3: degraded
4: failed
- name: cpqHeSysBatteryCondition
oid: 1.3.6.1.4.1.232.6.2.17.1
type: EnumAsStateSet
help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1
indexes:
- labelname: cpqHeSysBatteryChassis
type: gauge
- labelname: cpqHeSysBatteryIndex
type: gauge
enum_values:
1: other
2: ok
3: degraded
4: failed
- name: cpqHeTemperatureLocale
oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
type: EnumAsInfo
help: This specifies the location of the temperature sensor present in the system.
- 1.3.6.1.4.1.232.6.2.6.8.1.3
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
enum_values:
1: other
2: unknown
3: system
4: systemBoard
5: ioBoard
6: cpu
7: memory
8: storage
9: removableMedia
10: powerSupply
11: ambient
12: chassis
13: bridgeCard
- name: cpqHeTemperatureCelsius
oid: 1.3.6.1.4.1.232.6.2.6.8.1.4
type: gauge
help: This is the current temperature sensor reading in degrees celsius - 1.3.6.1.4.1.232.6.2.6.8.1.4
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
- name: cpqHeTemperatureThreshold
oid: 1.3.6.1.4.1.232.6.2.6.8.1.5
type: gauge
help: This is the shutdown threshold temperature sensor setting in degrees celsius
- 1.3.6.1.4.1.232.6.2.6.8.1.5
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
- name: cpqHeTemperatureCondition
oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
type: EnumAsStateSet
help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
enum_values:
1: other
2: ok
3: degraded
4: failed
- name: cpqHeFltTolFanLocale
oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
type: EnumAsInfo
help: This specifies the location of the fan present in the system.
- 1.3.6.1.4.1.232.6.2.6.7.1.3
indexes:
- labelname: cpqHeFltTolFanChassis
type: gauge
- labelname: cpqHeFltTolFanIndex
type: gauge
enum_values:
1: other
2: unknown
3: system
4: systemBoard
5: ioBoard
6: cpu
7: memory
8: storage
9: removableMedia
10: powerSupply
11: ambient
12: chassis
13: bridgeCard
- name: cpqHeFltTolFanCondition
oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
type: EnumAsStateSet
help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
indexes:
- labelname: cpqHeFltTolFanChassis
type: gauge
- labelname: cpqHeFltTolFanIndex
type: gauge
enum_values:
1: other
2: ok
3: degraded
4: failed
- name: cpqHeFltTolPowerSupplyStatus
oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
type: EnumAsStateSet
help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
indexes:
- labelname: cpqHeFltTolPowerSupplyChassis
type: gauge
- labelname: cpqHeFltTolPowerSupplyBay
type: gauge
enum_values:
1: noError
2: generalFailure
3: bistFailure
4: fanFailure
5: tempFailure
6: interlockOpen
7: epromFailed
8: vrefFailed
9: dacFailed
10: ramTestFailed
11: voltageChannelFailed
12: orringdiodeFailed
13: brownOut
14: giveupOnStartup
15: nvramInvalid
16: calibrationTableInvalid
17: noPowerInput
- name: cpqSm2CntlrInterfaceStatus
oid: 1.3.6.1.4.1.232.9.2.2.17
type: EnumAsStateSet
help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
enum_values:
1: other
2: ok
3: notResponding
- name: cpqSm2CntlriLOSecurityOverrideSwitchState
oid: 1.3.6.1.4.1.232.9.2.2.27
type: EnumAsStateSet
help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
enum_values:
1: notSupported
2: set
3: notSet
- name: cpqSm2CntlrLicenseActive
oid: 1.3.6.1.4.1.232.9.2.2.30
type: EnumAsStateSet
help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
enum_values:
1: none
2: iloAdvanced
3: iloLight
4: iloAdvancedBlade
5: iloStandard
6: iloEssentials
7: iloScaleOut
8: iloAdvancedPremiumSecurity
- name: cpqSm2CntlrServerPowerState
oid: 1.3.6.1.4.1.232.9.2.2.32
type: EnumAsStateSet
help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
enum_values:
1: unknown
2: poweredOff
3: poweredOn
4: insufficientPowerOrPowerOnDenied
version: 3
# Reduce timeout to retry faster
timeout: 1s
auth:
security_level: authPriv
username: {{ snmp_ilo_user }}
password: {{ snmp_ilo_auth }}
auth_protocol: SHA
priv_protocol: AES
priv_password: {{ snmp_ilo_priv }}
...

View file

@ -1,87 +0,0 @@
---
{{ ansible_managed | comment }}
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: ups.rules
rules:
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 0m
labels:
severity: critical
annotations:
summary: >-
Source d'alimentation changée
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 3
for: 0m
labels:
severity: warning
annotations:
summary: >-
État de la batterie faible
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 4
for: 0m
labels:
severity: critical
annotations:
summary: >-
État de la batterie critique
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 3m
labels:
severity: critical
annotations:
summary: >-
Charge de {{ raw('$value | printf "%.1f"') }}%
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension d'entrée de {{ raw('$value') }}V
- alert: UpsWrongOutputVoltage
expr: >-
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
< 3 * stddev_over_time(upsOutputVoltage[1d])
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension de sortie de {{ raw('$value') }}V
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 8
for: 0m
labels:
severity: warning
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 5
for: 0m
labels:
severity: critical
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
...