From 456c6b47b8351ed3b29cb409767428a8eb927ea1 Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Fri, 31 Dec 2021 19:38:35 +0100 Subject: [PATCH 01/15] prometheus: add missing snmp_ilo variables --- playbooks/prometheus.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/playbooks/prometheus.yml b/playbooks/prometheus.yml index 8a27497..e8fcf95 100755 --- a/playbooks/prometheus.yml +++ b/playbooks/prometheus.yml @@ -34,6 +34,9 @@ snmp_switch_community: "{{ vault_snmp_switch_community }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_servers_targets: | {{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }} @@ -59,6 +62,9 @@ snmp_switch_community: "{{ vault_snmp_switch_community }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_ups_snmp_targets: - ups-ec-1.ups.auro.re @@ -85,6 +91,9 @@ snmp_switch_community: "{{ vault_snmp_switch_community }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_servers_targets: | {{ groups['gs_pve'] + groups['gs_vm'] | list | sort }} @@ -111,6 +120,9 @@ snmp_switch_community: "{{ vault_snmp_switch_community }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_ups_snmp_targets: - ups-r3-1.ups.auro.re @@ -136,6 +148,9 @@ snmp_switch_community: "{{ vault_snmp_switch_community }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_servers_targets: | {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }} @@ -173,6 +188,9 @@ snmp_switch_community: "{{ vault_snmp_switch_community }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_servers_targets: | {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} @@ -194,6 +212,9 @@ snmp_unifi_password: "{{ vault_snmp_unifi_password }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_servers_targets: - prometheus-edc.adm.auro.re From fdeaa355ad7e8d697ccf1baca4ccd5dc5e8a60f6 Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Fri, 31 Dec 2021 19:39:23 +0100 Subject: [PATCH 02/15] prometheus: use longer timeout for iLO scraping --- roles/prometheus/templates/prometheus.yml.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index e80e355..5b967a5 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -163,6 +163,7 @@ scrape_configs: static_configs: - targets: {{ prometheus_ilo_targets | to_json }} metrics_path: /snmp + scrape_timeout: 30s params: module: - ilo From 860a26a8dcd2b4b45e2937089eccad5cdb2aa110 Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Fri, 31 Dec 2021 19:39:38 +0100 Subject: [PATCH 03/15] prometheus: federate ilo metrics --- roles/prometheus_federate/templates/prometheus.yml.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/roles/prometheus_federate/templates/prometheus.yml.j2 b/roles/prometheus_federate/templates/prometheus.yml.j2 index fc848db..524008c 100644 --- a/roles/prometheus_federate/templates/prometheus.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus.yml.j2 @@ -40,4 +40,5 @@ scrape_configs: - '{job="docker"}' - '{job="switch_snmp"}' - '{job="ipmi"}' + - '{job="ilo"}' ... From bcded46ed653e716a36199efc788d970955f88e5 Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Fri, 31 Dec 2021 19:40:22 +0100 Subject: [PATCH 04/15] prometheus: remove JSON targets cleanup --- roles/prometheus/tasks/main.yml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 98d0c52..896e34f 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -63,20 +63,6 @@ - snmp.yml notify: Restart prometheus-snmp-exporter -- name: Remove old targets - file: - path: "{{ item }}" - state: absent - loop: - - /etc/prometheus/targets.json - - /etc/prometheus/targets_unifi_snmp.json - - /etc/prometheus/targets_switch_snmp.json - - /etc/prometheus/targets_ups_snmp.json - - /etc/prometheus/targets_docker.json - - /etc/prometheus/targets_postgres.json - - /etc/prometheus/targets_apc_pdu_snmp.json - - /etc/prometheus/targets_ipmi.json - - name: Activate prometheus service systemd: name: prometheus From 7d9ff449a7bfbf98020319b0ca166b42058eec31 Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Fri, 31 Dec 2021 19:41:51 +0100 Subject: [PATCH 05/15] hosts: remove passbolt --- hosts | 1 - 1 file changed, 1 deletion(-) diff --git a/hosts b/hosts index 29bc3b3..65510bb 100644 --- a/hosts +++ b/hosts @@ -58,7 +58,6 @@ proxy-ovh.adm.auro.re [ovh_vm] serge.adm.auro.re -passbolt.adm.auro.re docker-ovh.adm.auro.re switchs-manager.adm.auro.re ldap-replica-ovh.adm.auro.re From 733e9f555dcb7f9b85c2e549637a3d8ba85d7f29 Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Fri, 31 Dec 2021 20:03:04 +0100 Subject: [PATCH 06/15] prometheus: add _snmp suffix to ilo target --- playbooks/prometheus.yml | 2 +- roles/prometheus/templates/prometheus.yml.j2 | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/playbooks/prometheus.yml b/playbooks/prometheus.yml index e8fcf95..0b00862 100755 --- a/playbooks/prometheus.yml +++ b/playbooks/prometheus.yml @@ -17,7 +17,7 @@ {{ groups['fleming_unifi'] | list | sort }} prometheus_ipmi_targets: | {{ groups['fleming_pve'] + groups['fleming_server'] | list | sort }} - prometheus_ilo_targets: | + prometheus_ilo_snmp_targets: | {{ groups['fleming_ilo'] | list | sort }} update_motd: diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index 5b967a5..6c1fbda 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -158,10 +158,10 @@ scrape_configs: replacement: '$1:9290' {% endif %} -{% if prometheus_ilo_targets is defined %} - - job_name: ilo +{% if prometheus_ilo_snmp_targets is defined %} + - job_name: ilo_snmp static_configs: - - targets: {{ prometheus_ilo_targets | to_json }} + - targets: {{ prometheus_ilo_snmp_targets | to_json }} metrics_path: /snmp scrape_timeout: 30s params: From 16a2d3647295c25c4127664e55fb5eb083a03345 Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Fri, 31 Dec 2021 20:03:27 +0100 Subject: [PATCH 07/15] prometheus: add machines to ilo_snmp --- hosts | 18 ++++++++++++++++++ playbooks/prometheus.yml | 10 ++++++++++ 2 files changed, 28 insertions(+) diff --git a/hosts b/hosts index 65510bb..5876707 100644 --- a/hosts +++ b/hosts @@ -45,6 +45,9 @@ grafana.adm.auro.re [aurore_testing_vm] +[aurore_ilo] +escalope-ilo.adm.auro.re + ############################################################################### # OVH @@ -210,6 +213,10 @@ unifi-pacaterie.adm.auro.re routeur-pacaterie.adm.auro.re routeur-pacaterie-backup.adm.auro.re +[pacaterie_ilo] +mordred-ilo.adm.auro.re +titan-ilo.adm.auro.re + [pacaterie_unifi] pc-1-1.borne.auro.re pn-0-1.borne.auro.re @@ -265,6 +272,10 @@ radius-edc-backup.adm.auro.re ldap-replica-edc.adm.auro.re prometheus-edc.adm.auro.re +[edc_ilo] +caradoc-ilo.adm.auro.re +chapalux-ilo.adm.auro.re + [edc_unifi] ee-2-1.borne.auro.re ee-2-2.borne.auro.re @@ -296,6 +307,10 @@ radius-gs-backup.adm.auro.re prometheus-gs.adm.auro.re ldap-replica-gs.adm.auro.re +[gs_ilo] +lancelot-ilo.adm.auro.re +odin-ilo.adm.auro.re + [gs_unifi] ga-0-1.borne.auro.re ga-1-1.borne.auro.re @@ -365,6 +380,9 @@ dns-rives.adm.auro.re radius-rives.adm.auro.re routeur-rives.adm.auro.re +[rives_ilo] +loki-ilo.adm.auro.re + [rives_unifi] r1-1-1.borne.auro.re r1-1-2.borne.auro.re diff --git a/playbooks/prometheus.yml b/playbooks/prometheus.yml index 0b00862..16e6499 100755 --- a/playbooks/prometheus.yml +++ b/playbooks/prometheus.yml @@ -47,6 +47,8 @@ - ups-ps-1.ups.auro.re prometheus_ipmi_targets: | {{ groups['pacaterie_pve'] | list | sort }} + prometheus_ilo_snmp_targets: | + {{ groups['pacaterie_ilo'] | list | sort }} update_motd: prometheus: >- @@ -76,6 +78,8 @@ {{ groups['edc_unifi'] | list | sort }} prometheus_ipmi_targets: | {{ groups['edc_pve'] + groups['edc_server'] | list | sort }} + prometheus_ilo_snmp_targets: | + {{ groups['edc_ilo'] | list | sort }} update_motd: prometheus: >- @@ -105,6 +109,8 @@ - pdu-ga-1.ups.auro.re prometheus_ipmi_targets: | {{ groups['gs_pve'] | list | sort }} + prometheus_ilo_snmp_targets: | + {{ groups['gs_ilo'] | list | sort }} update_motd: prometheus: >- @@ -133,6 +139,8 @@ {{ groups['rives_unifi'] | list | sort }} prometheus_ipmi_targets: | {{ groups['rives_pve'] | list | sort }} + prometheus_ilo_snmp_targets: | + {{ groups['rives_ilo'] | list | sort }} update_motd: prometheus: >- @@ -173,6 +181,8 @@ - sw-r3-core.switch.auro.re prometheus_ipmi_targets: | {{ groups['aurore_pve'] | list | sort }} + prometheus_ilo_snmp_targets: | + {{ groups['aurore_ilo'] | list | sort }} update_motd: prometheus: >- From 28305585458c661d8bc966840394b80ca45348df Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Fri, 31 Dec 2021 20:04:24 +0100 Subject: [PATCH 08/15] prometheus_federation: add ilo_snmp and remove django --- roles/prometheus_federate/templates/prometheus.yml.j2 | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/roles/prometheus_federate/templates/prometheus.yml.j2 b/roles/prometheus_federate/templates/prometheus.yml.j2 index 524008c..8feabaa 100644 --- a/roles/prometheus_federate/templates/prometheus.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus.yml.j2 @@ -34,11 +34,10 @@ scrape_configs: - '{job="postgresql"}' - '{job="prometheus"}' - '{job="unifi_snmp"}' - - '{job="django"}' - '{job="ups_snmp"}' - '{job="apc_pdu_snmp"}' - '{job="docker"}' - '{job="switch_snmp"}' - '{job="ipmi"}' - - '{job="ilo"}' + - '{job="ilo_snmp"}' ... From 40d9108b37b77b32a3631d5f6a520c7dc3af934e Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Fri, 31 Dec 2021 21:19:35 +0100 Subject: [PATCH 09/15] prometheus: add iLO alert rules --- roles/prometheus/tasks/main.yml | 2 +- roles/prometheus/templates/ilo.rules.yml.j2 | 58 +++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 roles/prometheus/templates/ilo.rules.yml.j2 diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 896e34f..783bdad 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -38,10 +38,10 @@ loop: - server.rules.yml - docker.rules.yml - - django.rules.yml - ups.rules.yml - postgres.rules.yml - environmental.rules.yml + - ilo.rules.yml notify: Restart Prometheus - name: Make Prometheus snmp-exporter listen on localhost only diff --git a/roles/prometheus/templates/ilo.rules.yml.j2 b/roles/prometheus/templates/ilo.rules.yml.j2 new file mode 100644 index 0000000..4a0bc37 --- /dev/null +++ b/roles/prometheus/templates/ilo.rules.yml.j2 @@ -0,0 +1,58 @@ +--- +{{ ansible_managed | comment }} + +{% macro raw(string) -%} +{{ "{{" }} {{ string }} {{ "}}" }} +{%- endmacro %} + +groups: + + - name: ilo.rules + rules: + + - alert: IloBiosSelfTestDegraded + expr: cpqHeHWBiosCondition >= 3 + for: 3m + labels: + severity: critical + annotations: + summary: >- + Une erreur a été détectée lors du POST du serveur + + - alert: IloTemperatureSensorDegraded + expr: cpqHeTemperatureCondition >= 3 + for: 3m + labels: + severity: critical + annotations: + summary: >- + Le capteur de température est dégradé + + - alert: IloFanDegraded + expr: cpqHeFltTolFanCondition >= 3 + for: 3m + labels: + severity: critical + annotations: + summary: >- + Le ventilateur est dégradé + + - alert: IloPowerSupplyDegraded + expr: cpqHeFltTolPowerSupplyStatus >= 2 + for: 3m + labels: + severity: critical + annotations: + summary: >- + L'alimentation est dégradée + + - alert: IloOverrideSwitchState + expr: cpqSm2CntlriLOSecurityOverrideSwitchState == 2 + for: 3m + labels: + severity: critical + annotations: + summary: >- + Le switch de réinitialisation n'est pas à l'état d'origine + +... From 9b53daf42ad6c3be279fbd22faf1158b3418e3b3 Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Sat, 1 Jan 2022 11:58:33 +0100 Subject: [PATCH 10/15] prometheus: ignore ups-ec-2.ups.auro.re --- playbooks/prometheus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/prometheus.yml b/playbooks/prometheus.yml index 16e6499..a82e41d 100755 --- a/playbooks/prometheus.yml +++ b/playbooks/prometheus.yml @@ -70,7 +70,7 @@ prometheus_ups_snmp_targets: - ups-ec-1.ups.auro.re - - ups-ec-2.ups.auro.re + # - ups-ec-2.ups.auro.re - ups-ec-3.ups.auro.re prometheus_servers_targets: | {{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }} From 5ab3dcdac20b8de027b425972711a46c8ff8a2ef Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Sat, 1 Jan 2022 11:59:23 +0100 Subject: [PATCH 11/15] prometheus: use enums for iLO SNMP --- .../prometheus/templates/django.rules.yml.j2 | 106 ------------------ roles/prometheus/templates/ilo.rules.yml.j2 | 17 ++- roles/prometheus/templates/snmp.yml.j2 | 72 ++---------- 3 files changed, 19 insertions(+), 176 deletions(-) delete mode 100644 roles/prometheus/templates/django.rules.yml.j2 diff --git a/roles/prometheus/templates/django.rules.yml.j2 b/roles/prometheus/templates/django.rules.yml.j2 deleted file mode 100644 index fddd398..0000000 --- a/roles/prometheus/templates/django.rules.yml.j2 +++ /dev/null @@ -1,106 +0,0 @@ -# {{ ansible_managed }} -{# As this is also Jinja2 it will conflict without a raw block #} -{% raw %} -groups: -- name: django.rules - rules: - - record: job:django_http_requests_before_middlewares_total:sum_rate30s - expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job) - - record: job:django_http_requests_unknown_latency_total:sum_rate30s - expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job) - - record: job:django_http_ajax_requests_total:sum_rate30s - expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job) - - record: job:django_http_responses_before_middlewares_total:sum_rate30s - expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job) - - record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s - expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s])) - BY (job) - - record: job:django_http_requests_body_total_bytes:sum_rate30s - expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job) - - record: job:django_http_responses_streaming_total:sum_rate30s - expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job) - - record: job:django_http_responses_body_total_bytes:sum_rate30s - expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job) - - record: job:django_http_requests_total:sum_rate30s - expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job) - - record: job:django_http_requests_total_by_method:sum_rate30s - expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method) - - record: job:django_http_requests_total_by_transport:sum_rate30s - expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport) - - record: job:django_http_requests_total_by_view:sum_rate30s - expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, - view) - - record: job:django_http_requests_total_by_view_transport_method:sum_rate30s - expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, - view, transport, method) - - record: job:django_http_responses_total_by_templatename:sum_rate30s - expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename) - - record: job:django_http_responses_total_by_status:sum_rate30s - expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status) - - record: job:django_http_responses_total_by_charset:sum_rate30s - expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset) - - record: job:django_http_exceptions_total_by_type:sum_rate30s - expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type) - - record: job:django_http_exceptions_total_by_view:sum_rate30s - expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view) - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "50" - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "95" - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99" - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99.9" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "50" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "95" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99.9" - - record: job:django_model_inserts_total:sum_rate1m - expr: sum(rate(django_model_inserts_total[1m])) BY (job, model) - - record: job:django_model_updates_total:sum_rate1m - expr: sum(rate(django_model_updates_total[1m])) BY (job, model) - - record: job:django_model_deletes_total:sum_rate1m - expr: sum(rate(django_model_deletes_total[1m])) BY (job, model) - - record: job:django_db_new_connections_total:sum_rate30s - expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor) - - record: job:django_db_new_connection_errors_total:sum_rate30s - expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor) - - record: job:django_db_execute_total:sum_rate30s - expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor) - - record: job:django_db_execute_many_total:sum_rate30s - expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor) - - record: job:django_db_errors_total:sum_rate30s - expr: sum(rate(django_db_errors_total[30s])) BY (alias, vendor, type) - - record: job:django_migrations_applied_total:max - expr: max(django_migrations_applied_total) BY (job, connection) - - record: job:django_migrations_unapplied_total:max - expr: max(django_migrations_unapplied_total) BY (job, connection) -{% endraw %} diff --git a/roles/prometheus/templates/ilo.rules.yml.j2 b/roles/prometheus/templates/ilo.rules.yml.j2 index 4a0bc37..47a3e8f 100644 --- a/roles/prometheus/templates/ilo.rules.yml.j2 +++ b/roles/prometheus/templates/ilo.rules.yml.j2 @@ -11,48 +11,53 @@ groups: rules: - alert: IloBiosSelfTestDegraded - expr: cpqHeHWBiosCondition >= 3 + expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1 for: 3m labels: severity: critical annotations: summary: >- Une erreur a été détectée lors du POST du serveur + ({{ raw('$labels.cpqHeHWBiosCondition') }}) - alert: IloTemperatureSensorDegraded - expr: cpqHeTemperatureCondition >= 3 + expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1 for: 3m labels: severity: critical annotations: summary: >- Le capteur de température est dégradé + ({{ raw('$labels.cpqHeTemperatureCondition') }}) - alert: IloFanDegraded - expr: cpqHeFltTolFanCondition >= 3 + expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1 for: 3m labels: severity: critical annotations: summary: >- Le ventilateur est dégradé + ({{ raw('$labels.cpqHeFltTolFanCondition') }}) - alert: IloPowerSupplyDegraded - expr: cpqHeFltTolPowerSupplyStatus >= 2 + expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1 for: 3m labels: severity: critical annotations: summary: >- L'alimentation est dégradée + ({{ raw('$labels.cpqHeFltTolPowerSupplyStatus') }}) - alert: IloOverrideSwitchState - expr: cpqSm2CntlriLOSecurityOverrideSwitchState == 2 + expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1 for: 3m labels: severity: critical annotations: summary: >- - Le switch de réinitialisation n'est pas à l'état d'origine + Le switch de réinitialisation n'est pas à l'état d'origine, + l'authentification est bypassée ... diff --git a/roles/prometheus/templates/snmp.yml.j2 b/roles/prometheus/templates/snmp.yml.j2 index d593a3d..1171d4c 100644 --- a/roles/prometheus/templates/snmp.yml.j2 +++ b/roles/prometheus/templates/snmp.yml.j2 @@ -497,7 +497,7 @@ ilo: help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3 - name: cpqHeHWBiosCondition oid: 1.3.6.1.4.1.232.6.2.16.1 - type: gauge + type: EnumAsStateSet help: This value indicates an error has been detected during Pre-OS Test (POST) or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1 enum_values: @@ -505,25 +505,6 @@ ilo: 2: ok 3: degraded 4: failed - - name: cpqHeTemperatureChassis - oid: 1.3.6.1.4.1.232.6.2.6.8.1.1 - type: gauge - help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.8.1.1 - indexes: - - labelname: cpqHeTemperatureChassis - type: gauge - - labelname: cpqHeTemperatureIndex - type: gauge - - name: cpqHeTemperatureIndex - oid: 1.3.6.1.4.1.232.6.2.6.8.1.2 - type: gauge - help: A number that uniquely specifies this temperature sensor description. - - 1.3.6.1.4.1.232.6.2.6.8.1.2 - indexes: - - labelname: cpqHeTemperatureChassis - type: gauge - - labelname: cpqHeTemperatureIndex - type: gauge - name: cpqHeTemperatureLocale oid: 1.3.6.1.4.1.232.6.2.6.8.1.3 type: EnumAsInfo @@ -569,7 +550,7 @@ ilo: type: gauge - name: cpqHeTemperatureCondition oid: 1.3.6.1.4.1.232.6.2.6.8.1.6 - type: gauge + type: EnumAsStateSet help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6 indexes: - labelname: cpqHeTemperatureChassis @@ -581,25 +562,6 @@ ilo: 2: ok 3: degraded 4: failed - - name: cpqHeFltTolFanChassis - oid: 1.3.6.1.4.1.232.6.2.6.7.1.1 - type: gauge - help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.7.1.1 - indexes: - - labelname: cpqHeFltTolFanChassis - type: gauge - - labelname: cpqHeFltTolFanIndex - type: gauge - - name: cpqHeFltTolFanIndex - oid: 1.3.6.1.4.1.232.6.2.6.7.1.2 - type: gauge - help: A number that uniquely specifies this fan description. - - 1.3.6.1.4.1.232.6.2.6.7.1.2 - indexes: - - labelname: cpqHeFltTolFanChassis - type: gauge - - labelname: cpqHeFltTolFanIndex - type: gauge - name: cpqHeFltTolFanLocale oid: 1.3.6.1.4.1.232.6.2.6.7.1.3 type: EnumAsInfo @@ -626,7 +588,7 @@ ilo: 13: bridgeCard - name: cpqHeFltTolFanCondition oid: 1.3.6.1.4.1.232.6.2.6.7.1.9 - type: gauge + type: EnumAsStateSet help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9 indexes: - labelname: cpqHeFltTolFanChassis @@ -638,27 +600,9 @@ ilo: 2: ok 3: degraded 4: failed - - name: cpqHeFltTolPowerSupplyChassis - oid: 1.3.6.1.4.1.232.6.2.9.3.1.1 - type: gauge - help: The system chassis number. - 1.3.6.1.4.1.232.6.2.9.3.1.1 - indexes: - - labelname: cpqHeFltTolPowerSupplyChassis - type: gauge - - labelname: cpqHeFltTolPowerSupplyBay - type: gauge - - name: cpqHeFltTolPowerSupplyBay - oid: 1.3.6.1.4.1.232.6.2.9.3.1.2 - type: gauge - help: The bay number to index within this chassis. - 1.3.6.1.4.1.232.6.2.9.3.1.2 - indexes: - - labelname: cpqHeFltTolPowerSupplyChassis - type: gauge - - labelname: cpqHeFltTolPowerSupplyBay - type: gauge - name: cpqHeFltTolPowerSupplyStatus oid: 1.3.6.1.4.1.232.6.2.9.3.1.5 - type: gauge + type: EnumAsStateSet help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5 indexes: - labelname: cpqHeFltTolPowerSupplyChassis @@ -685,7 +629,7 @@ ilo: 17: noPowerInput - name: cpqSm2CntlrInterfaceStatus oid: 1.3.6.1.4.1.232.9.2.2.17 - type: gauge + type: EnumAsStateSet help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17 enum_values: 1: other @@ -693,7 +637,7 @@ ilo: 3: notResponding - name: cpqSm2CntlriLOSecurityOverrideSwitchState oid: 1.3.6.1.4.1.232.9.2.2.27 - type: gauge + type: EnumAsStateSet help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27 enum_values: 1: notSupported @@ -701,7 +645,7 @@ ilo: 3: notSet - name: cpqSm2CntlrLicenseActive oid: 1.3.6.1.4.1.232.9.2.2.30 - type: gauge + type: EnumAsStateSet help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30 enum_values: 1: none @@ -714,7 +658,7 @@ ilo: 8: iloAdvancedPremiumSecurity - name: cpqSm2CntlrServerPowerState oid: 1.3.6.1.4.1.232.9.2.2.32 - type: gauge + type: EnumAsStateSet help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32 enum_values: 1: unknown From 70c8e0ebe0eda755841a9bee14be8401deb66ce6 Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Sat, 1 Jan 2022 16:45:10 +0100 Subject: [PATCH 12/15] prometheus: monitor iLO resilient mem and battery --- roles/prometheus/templates/ilo.rules.yml.j2 | 20 +++++++++++ roles/prometheus/templates/snmp.yml.j2 | 39 ++++++++++++++++++--- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/roles/prometheus/templates/ilo.rules.yml.j2 b/roles/prometheus/templates/ilo.rules.yml.j2 index 47a3e8f..8a3f72b 100644 --- a/roles/prometheus/templates/ilo.rules.yml.j2 +++ b/roles/prometheus/templates/ilo.rules.yml.j2 @@ -10,6 +10,16 @@ groups: - name: ilo.rules rules: + - alert: IloResilientMemoryDegraded + expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1 + for: 3m + labels: + severity: warning + annotations: + summary: >- + La mémoire vive n'est plus résilente + ({{ raw('$labels.cpqHeResilientMemCondition') }}) + - alert: IloBiosSelfTestDegraded expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1 for: 3m @@ -20,6 +30,16 @@ groups: Une erreur a été détectée lors du POST du serveur ({{ raw('$labels.cpqHeHWBiosCondition') }}) + - alert: IloBatteryDegraded + expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1 + for: 3m + labels: + severity: warning + annotations: + summary: >- + La batterie est dégradée + ({{ raw('$labels.cpqHeSysBatteryCondition') }}) + - alert: IloTemperatureSensorDegraded expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1 for: 3m diff --git a/roles/prometheus/templates/snmp.yml.j2 b/roles/prometheus/templates/snmp.yml.j2 index 1171d4c..4ecd4b3 100644 --- a/roles/prometheus/templates/snmp.yml.j2 +++ b/roles/prometheus/templates/snmp.yml.j2 @@ -484,13 +484,28 @@ apc_pdu: ilo: walk: - - 1.3.6.1.4.1.232.6.2.15 # Power meter - - 1.3.6.1.4.1.232.6.2.16 # POST tests - - 1.3.6.1.4.1.232.6.2.6.8.1 # Temperature sensors - - 1.3.6.1.4.1.232.6.2.6.7.1 # Fans - - 1.3.6.1.4.1.232.6.2.9.3.1 # Power supply + - 1.3.6.1.4.1.232.6.2.14.4 # Resilient memory + - 1.3.6.1.4.1.232.6.2.15.3 # Power meter + - 1.3.6.1.4.1.232.6.2.16.1 # POST tests + - 1.3.6.1.4.1.232.6.2.17.1 # Battery + - 1.3.6.1.4.1.232.6.2.6.8.1.3 # Temperature sensors location + - 1.3.6.1.4.1.232.6.2.6.8.1.4 # Temperature sensors value + - 1.3.6.1.4.1.232.6.2.6.8.1.5 # Temperature sensors limit + - 1.3.6.1.4.1.232.6.2.6.8.1.6 # Temperature sensors condition + - 1.3.6.1.4.1.232.6.2.6.7.1.3 # Fans location + - 1.3.6.1.4.1.232.6.2.6.7.1.9 # Fans condition + - 1.3.6.1.4.1.232.6.2.9.3.1.5 # Power supply - 1.3.6.1.4.1.232.9.2.2 # iLO metrics: + - name: cpqHeResilientMemCondition + oid: 1.3.6.1.4.1.232.6.2.14.4 + type: EnumAsStateSet + help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4 + enum_values: + 1: other + 2: ok + 3: degraded + 4: failed - name: cpqHePowerMeterCurrReading oid: 1.3.6.1.4.1.232.6.2.15.3 type: gauge @@ -505,6 +520,20 @@ ilo: 2: ok 3: degraded 4: failed + - name: cpqHeSysBatteryCondition + oid: 1.3.6.1.4.1.232.6.2.17.1 + type: EnumAsStateSet + help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1 + indexes: + - labelname: cpqHeSysBatteryChassis + type: gauge + - labelname: cpqHeSysBatteryIndex + type: gauge + enum_values: + 1: other + 2: ok + 3: degraded + 4: failed - name: cpqHeTemperatureLocale oid: 1.3.6.1.4.1.232.6.2.6.8.1.3 type: EnumAsInfo From a24b473566c31c750882f5037211347ee058a2bd Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Sat, 1 Jan 2022 16:45:32 +0100 Subject: [PATCH 13/15] prometheus: reduce iLO SNMP timeout --- roles/prometheus/templates/prometheus.yml.j2 | 1 - roles/prometheus/templates/snmp.yml.j2 | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index 6c1fbda..bcaa598 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -163,7 +163,6 @@ scrape_configs: static_configs: - targets: {{ prometheus_ilo_snmp_targets | to_json }} metrics_path: /snmp - scrape_timeout: 30s params: module: - ilo diff --git a/roles/prometheus/templates/snmp.yml.j2 b/roles/prometheus/templates/snmp.yml.j2 index 4ecd4b3..3b9407a 100644 --- a/roles/prometheus/templates/snmp.yml.j2 +++ b/roles/prometheus/templates/snmp.yml.j2 @@ -695,6 +695,8 @@ ilo: 3: poweredOn 4: insufficientPowerOrPowerOnDenied version: 3 + # Reduce timeout to retry faster + timeout: 1s auth: security_level: authPriv username: {{ snmp_ilo_user }} From 9e4b8c250985c5de74bf299f7a818fc2d8d715fa Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Sat, 1 Jan 2022 17:15:11 +0100 Subject: [PATCH 14/15] prometheus: remove ipmi target --- playbooks/prometheus.yml | 12 ------------ roles/prometheus/templates/prometheus.yml.j2 | 18 ------------------ .../templates/prometheus.yml.j2 | 1 - 3 files changed, 31 deletions(-) diff --git a/playbooks/prometheus.yml b/playbooks/prometheus.yml index a82e41d..6f16471 100755 --- a/playbooks/prometheus.yml +++ b/playbooks/prometheus.yml @@ -15,8 +15,6 @@ {{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }} prometheus_unifi_snmp_targets: | {{ groups['fleming_unifi'] | list | sort }} - prometheus_ipmi_targets: | - {{ groups['fleming_pve'] + groups['fleming_server'] | list | sort }} prometheus_ilo_snmp_targets: | {{ groups['fleming_ilo'] | list | sort }} @@ -45,8 +43,6 @@ prometheus_ups_snmp_targets: - ups-pn-1.ups.auro.re - ups-ps-1.ups.auro.re - prometheus_ipmi_targets: | - {{ groups['pacaterie_pve'] | list | sort }} prometheus_ilo_snmp_targets: | {{ groups['pacaterie_ilo'] | list | sort }} @@ -76,8 +72,6 @@ {{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }} prometheus_unifi_snmp_targets: | {{ groups['edc_unifi'] | list | sort }} - prometheus_ipmi_targets: | - {{ groups['edc_pve'] + groups['edc_server'] | list | sort }} prometheus_ilo_snmp_targets: | {{ groups['edc_ilo'] | list | sort }} @@ -107,8 +101,6 @@ - ups-gk-1.ups.auro.re prometheus_apc_pdu_snmp_targets: - pdu-ga-1.ups.auro.re - prometheus_ipmi_targets: | - {{ groups['gs_pve'] | list | sort }} prometheus_ilo_snmp_targets: | {{ groups['gs_ilo'] | list | sort }} @@ -137,8 +129,6 @@ {{ groups['rives_pve'] + groups['rives_vm'] | list | sort }} prometheus_unifi_snmp_targets: | {{ groups['rives_unifi'] | list | sort }} - prometheus_ipmi_targets: | - {{ groups['rives_pve'] | list | sort }} prometheus_ilo_snmp_targets: | {{ groups['rives_ilo'] | list | sort }} @@ -179,8 +169,6 @@ - sw-ec-core.switch.auro.re - sw-gk-core.switch.auro.re - sw-r3-core.switch.auro.re - prometheus_ipmi_targets: | - {{ groups['aurore_pve'] | list | sort }} prometheus_ilo_snmp_targets: | {{ groups['aurore_ilo'] | list | sort }} diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index bcaa598..9fd0531 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -140,24 +140,6 @@ scrape_configs: replacement: 127.0.0.1:9116 {% endif %} -{% if prometheus_ipmi_targets is defined %} - - job_name: ipmi - static_configs: - - targets: {{ prometheus_ipmi_targets | to_json }} - metrics_path: /metrics - params: - module: [default] - relabel_configs: - # Do not put :9290 in instance name, rather here - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - source_labels: [__param_target] - target_label: __address__ - replacement: '$1:9290' -{% endif %} - {% if prometheus_ilo_snmp_targets is defined %} - job_name: ilo_snmp static_configs: diff --git a/roles/prometheus_federate/templates/prometheus.yml.j2 b/roles/prometheus_federate/templates/prometheus.yml.j2 index 8feabaa..e1271df 100644 --- a/roles/prometheus_federate/templates/prometheus.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus.yml.j2 @@ -38,6 +38,5 @@ scrape_configs: - '{job="apc_pdu_snmp"}' - '{job="docker"}' - '{job="switch_snmp"}' - - '{job="ipmi"}' - '{job="ilo_snmp"}' ... From a320907047b6a1b488963c60cfde6cfbe4dbf261 Mon Sep 17 00:00:00 2001 From: Alexandre Iooss Date: Sat, 1 Jan 2022 17:55:32 +0100 Subject: [PATCH 15/15] prometheus: fix typo --- roles/prometheus/templates/ilo.rules.yml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/prometheus/templates/ilo.rules.yml.j2 b/roles/prometheus/templates/ilo.rules.yml.j2 index 8a3f72b..d6bbe75 100644 --- a/roles/prometheus/templates/ilo.rules.yml.j2 +++ b/roles/prometheus/templates/ilo.rules.yml.j2 @@ -17,7 +17,7 @@ groups: severity: warning annotations: summary: >- - La mémoire vive n'est plus résilente + La mémoire vive n'est plus résiliente ({{ raw('$labels.cpqHeResilientMemCondition') }}) - alert: IloBiosSelfTestDegraded