diff --git a/hosts b/hosts index 29bc3b3..5876707 100644 --- a/hosts +++ b/hosts @@ -45,6 +45,9 @@ grafana.adm.auro.re [aurore_testing_vm] +[aurore_ilo] +escalope-ilo.adm.auro.re + ############################################################################### # OVH @@ -58,7 +61,6 @@ proxy-ovh.adm.auro.re [ovh_vm] serge.adm.auro.re -passbolt.adm.auro.re docker-ovh.adm.auro.re switchs-manager.adm.auro.re ldap-replica-ovh.adm.auro.re @@ -211,6 +213,10 @@ unifi-pacaterie.adm.auro.re routeur-pacaterie.adm.auro.re routeur-pacaterie-backup.adm.auro.re +[pacaterie_ilo] +mordred-ilo.adm.auro.re +titan-ilo.adm.auro.re + [pacaterie_unifi] pc-1-1.borne.auro.re pn-0-1.borne.auro.re @@ -266,6 +272,10 @@ radius-edc-backup.adm.auro.re ldap-replica-edc.adm.auro.re prometheus-edc.adm.auro.re +[edc_ilo] +caradoc-ilo.adm.auro.re +chapalux-ilo.adm.auro.re + [edc_unifi] ee-2-1.borne.auro.re ee-2-2.borne.auro.re @@ -297,6 +307,10 @@ radius-gs-backup.adm.auro.re prometheus-gs.adm.auro.re ldap-replica-gs.adm.auro.re +[gs_ilo] +lancelot-ilo.adm.auro.re +odin-ilo.adm.auro.re + [gs_unifi] ga-0-1.borne.auro.re ga-1-1.borne.auro.re @@ -366,6 +380,9 @@ dns-rives.adm.auro.re radius-rives.adm.auro.re routeur-rives.adm.auro.re +[rives_ilo] +loki-ilo.adm.auro.re + [rives_unifi] r1-1-1.borne.auro.re r1-1-2.borne.auro.re diff --git a/playbooks/prometheus.yml b/playbooks/prometheus.yml index 8a27497..6f16471 100755 --- a/playbooks/prometheus.yml +++ b/playbooks/prometheus.yml @@ -15,9 +15,7 @@ {{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }} prometheus_unifi_snmp_targets: | {{ groups['fleming_unifi'] | list | sort }} - prometheus_ipmi_targets: | - {{ groups['fleming_pve'] + groups['fleming_server'] | list | sort }} - prometheus_ilo_targets: | + prometheus_ilo_snmp_targets: | {{ groups['fleming_ilo'] | list | sort }} update_motd: @@ -34,6 +32,9 @@ snmp_switch_community: "{{ vault_snmp_switch_community }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_servers_targets: | {{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }} @@ -42,8 +43,8 @@ prometheus_ups_snmp_targets: - ups-pn-1.ups.auro.re - ups-ps-1.ups.auro.re - prometheus_ipmi_targets: | - {{ groups['pacaterie_pve'] | list | sort }} + prometheus_ilo_snmp_targets: | + {{ groups['pacaterie_ilo'] | list | sort }} update_motd: prometheus: >- @@ -59,17 +60,20 @@ snmp_switch_community: "{{ vault_snmp_switch_community }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_ups_snmp_targets: - ups-ec-1.ups.auro.re - - ups-ec-2.ups.auro.re + # - ups-ec-2.ups.auro.re - ups-ec-3.ups.auro.re prometheus_servers_targets: | {{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }} prometheus_unifi_snmp_targets: | {{ groups['edc_unifi'] | list | sort }} - prometheus_ipmi_targets: | - {{ groups['edc_pve'] + groups['edc_server'] | list | sort }} + prometheus_ilo_snmp_targets: | + {{ groups['edc_ilo'] | list | sort }} update_motd: prometheus: >- @@ -85,6 +89,9 @@ snmp_switch_community: "{{ vault_snmp_switch_community }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_servers_targets: | {{ groups['gs_pve'] + groups['gs_vm'] | list | sort }} @@ -94,8 +101,8 @@ - ups-gk-1.ups.auro.re prometheus_apc_pdu_snmp_targets: - pdu-ga-1.ups.auro.re - prometheus_ipmi_targets: | - {{ groups['gs_pve'] | list | sort }} + prometheus_ilo_snmp_targets: | + {{ groups['gs_ilo'] | list | sort }} update_motd: prometheus: >- @@ -111,6 +118,9 @@ snmp_switch_community: "{{ vault_snmp_switch_community }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_ups_snmp_targets: - ups-r3-1.ups.auro.re @@ -119,8 +129,8 @@ {{ groups['rives_pve'] + groups['rives_vm'] | list | sort }} prometheus_unifi_snmp_targets: | {{ groups['rives_unifi'] | list | sort }} - prometheus_ipmi_targets: | - {{ groups['rives_pve'] | list | sort }} + prometheus_ilo_snmp_targets: | + {{ groups['rives_ilo'] | list | sort }} update_motd: prometheus: >- @@ -136,6 +146,9 @@ snmp_switch_community: "{{ vault_snmp_switch_community }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_servers_targets: | {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }} @@ -156,8 +169,8 @@ - sw-ec-core.switch.auro.re - sw-gk-core.switch.auro.re - sw-r3-core.switch.auro.re - prometheus_ipmi_targets: | - {{ groups['aurore_pve'] | list | sort }} + prometheus_ilo_snmp_targets: | + {{ groups['aurore_ilo'] | list | sort }} update_motd: prometheus: >- @@ -173,6 +186,9 @@ snmp_switch_community: "{{ vault_snmp_switch_community }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_servers_targets: | {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} @@ -194,6 +210,9 @@ snmp_unifi_password: "{{ vault_snmp_unifi_password }}" snmp_pdu_user: "{{ vault_snmp_pdu_user }}" snmp_pdu_password: "{{ vault_snmp_pdu_password }}" + snmp_ilo_user: aurore + snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" + snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" prometheus_servers_targets: - prometheus-edc.adm.auro.re diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 98d0c52..783bdad 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -38,10 +38,10 @@ loop: - server.rules.yml - docker.rules.yml - - django.rules.yml - ups.rules.yml - postgres.rules.yml - environmental.rules.yml + - ilo.rules.yml notify: Restart Prometheus - name: Make Prometheus snmp-exporter listen on localhost only @@ -63,20 +63,6 @@ - snmp.yml notify: Restart prometheus-snmp-exporter -- name: Remove old targets - file: - path: "{{ item }}" - state: absent - loop: - - /etc/prometheus/targets.json - - /etc/prometheus/targets_unifi_snmp.json - - /etc/prometheus/targets_switch_snmp.json - - /etc/prometheus/targets_ups_snmp.json - - /etc/prometheus/targets_docker.json - - /etc/prometheus/targets_postgres.json - - /etc/prometheus/targets_apc_pdu_snmp.json - - /etc/prometheus/targets_ipmi.json - - name: Activate prometheus service systemd: name: prometheus diff --git a/roles/prometheus/templates/django.rules.yml.j2 b/roles/prometheus/templates/django.rules.yml.j2 deleted file mode 100644 index fddd398..0000000 --- a/roles/prometheus/templates/django.rules.yml.j2 +++ /dev/null @@ -1,106 +0,0 @@ -# {{ ansible_managed }} -{# As this is also Jinja2 it will conflict without a raw block #} -{% raw %} -groups: -- name: django.rules - rules: - - record: job:django_http_requests_before_middlewares_total:sum_rate30s - expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job) - - record: job:django_http_requests_unknown_latency_total:sum_rate30s - expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job) - - record: job:django_http_ajax_requests_total:sum_rate30s - expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job) - - record: job:django_http_responses_before_middlewares_total:sum_rate30s - expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job) - - record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s - expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s])) - BY (job) - - record: job:django_http_requests_body_total_bytes:sum_rate30s - expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job) - - record: job:django_http_responses_streaming_total:sum_rate30s - expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job) - - record: job:django_http_responses_body_total_bytes:sum_rate30s - expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job) - - record: job:django_http_requests_total:sum_rate30s - expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job) - - record: job:django_http_requests_total_by_method:sum_rate30s - expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method) - - record: job:django_http_requests_total_by_transport:sum_rate30s - expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport) - - record: job:django_http_requests_total_by_view:sum_rate30s - expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, - view) - - record: job:django_http_requests_total_by_view_transport_method:sum_rate30s - expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, - view, transport, method) - - record: job:django_http_responses_total_by_templatename:sum_rate30s - expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename) - - record: job:django_http_responses_total_by_status:sum_rate30s - expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status) - - record: job:django_http_responses_total_by_charset:sum_rate30s - expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset) - - record: job:django_http_exceptions_total_by_type:sum_rate30s - expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type) - - record: job:django_http_exceptions_total_by_view:sum_rate30s - expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view) - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "50" - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "95" - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99" - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99.9" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "50" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "95" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99.9" - - record: job:django_model_inserts_total:sum_rate1m - expr: sum(rate(django_model_inserts_total[1m])) BY (job, model) - - record: job:django_model_updates_total:sum_rate1m - expr: sum(rate(django_model_updates_total[1m])) BY (job, model) - - record: job:django_model_deletes_total:sum_rate1m - expr: sum(rate(django_model_deletes_total[1m])) BY (job, model) - - record: job:django_db_new_connections_total:sum_rate30s - expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor) - - record: job:django_db_new_connection_errors_total:sum_rate30s - expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor) - - record: job:django_db_execute_total:sum_rate30s - expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor) - - record: job:django_db_execute_many_total:sum_rate30s - expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor) - - record: job:django_db_errors_total:sum_rate30s - expr: sum(rate(django_db_errors_total[30s])) BY (alias, vendor, type) - - record: job:django_migrations_applied_total:max - expr: max(django_migrations_applied_total) BY (job, connection) - - record: job:django_migrations_unapplied_total:max - expr: max(django_migrations_unapplied_total) BY (job, connection) -{% endraw %} diff --git a/roles/prometheus/templates/ilo.rules.yml.j2 b/roles/prometheus/templates/ilo.rules.yml.j2 new file mode 100644 index 0000000..d6bbe75 --- /dev/null +++ b/roles/prometheus/templates/ilo.rules.yml.j2 @@ -0,0 +1,83 @@ +--- +{{ ansible_managed | comment }} + +{% macro raw(string) -%} +{{ "{{" }} {{ string }} {{ "}}" }} +{%- endmacro %} + +groups: + + - name: ilo.rules + rules: + + - alert: IloResilientMemoryDegraded + expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1 + for: 3m + labels: + severity: warning + annotations: + summary: >- + La mémoire vive n'est plus résiliente + ({{ raw('$labels.cpqHeResilientMemCondition') }}) + + - alert: IloBiosSelfTestDegraded + expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1 + for: 3m + labels: + severity: critical + annotations: + summary: >- + Une erreur a été détectée lors du POST du serveur + ({{ raw('$labels.cpqHeHWBiosCondition') }}) + + - alert: IloBatteryDegraded + expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1 + for: 3m + labels: + severity: warning + annotations: + summary: >- + La batterie est dégradée + ({{ raw('$labels.cpqHeSysBatteryCondition') }}) + + - alert: IloTemperatureSensorDegraded + expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1 + for: 3m + labels: + severity: critical + annotations: + summary: >- + Le capteur de température est dégradé + ({{ raw('$labels.cpqHeTemperatureCondition') }}) + + - alert: IloFanDegraded + expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1 + for: 3m + labels: + severity: critical + annotations: + summary: >- + Le ventilateur est dégradé + ({{ raw('$labels.cpqHeFltTolFanCondition') }}) + + - alert: IloPowerSupplyDegraded + expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1 + for: 3m + labels: + severity: critical + annotations: + summary: >- + L'alimentation est dégradée + ({{ raw('$labels.cpqHeFltTolPowerSupplyStatus') }}) + + - alert: IloOverrideSwitchState + expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1 + for: 3m + labels: + severity: critical + annotations: + summary: >- + Le switch de réinitialisation n'est pas à l'état d'origine, + l'authentification est bypassée + +... diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index e80e355..9fd0531 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -140,28 +140,10 @@ scrape_configs: replacement: 127.0.0.1:9116 {% endif %} -{% if prometheus_ipmi_targets is defined %} - - job_name: ipmi +{% if prometheus_ilo_snmp_targets is defined %} + - job_name: ilo_snmp static_configs: - - targets: {{ prometheus_ipmi_targets | to_json }} - metrics_path: /metrics - params: - module: [default] - relabel_configs: - # Do not put :9290 in instance name, rather here - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - source_labels: [__param_target] - target_label: __address__ - replacement: '$1:9290' -{% endif %} - -{% if prometheus_ilo_targets is defined %} - - job_name: ilo - static_configs: - - targets: {{ prometheus_ilo_targets | to_json }} + - targets: {{ prometheus_ilo_snmp_targets | to_json }} metrics_path: /snmp params: module: diff --git a/roles/prometheus/templates/snmp.yml.j2 b/roles/prometheus/templates/snmp.yml.j2 index d593a3d..3b9407a 100644 --- a/roles/prometheus/templates/snmp.yml.j2 +++ b/roles/prometheus/templates/snmp.yml.j2 @@ -484,20 +484,35 @@ apc_pdu: ilo: walk: - - 1.3.6.1.4.1.232.6.2.15 # Power meter - - 1.3.6.1.4.1.232.6.2.16 # POST tests - - 1.3.6.1.4.1.232.6.2.6.8.1 # Temperature sensors - - 1.3.6.1.4.1.232.6.2.6.7.1 # Fans - - 1.3.6.1.4.1.232.6.2.9.3.1 # Power supply + - 1.3.6.1.4.1.232.6.2.14.4 # Resilient memory + - 1.3.6.1.4.1.232.6.2.15.3 # Power meter + - 1.3.6.1.4.1.232.6.2.16.1 # POST tests + - 1.3.6.1.4.1.232.6.2.17.1 # Battery + - 1.3.6.1.4.1.232.6.2.6.8.1.3 # Temperature sensors location + - 1.3.6.1.4.1.232.6.2.6.8.1.4 # Temperature sensors value + - 1.3.6.1.4.1.232.6.2.6.8.1.5 # Temperature sensors limit + - 1.3.6.1.4.1.232.6.2.6.8.1.6 # Temperature sensors condition + - 1.3.6.1.4.1.232.6.2.6.7.1.3 # Fans location + - 1.3.6.1.4.1.232.6.2.6.7.1.9 # Fans condition + - 1.3.6.1.4.1.232.6.2.9.3.1.5 # Power supply - 1.3.6.1.4.1.232.9.2.2 # iLO metrics: + - name: cpqHeResilientMemCondition + oid: 1.3.6.1.4.1.232.6.2.14.4 + type: EnumAsStateSet + help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4 + enum_values: + 1: other + 2: ok + 3: degraded + 4: failed - name: cpqHePowerMeterCurrReading oid: 1.3.6.1.4.1.232.6.2.15.3 type: gauge help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3 - name: cpqHeHWBiosCondition oid: 1.3.6.1.4.1.232.6.2.16.1 - type: gauge + type: EnumAsStateSet help: This value indicates an error has been detected during Pre-OS Test (POST) or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1 enum_values: @@ -505,25 +520,20 @@ ilo: 2: ok 3: degraded 4: failed - - name: cpqHeTemperatureChassis - oid: 1.3.6.1.4.1.232.6.2.6.8.1.1 - type: gauge - help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.8.1.1 - indexes: - - labelname: cpqHeTemperatureChassis - type: gauge - - labelname: cpqHeTemperatureIndex - type: gauge - - name: cpqHeTemperatureIndex - oid: 1.3.6.1.4.1.232.6.2.6.8.1.2 - type: gauge - help: A number that uniquely specifies this temperature sensor description. - - 1.3.6.1.4.1.232.6.2.6.8.1.2 + - name: cpqHeSysBatteryCondition + oid: 1.3.6.1.4.1.232.6.2.17.1 + type: EnumAsStateSet + help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1 indexes: - - labelname: cpqHeTemperatureChassis + - labelname: cpqHeSysBatteryChassis type: gauge - - labelname: cpqHeTemperatureIndex + - labelname: cpqHeSysBatteryIndex type: gauge + enum_values: + 1: other + 2: ok + 3: degraded + 4: failed - name: cpqHeTemperatureLocale oid: 1.3.6.1.4.1.232.6.2.6.8.1.3 type: EnumAsInfo @@ -569,7 +579,7 @@ ilo: type: gauge - name: cpqHeTemperatureCondition oid: 1.3.6.1.4.1.232.6.2.6.8.1.6 - type: gauge + type: EnumAsStateSet help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6 indexes: - labelname: cpqHeTemperatureChassis @@ -581,25 +591,6 @@ ilo: 2: ok 3: degraded 4: failed - - name: cpqHeFltTolFanChassis - oid: 1.3.6.1.4.1.232.6.2.6.7.1.1 - type: gauge - help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.7.1.1 - indexes: - - labelname: cpqHeFltTolFanChassis - type: gauge - - labelname: cpqHeFltTolFanIndex - type: gauge - - name: cpqHeFltTolFanIndex - oid: 1.3.6.1.4.1.232.6.2.6.7.1.2 - type: gauge - help: A number that uniquely specifies this fan description. - - 1.3.6.1.4.1.232.6.2.6.7.1.2 - indexes: - - labelname: cpqHeFltTolFanChassis - type: gauge - - labelname: cpqHeFltTolFanIndex - type: gauge - name: cpqHeFltTolFanLocale oid: 1.3.6.1.4.1.232.6.2.6.7.1.3 type: EnumAsInfo @@ -626,7 +617,7 @@ ilo: 13: bridgeCard - name: cpqHeFltTolFanCondition oid: 1.3.6.1.4.1.232.6.2.6.7.1.9 - type: gauge + type: EnumAsStateSet help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9 indexes: - labelname: cpqHeFltTolFanChassis @@ -638,27 +629,9 @@ ilo: 2: ok 3: degraded 4: failed - - name: cpqHeFltTolPowerSupplyChassis - oid: 1.3.6.1.4.1.232.6.2.9.3.1.1 - type: gauge - help: The system chassis number. - 1.3.6.1.4.1.232.6.2.9.3.1.1 - indexes: - - labelname: cpqHeFltTolPowerSupplyChassis - type: gauge - - labelname: cpqHeFltTolPowerSupplyBay - type: gauge - - name: cpqHeFltTolPowerSupplyBay - oid: 1.3.6.1.4.1.232.6.2.9.3.1.2 - type: gauge - help: The bay number to index within this chassis. - 1.3.6.1.4.1.232.6.2.9.3.1.2 - indexes: - - labelname: cpqHeFltTolPowerSupplyChassis - type: gauge - - labelname: cpqHeFltTolPowerSupplyBay - type: gauge - name: cpqHeFltTolPowerSupplyStatus oid: 1.3.6.1.4.1.232.6.2.9.3.1.5 - type: gauge + type: EnumAsStateSet help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5 indexes: - labelname: cpqHeFltTolPowerSupplyChassis @@ -685,7 +658,7 @@ ilo: 17: noPowerInput - name: cpqSm2CntlrInterfaceStatus oid: 1.3.6.1.4.1.232.9.2.2.17 - type: gauge + type: EnumAsStateSet help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17 enum_values: 1: other @@ -693,7 +666,7 @@ ilo: 3: notResponding - name: cpqSm2CntlriLOSecurityOverrideSwitchState oid: 1.3.6.1.4.1.232.9.2.2.27 - type: gauge + type: EnumAsStateSet help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27 enum_values: 1: notSupported @@ -701,7 +674,7 @@ ilo: 3: notSet - name: cpqSm2CntlrLicenseActive oid: 1.3.6.1.4.1.232.9.2.2.30 - type: gauge + type: EnumAsStateSet help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30 enum_values: 1: none @@ -714,7 +687,7 @@ ilo: 8: iloAdvancedPremiumSecurity - name: cpqSm2CntlrServerPowerState oid: 1.3.6.1.4.1.232.9.2.2.32 - type: gauge + type: EnumAsStateSet help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32 enum_values: 1: unknown @@ -722,6 +695,8 @@ ilo: 3: poweredOn 4: insufficientPowerOrPowerOnDenied version: 3 + # Reduce timeout to retry faster + timeout: 1s auth: security_level: authPriv username: {{ snmp_ilo_user }} diff --git a/roles/prometheus_federate/templates/prometheus.yml.j2 b/roles/prometheus_federate/templates/prometheus.yml.j2 index fc848db..e1271df 100644 --- a/roles/prometheus_federate/templates/prometheus.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus.yml.j2 @@ -34,10 +34,9 @@ scrape_configs: - '{job="postgresql"}' - '{job="prometheus"}' - '{job="unifi_snmp"}' - - '{job="django"}' - '{job="ups_snmp"}' - '{job="apc_pdu_snmp"}' - '{job="docker"}' - '{job="switch_snmp"}' - - '{job="ipmi"}' + - '{job="ilo_snmp"}' ...