Prometheus: cleanup #90
8 changed files with 180 additions and 225 deletions
19
hosts
19
hosts
|
@ -45,6 +45,9 @@ grafana.adm.auro.re
|
|||
|
||||
[aurore_testing_vm]
|
||||
|
||||
[aurore_ilo]
|
||||
escalope-ilo.adm.auro.re
|
||||
|
||||
###############################################################################
|
||||
# OVH
|
||||
|
||||
|
@ -58,7 +61,6 @@ proxy-ovh.adm.auro.re
|
|||
|
||||
[ovh_vm]
|
||||
serge.adm.auro.re
|
||||
passbolt.adm.auro.re
|
||||
docker-ovh.adm.auro.re
|
||||
switchs-manager.adm.auro.re
|
||||
ldap-replica-ovh.adm.auro.re
|
||||
|
@ -211,6 +213,10 @@ unifi-pacaterie.adm.auro.re
|
|||
routeur-pacaterie.adm.auro.re
|
||||
routeur-pacaterie-backup.adm.auro.re
|
||||
|
||||
[pacaterie_ilo]
|
||||
mordred-ilo.adm.auro.re
|
||||
titan-ilo.adm.auro.re
|
||||
|
||||
[pacaterie_unifi]
|
||||
pc-1-1.borne.auro.re
|
||||
pn-0-1.borne.auro.re
|
||||
|
@ -266,6 +272,10 @@ radius-edc-backup.adm.auro.re
|
|||
ldap-replica-edc.adm.auro.re
|
||||
prometheus-edc.adm.auro.re
|
||||
|
||||
[edc_ilo]
|
||||
caradoc-ilo.adm.auro.re
|
||||
chapalux-ilo.adm.auro.re
|
||||
|
||||
[edc_unifi]
|
||||
ee-2-1.borne.auro.re
|
||||
ee-2-2.borne.auro.re
|
||||
|
@ -297,6 +307,10 @@ radius-gs-backup.adm.auro.re
|
|||
prometheus-gs.adm.auro.re
|
||||
ldap-replica-gs.adm.auro.re
|
||||
|
||||
[gs_ilo]
|
||||
lancelot-ilo.adm.auro.re
|
||||
odin-ilo.adm.auro.re
|
||||
|
||||
[gs_unifi]
|
||||
ga-0-1.borne.auro.re
|
||||
ga-1-1.borne.auro.re
|
||||
|
@ -366,6 +380,9 @@ dns-rives.adm.auro.re
|
|||
radius-rives.adm.auro.re
|
||||
routeur-rives.adm.auro.re
|
||||
|
||||
[rives_ilo]
|
||||
loki-ilo.adm.auro.re
|
||||
|
||||
[rives_unifi]
|
||||
r1-1-1.borne.auro.re
|
||||
r1-1-2.borne.auro.re
|
||||
|
|
|
@ -15,9 +15,7 @@
|
|||
{{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }}
|
||||
prometheus_unifi_snmp_targets: |
|
||||
{{ groups['fleming_unifi'] | list | sort }}
|
||||
prometheus_ipmi_targets: |
|
||||
{{ groups['fleming_pve'] + groups['fleming_server'] | list | sort }}
|
||||
prometheus_ilo_targets: |
|
||||
prometheus_ilo_snmp_targets: |
|
||||
{{ groups['fleming_ilo'] | list | sort }}
|
||||
|
||||
update_motd:
|
||||
|
@ -34,6 +32,9 @@
|
|||
snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
||||
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
||||
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
||||
snmp_ilo_user: aurore
|
||||
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
||||
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
||||
|
||||
prometheus_servers_targets: |
|
||||
{{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }}
|
||||
|
@ -42,8 +43,8 @@
|
|||
prometheus_ups_snmp_targets:
|
||||
- ups-pn-1.ups.auro.re
|
||||
- ups-ps-1.ups.auro.re
|
||||
prometheus_ipmi_targets: |
|
||||
{{ groups['pacaterie_pve'] | list | sort }}
|
||||
prometheus_ilo_snmp_targets: |
|
||||
{{ groups['pacaterie_ilo'] | list | sort }}
|
||||
|
||||
update_motd:
|
||||
prometheus: >-
|
||||
|
@ -59,17 +60,20 @@
|
|||
snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
||||
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
||||
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
||||
snmp_ilo_user: aurore
|
||||
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
||||
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
||||
|
||||
prometheus_ups_snmp_targets:
|
||||
- ups-ec-1.ups.auro.re
|
||||
- ups-ec-2.ups.auro.re
|
||||
# - ups-ec-2.ups.auro.re
|
||||
- ups-ec-3.ups.auro.re
|
||||
prometheus_servers_targets: |
|
||||
{{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }}
|
||||
prometheus_unifi_snmp_targets: |
|
||||
{{ groups['edc_unifi'] | list | sort }}
|
||||
prometheus_ipmi_targets: |
|
||||
{{ groups['edc_pve'] + groups['edc_server'] | list | sort }}
|
||||
prometheus_ilo_snmp_targets: |
|
||||
{{ groups['edc_ilo'] | list | sort }}
|
||||
|
||||
update_motd:
|
||||
prometheus: >-
|
||||
|
@ -85,6 +89,9 @@
|
|||
snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
||||
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
||||
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
||||
snmp_ilo_user: aurore
|
||||
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
||||
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
||||
|
||||
prometheus_servers_targets: |
|
||||
{{ groups['gs_pve'] + groups['gs_vm'] | list | sort }}
|
||||
|
@ -94,8 +101,8 @@
|
|||
- ups-gk-1.ups.auro.re
|
||||
prometheus_apc_pdu_snmp_targets:
|
||||
- pdu-ga-1.ups.auro.re
|
||||
prometheus_ipmi_targets: |
|
||||
{{ groups['gs_pve'] | list | sort }}
|
||||
prometheus_ilo_snmp_targets: |
|
||||
{{ groups['gs_ilo'] | list | sort }}
|
||||
|
||||
update_motd:
|
||||
prometheus: >-
|
||||
|
@ -111,6 +118,9 @@
|
|||
snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
||||
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
||||
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
||||
snmp_ilo_user: aurore
|
||||
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
||||
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
||||
|
||||
prometheus_ups_snmp_targets:
|
||||
- ups-r3-1.ups.auro.re
|
||||
|
@ -119,8 +129,8 @@
|
|||
{{ groups['rives_pve'] + groups['rives_vm'] | list | sort }}
|
||||
prometheus_unifi_snmp_targets: |
|
||||
{{ groups['rives_unifi'] | list | sort }}
|
||||
prometheus_ipmi_targets: |
|
||||
{{ groups['rives_pve'] | list | sort }}
|
||||
prometheus_ilo_snmp_targets: |
|
||||
{{ groups['rives_ilo'] | list | sort }}
|
||||
|
||||
update_motd:
|
||||
prometheus: >-
|
||||
|
@ -136,6 +146,9 @@
|
|||
snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
||||
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
||||
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
||||
snmp_ilo_user: aurore
|
||||
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
||||
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
||||
|
||||
prometheus_servers_targets: |
|
||||
{{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
|
||||
|
@ -156,8 +169,8 @@
|
|||
- sw-ec-core.switch.auro.re
|
||||
- sw-gk-core.switch.auro.re
|
||||
- sw-r3-core.switch.auro.re
|
||||
prometheus_ipmi_targets: |
|
||||
{{ groups['aurore_pve'] | list | sort }}
|
||||
prometheus_ilo_snmp_targets: |
|
||||
{{ groups['aurore_ilo'] | list | sort }}
|
||||
|
||||
update_motd:
|
||||
prometheus: >-
|
||||
|
@ -173,6 +186,9 @@
|
|||
snmp_switch_community: "{{ vault_snmp_switch_community }}"
|
||||
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
||||
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
||||
snmp_ilo_user: aurore
|
||||
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
||||
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
||||
|
||||
prometheus_servers_targets: |
|
||||
{{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
|
||||
|
@ -194,6 +210,9 @@
|
|||
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
||||
snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
||||
snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
||||
snmp_ilo_user: aurore
|
||||
snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
||||
snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
||||
|
||||
prometheus_servers_targets:
|
||||
- prometheus-edc.adm.auro.re
|
||||
|
|
|
@ -38,10 +38,10 @@
|
|||
loop:
|
||||
- server.rules.yml
|
||||
- docker.rules.yml
|
||||
- django.rules.yml
|
||||
- ups.rules.yml
|
||||
- postgres.rules.yml
|
||||
- environmental.rules.yml
|
||||
- ilo.rules.yml
|
||||
notify: Restart Prometheus
|
||||
|
||||
- name: Make Prometheus snmp-exporter listen on localhost only
|
||||
|
@ -63,20 +63,6 @@
|
|||
- snmp.yml
|
||||
notify: Restart prometheus-snmp-exporter
|
||||
|
||||
- name: Remove old targets
|
||||
file:
|
||||
path: "{{ item }}"
|
||||
state: absent
|
||||
loop:
|
||||
- /etc/prometheus/targets.json
|
||||
- /etc/prometheus/targets_unifi_snmp.json
|
||||
- /etc/prometheus/targets_switch_snmp.json
|
||||
- /etc/prometheus/targets_ups_snmp.json
|
||||
- /etc/prometheus/targets_docker.json
|
||||
- /etc/prometheus/targets_postgres.json
|
||||
- /etc/prometheus/targets_apc_pdu_snmp.json
|
||||
- /etc/prometheus/targets_ipmi.json
|
||||
|
||||
- name: Activate prometheus service
|
||||
systemd:
|
||||
name: prometheus
|
||||
|
|
|
@ -1,106 +0,0 @@
|
|||
# {{ ansible_managed }}
|
||||
{# As this is also Jinja2 it will conflict without a raw block #}
|
||||
{% raw %}
|
||||
groups:
|
||||
- name: django.rules
|
||||
rules:
|
||||
- record: job:django_http_requests_before_middlewares_total:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job)
|
||||
- record: job:django_http_requests_unknown_latency_total:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job)
|
||||
- record: job:django_http_ajax_requests_total:sum_rate30s
|
||||
expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job)
|
||||
- record: job:django_http_responses_before_middlewares_total:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job)
|
||||
- record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s]))
|
||||
BY (job)
|
||||
- record: job:django_http_requests_body_total_bytes:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job)
|
||||
- record: job:django_http_responses_streaming_total:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job)
|
||||
- record: job:django_http_responses_body_total_bytes:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job)
|
||||
- record: job:django_http_requests_total:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job)
|
||||
- record: job:django_http_requests_total_by_method:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method)
|
||||
- record: job:django_http_requests_total_by_transport:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport)
|
||||
- record: job:django_http_requests_total_by_view:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job,
|
||||
view)
|
||||
- record: job:django_http_requests_total_by_view_transport_method:sum_rate30s
|
||||
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job,
|
||||
view, transport, method)
|
||||
- record: job:django_http_responses_total_by_templatename:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename)
|
||||
- record: job:django_http_responses_total_by_status:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status)
|
||||
- record: job:django_http_responses_total_by_charset:sum_rate30s
|
||||
expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset)
|
||||
- record: job:django_http_exceptions_total_by_type:sum_rate30s
|
||||
expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type)
|
||||
- record: job:django_http_exceptions_total_by_view:sum_rate30s
|
||||
expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view)
|
||||
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
|
||||
BY (job, le))
|
||||
labels:
|
||||
quantile: "50"
|
||||
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
|
||||
BY (job, le))
|
||||
labels:
|
||||
quantile: "95"
|
||||
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
|
||||
BY (job, le))
|
||||
labels:
|
||||
quantile: "99"
|
||||
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
|
||||
BY (job, le))
|
||||
labels:
|
||||
quantile: "99.9"
|
||||
- record: job:django_http_requests_latency_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
|
||||
BY (job, le))
|
||||
labels:
|
||||
quantile: "50"
|
||||
- record: job:django_http_requests_latency_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
|
||||
BY (job, le))
|
||||
labels:
|
||||
quantile: "95"
|
||||
- record: job:django_http_requests_latency_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
|
||||
BY (job, le))
|
||||
labels:
|
||||
quantile: "99"
|
||||
- record: job:django_http_requests_latency_seconds:quantile_rate30s
|
||||
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
|
||||
BY (job, le))
|
||||
labels:
|
||||
quantile: "99.9"
|
||||
- record: job:django_model_inserts_total:sum_rate1m
|
||||
expr: sum(rate(django_model_inserts_total[1m])) BY (job, model)
|
||||
- record: job:django_model_updates_total:sum_rate1m
|
||||
expr: sum(rate(django_model_updates_total[1m])) BY (job, model)
|
||||
- record: job:django_model_deletes_total:sum_rate1m
|
||||
expr: sum(rate(django_model_deletes_total[1m])) BY (job, model)
|
||||
- record: job:django_db_new_connections_total:sum_rate30s
|
||||
expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor)
|
||||
- record: job:django_db_new_connection_errors_total:sum_rate30s
|
||||
expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor)
|
||||
- record: job:django_db_execute_total:sum_rate30s
|
||||
expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor)
|
||||
- record: job:django_db_execute_many_total:sum_rate30s
|
||||
expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor)
|
||||
- record: job:django_db_errors_total:sum_rate30s
|
||||
expr: sum(rate(django_db_errors_total[30s])) BY (alias, vendor, type)
|
||||
- record: job:django_migrations_applied_total:max
|
||||
expr: max(django_migrations_applied_total) BY (job, connection)
|
||||
- record: job:django_migrations_unapplied_total:max
|
||||
expr: max(django_migrations_unapplied_total) BY (job, connection)
|
||||
{% endraw %}
|
83
roles/prometheus/templates/ilo.rules.yml.j2
Normal file
83
roles/prometheus/templates/ilo.rules.yml.j2
Normal file
|
@ -0,0 +1,83 @@
|
|||
---
|
||||
{{ ansible_managed | comment }}
|
||||
|
||||
{% macro raw(string) -%}
|
||||
{{ "{{" }} {{ string }} {{ "}}" }}
|
||||
{%- endmacro %}
|
||||
|
||||
groups:
|
||||
|
||||
- name: ilo.rules
|
||||
rules:
|
||||
|
||||
- alert: IloResilientMemoryDegraded
|
||||
expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: >-
|
||||
La mémoire vive n'est plus résiliente
|
||||
erdnaxe marked this conversation as resolved
Outdated
|
||||
({{ raw('$labels.cpqHeResilientMemCondition') }})
|
||||
|
||||
- alert: IloBiosSelfTestDegraded
|
||||
expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
|
||||
jeltz marked this conversation as resolved
jeltz
commented
À quoi correspond `other` ? C'est jamais une erreur ?
erdnaxe
commented
other = le module est absent par exemple, on a BEAUCOUP de other
jeltz
commented
Ok
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: >-
|
||||
Une erreur a été détectée lors du POST du serveur
|
||||
({{ raw('$labels.cpqHeHWBiosCondition') }})
|
||||
|
||||
- alert: IloBatteryDegraded
|
||||
expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: >-
|
||||
La batterie est dégradée
|
||||
({{ raw('$labels.cpqHeSysBatteryCondition') }})
|
||||
|
||||
- alert: IloTemperatureSensorDegraded
|
||||
expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: >-
|
||||
Le capteur de température est dégradé
|
||||
({{ raw('$labels.cpqHeTemperatureCondition') }})
|
||||
|
||||
- alert: IloFanDegraded
|
||||
expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: >-
|
||||
Le ventilateur est dégradé
|
||||
({{ raw('$labels.cpqHeFltTolFanCondition') }})
|
||||
|
||||
- alert: IloPowerSupplyDegraded
|
||||
expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: >-
|
||||
L'alimentation est dégradée
|
||||
({{ raw('$labels.cpqHeFltTolPowerSupplyStatus') }})
|
||||
|
||||
- alert: IloOverrideSwitchState
|
||||
expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: >-
|
||||
Le switch de réinitialisation n'est pas à l'état d'origine,
|
||||
l'authentification est bypassée
|
||||
|
||||
...
|
|
@ -140,28 +140,10 @@ scrape_configs:
|
|||
replacement: 127.0.0.1:9116
|
||||
{% endif %}
|
||||
|
||||
{% if prometheus_ipmi_targets is defined %}
|
||||
- job_name: ipmi
|
||||
{% if prometheus_ilo_snmp_targets is defined %}
|
||||
- job_name: ilo_snmp
|
||||
static_configs:
|
||||
- targets: {{ prometheus_ipmi_targets | to_json }}
|
||||
metrics_path: /metrics
|
||||
params:
|
||||
module: [default]
|
||||
relabel_configs:
|
||||
# Do not put :9290 in instance name, rather here
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- source_labels: [__param_target]
|
||||
target_label: __address__
|
||||
replacement: '$1:9290'
|
||||
{% endif %}
|
||||
|
||||
{% if prometheus_ilo_targets is defined %}
|
||||
- job_name: ilo
|
||||
static_configs:
|
||||
- targets: {{ prometheus_ilo_targets | to_json }}
|
||||
- targets: {{ prometheus_ilo_snmp_targets | to_json }}
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module:
|
||||
|
|
|
@ -484,20 +484,35 @@ apc_pdu:
|
|||
|
||||
ilo:
|
||||
walk:
|
||||
- 1.3.6.1.4.1.232.6.2.15 # Power meter
|
||||
- 1.3.6.1.4.1.232.6.2.16 # POST tests
|
||||
- 1.3.6.1.4.1.232.6.2.6.8.1 # Temperature sensors
|
||||
- 1.3.6.1.4.1.232.6.2.6.7.1 # Fans
|
||||
- 1.3.6.1.4.1.232.6.2.9.3.1 # Power supply
|
||||
- 1.3.6.1.4.1.232.6.2.14.4 # Resilient memory
|
||||
- 1.3.6.1.4.1.232.6.2.15.3 # Power meter
|
||||
- 1.3.6.1.4.1.232.6.2.16.1 # POST tests
|
||||
- 1.3.6.1.4.1.232.6.2.17.1 # Battery
|
||||
- 1.3.6.1.4.1.232.6.2.6.8.1.3 # Temperature sensors location
|
||||
- 1.3.6.1.4.1.232.6.2.6.8.1.4 # Temperature sensors value
|
||||
- 1.3.6.1.4.1.232.6.2.6.8.1.5 # Temperature sensors limit
|
||||
- 1.3.6.1.4.1.232.6.2.6.8.1.6 # Temperature sensors condition
|
||||
- 1.3.6.1.4.1.232.6.2.6.7.1.3 # Fans location
|
||||
- 1.3.6.1.4.1.232.6.2.6.7.1.9 # Fans condition
|
||||
- 1.3.6.1.4.1.232.6.2.9.3.1.5 # Power supply
|
||||
- 1.3.6.1.4.1.232.9.2.2 # iLO
|
||||
metrics:
|
||||
- name: cpqHeResilientMemCondition
|
||||
oid: 1.3.6.1.4.1.232.6.2.14.4
|
||||
type: EnumAsStateSet
|
||||
help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4
|
||||
enum_values:
|
||||
1: other
|
||||
2: ok
|
||||
3: degraded
|
||||
4: failed
|
||||
- name: cpqHePowerMeterCurrReading
|
||||
oid: 1.3.6.1.4.1.232.6.2.15.3
|
||||
type: gauge
|
||||
help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
|
||||
- name: cpqHeHWBiosCondition
|
||||
oid: 1.3.6.1.4.1.232.6.2.16.1
|
||||
type: gauge
|
||||
type: EnumAsStateSet
|
||||
help: This value indicates an error has been detected during Pre-OS Test (POST)
|
||||
or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
|
||||
enum_values:
|
||||
|
@ -505,25 +520,20 @@ ilo:
|
|||
2: ok
|
||||
3: degraded
|
||||
4: failed
|
||||
- name: cpqHeTemperatureChassis
|
||||
oid: 1.3.6.1.4.1.232.6.2.6.8.1.1
|
||||
type: gauge
|
||||
help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.8.1.1
|
||||
- name: cpqHeSysBatteryCondition
|
||||
oid: 1.3.6.1.4.1.232.6.2.17.1
|
||||
type: EnumAsStateSet
|
||||
help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1
|
||||
indexes:
|
||||
- labelname: cpqHeTemperatureChassis
|
||||
- labelname: cpqHeSysBatteryChassis
|
||||
type: gauge
|
||||
- labelname: cpqHeTemperatureIndex
|
||||
type: gauge
|
||||
- name: cpqHeTemperatureIndex
|
||||
oid: 1.3.6.1.4.1.232.6.2.6.8.1.2
|
||||
type: gauge
|
||||
help: A number that uniquely specifies this temperature sensor description. -
|
||||
1.3.6.1.4.1.232.6.2.6.8.1.2
|
||||
indexes:
|
||||
- labelname: cpqHeTemperatureChassis
|
||||
type: gauge
|
||||
- labelname: cpqHeTemperatureIndex
|
||||
- labelname: cpqHeSysBatteryIndex
|
||||
type: gauge
|
||||
enum_values:
|
||||
1: other
|
||||
2: ok
|
||||
3: degraded
|
||||
4: failed
|
||||
- name: cpqHeTemperatureLocale
|
||||
oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
|
||||
type: EnumAsInfo
|
||||
|
@ -569,7 +579,7 @@ ilo:
|
|||
type: gauge
|
||||
- name: cpqHeTemperatureCondition
|
||||
oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
|
||||
type: gauge
|
||||
type: EnumAsStateSet
|
||||
help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
|
||||
indexes:
|
||||
- labelname: cpqHeTemperatureChassis
|
||||
|
@ -581,25 +591,6 @@ ilo:
|
|||
2: ok
|
||||
3: degraded
|
||||
4: failed
|
||||
- name: cpqHeFltTolFanChassis
|
||||
oid: 1.3.6.1.4.1.232.6.2.6.7.1.1
|
||||
type: gauge
|
||||
help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.7.1.1
|
||||
indexes:
|
||||
- labelname: cpqHeFltTolFanChassis
|
||||
type: gauge
|
||||
- labelname: cpqHeFltTolFanIndex
|
||||
type: gauge
|
||||
- name: cpqHeFltTolFanIndex
|
||||
oid: 1.3.6.1.4.1.232.6.2.6.7.1.2
|
||||
type: gauge
|
||||
help: A number that uniquely specifies this fan description. -
|
||||
1.3.6.1.4.1.232.6.2.6.7.1.2
|
||||
indexes:
|
||||
- labelname: cpqHeFltTolFanChassis
|
||||
type: gauge
|
||||
- labelname: cpqHeFltTolFanIndex
|
||||
type: gauge
|
||||
- name: cpqHeFltTolFanLocale
|
||||
oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
|
||||
type: EnumAsInfo
|
||||
|
@ -626,7 +617,7 @@ ilo:
|
|||
13: bridgeCard
|
||||
- name: cpqHeFltTolFanCondition
|
||||
oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
|
||||
type: gauge
|
||||
type: EnumAsStateSet
|
||||
help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
|
||||
indexes:
|
||||
- labelname: cpqHeFltTolFanChassis
|
||||
|
@ -638,27 +629,9 @@ ilo:
|
|||
2: ok
|
||||
3: degraded
|
||||
4: failed
|
||||
- name: cpqHeFltTolPowerSupplyChassis
|
||||
oid: 1.3.6.1.4.1.232.6.2.9.3.1.1
|
||||
type: gauge
|
||||
help: The system chassis number. - 1.3.6.1.4.1.232.6.2.9.3.1.1
|
||||
indexes:
|
||||
- labelname: cpqHeFltTolPowerSupplyChassis
|
||||
type: gauge
|
||||
- labelname: cpqHeFltTolPowerSupplyBay
|
||||
type: gauge
|
||||
- name: cpqHeFltTolPowerSupplyBay
|
||||
oid: 1.3.6.1.4.1.232.6.2.9.3.1.2
|
||||
type: gauge
|
||||
help: The bay number to index within this chassis. - 1.3.6.1.4.1.232.6.2.9.3.1.2
|
||||
indexes:
|
||||
- labelname: cpqHeFltTolPowerSupplyChassis
|
||||
type: gauge
|
||||
- labelname: cpqHeFltTolPowerSupplyBay
|
||||
type: gauge
|
||||
- name: cpqHeFltTolPowerSupplyStatus
|
||||
oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
|
||||
type: gauge
|
||||
type: EnumAsStateSet
|
||||
help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
|
||||
indexes:
|
||||
- labelname: cpqHeFltTolPowerSupplyChassis
|
||||
|
@ -685,7 +658,7 @@ ilo:
|
|||
17: noPowerInput
|
||||
- name: cpqSm2CntlrInterfaceStatus
|
||||
oid: 1.3.6.1.4.1.232.9.2.2.17
|
||||
type: gauge
|
||||
type: EnumAsStateSet
|
||||
help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
|
||||
enum_values:
|
||||
1: other
|
||||
|
@ -693,7 +666,7 @@ ilo:
|
|||
3: notResponding
|
||||
- name: cpqSm2CntlriLOSecurityOverrideSwitchState
|
||||
oid: 1.3.6.1.4.1.232.9.2.2.27
|
||||
type: gauge
|
||||
type: EnumAsStateSet
|
||||
help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
|
||||
enum_values:
|
||||
1: notSupported
|
||||
|
@ -701,7 +674,7 @@ ilo:
|
|||
3: notSet
|
||||
- name: cpqSm2CntlrLicenseActive
|
||||
oid: 1.3.6.1.4.1.232.9.2.2.30
|
||||
type: gauge
|
||||
type: EnumAsStateSet
|
||||
help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
|
||||
enum_values:
|
||||
1: none
|
||||
|
@ -714,7 +687,7 @@ ilo:
|
|||
8: iloAdvancedPremiumSecurity
|
||||
- name: cpqSm2CntlrServerPowerState
|
||||
oid: 1.3.6.1.4.1.232.9.2.2.32
|
||||
type: gauge
|
||||
type: EnumAsStateSet
|
||||
help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
|
||||
enum_values:
|
||||
1: unknown
|
||||
|
@ -722,6 +695,8 @@ ilo:
|
|||
3: poweredOn
|
||||
4: insufficientPowerOrPowerOnDenied
|
||||
version: 3
|
||||
# Reduce timeout to retry faster
|
||||
timeout: 1s
|
||||
auth:
|
||||
security_level: authPriv
|
||||
username: {{ snmp_ilo_user }}
|
||||
|
|
|
@ -34,10 +34,9 @@ scrape_configs:
|
|||
- '{job="postgresql"}'
|
||||
- '{job="prometheus"}'
|
||||
- '{job="unifi_snmp"}'
|
||||
- '{job="django"}'
|
||||
- '{job="ups_snmp"}'
|
||||
- '{job="apc_pdu_snmp"}'
|
||||
- '{job="docker"}'
|
||||
- '{job="switch_snmp"}'
|
||||
- '{job="ipmi"}'
|
||||
- '{job="ilo_snmp"}'
|
||||
...
|
||||
|
|
Loading…
Reference in a new issue
résiliente