From 8c7031d05994fd11e7b11d7489867376dfc13623 Mon Sep 17 00:00:00 2001
From: Alexandre Iooss
Date: Fri, 31 Dec 2021 18:31:58 +0100
Subject: [PATCH] prometheus: add iLO SNMP target

Scrape iLO (Integrated Lights-Out) controllers over SNMP:

- hosts: add the [fleming_ilo] inventory group with marki-ilo.adm.auro.re
- playbooks/prometheus.yml: expose that group as prometheus_ilo_targets
- prometheus.yml.j2: add an "ilo" scrape job, emitted only when
  prometheus_ilo_targets is defined, which requests /snmp with module
  "ilo" from 127.0.0.1:9116 and keeps the device name as the instance
  label
- snmp.yml.j2: add the "ilo" SNMPv3 (authPriv) module walking the power
  meter, POST, temperature, fan, power supply and iLO subtrees

---
 hosts                                        |   3 +
 playbooks/prometheus.yml                     |   2 +
 roles/prometheus/templates/prometheus.yml.j2 |  17 ++
 roles/prometheus/templates/snmp.yml.j2       | 248 +++++++++++++++++++
 4 files changed, 270 insertions(+)

diff --git a/hosts b/hosts
index 2f397b4..29bc3b3 100644
--- a/hosts
+++ b/hosts
@@ -92,6 +92,9 @@ unifi-fleming.adm.auro.re
 routeur-fleming.adm.auro.re
 routeur-fleming-backup.adm.auro.re
 
+[fleming_ilo]
+marki-ilo.adm.auro.re
+
 [fleming_unifi]
 fa-0-1.borne.auro.re
 fa-1-1.borne.auro.re
diff --git a/playbooks/prometheus.yml b/playbooks/prometheus.yml
index 0b658d6..8a27497 100755
--- a/playbooks/prometheus.yml
+++ b/playbooks/prometheus.yml
@@ -17,6 +17,8 @@
       {{ groups['fleming_unifi'] | list | sort }}
     prometheus_ipmi_targets: |
       {{ groups['fleming_pve'] + groups['fleming_server'] | list | sort }}
+    prometheus_ilo_targets: |
+      {{ groups['fleming_ilo'] | list | sort }}
 
     update_motd:
       prometheus: >-
diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2
index 15f12e9..e80e355 100644
--- a/roles/prometheus/templates/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@@ -158,4 +158,21 @@ scrape_configs:
         replacement: '$1:9290'
 {% endif %}
 
+{% if prometheus_ilo_targets is defined %}
+  - job_name: ilo
+    static_configs:
+      - targets: {{ prometheus_ilo_targets | to_json }}
+    metrics_path: /snmp
+    params:
+      module:
+        - ilo
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: 127.0.0.1:9116
+{% endif %}
+
 ...
diff --git a/roles/prometheus/templates/snmp.yml.j2 b/roles/prometheus/templates/snmp.yml.j2
index 5ed5fa1..d593a3d 100644
--- a/roles/prometheus/templates/snmp.yml.j2
+++ b/roles/prometheus/templates/snmp.yml.j2
@@ -482,4 +482,252 @@ apc_pdu:
     priv_protocol: AES
     priv_password: {{ snmp_pdu_password }}
 
+ilo:
+  walk:
+  - 1.3.6.1.4.1.232.6.2.15 # Power meter
+  - 1.3.6.1.4.1.232.6.2.16 # POST tests
+  - 1.3.6.1.4.1.232.6.2.6.8.1 # Temperature sensors
+  - 1.3.6.1.4.1.232.6.2.6.7.1 # Fans
+  - 1.3.6.1.4.1.232.6.2.9.3.1 # Power supply
+  - 1.3.6.1.4.1.232.9.2.2 # iLO
+  metrics:
+  - name: cpqHePowerMeterCurrReading
+    oid: 1.3.6.1.4.1.232.6.2.15.3
+    type: gauge
+    help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
+  - name: cpqHeHWBiosCondition
+    oid: 1.3.6.1.4.1.232.6.2.16.1
+    type: gauge
+    help: This value indicates an error has been detected during Pre-OS Test (POST)
+      or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHeTemperatureChassis
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.1
+    type: gauge
+    help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.8.1.1
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+  - name: cpqHeTemperatureIndex
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.2
+    type: gauge
+    help: A number that uniquely specifies this temperature sensor description. -
+      1.3.6.1.4.1.232.6.2.6.8.1.2
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+  - name: cpqHeTemperatureLocale
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
+    type: EnumAsInfo
+    help: This specifies the location of the temperature sensor present in the system.
+      - 1.3.6.1.4.1.232.6.2.6.8.1.3
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: unknown
+      3: system
+      4: systemBoard
+      5: ioBoard
+      6: cpu
+      7: memory
+      8: storage
+      9: removableMedia
+      10: powerSupply
+      11: ambient
+      12: chassis
+      13: bridgeCard
+  - name: cpqHeTemperatureCelsius
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.4
+    type: gauge
+    help: This is the current temperature sensor reading in degrees celsius - 1.3.6.1.4.1.232.6.2.6.8.1.4
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+  - name: cpqHeTemperatureThreshold
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.5
+    type: gauge
+    help: This is the shutdown threshold temperature sensor setting in degrees celsius
+      - 1.3.6.1.4.1.232.6.2.6.8.1.5
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+  - name: cpqHeTemperatureCondition
+    oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
+    type: gauge
+    help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
+    indexes:
+    - labelname: cpqHeTemperatureChassis
+      type: gauge
+    - labelname: cpqHeTemperatureIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHeFltTolFanChassis
+    oid: 1.3.6.1.4.1.232.6.2.6.7.1.1
+    type: gauge
+    help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.7.1.1
+    indexes:
+    - labelname: cpqHeFltTolFanChassis
+      type: gauge
+    - labelname: cpqHeFltTolFanIndex
+      type: gauge
+  - name: cpqHeFltTolFanIndex
+    oid: 1.3.6.1.4.1.232.6.2.6.7.1.2
+    type: gauge
+    help: A number that uniquely specifies this fan description. -
+      1.3.6.1.4.1.232.6.2.6.7.1.2
+    indexes:
+    - labelname: cpqHeFltTolFanChassis
+      type: gauge
+    - labelname: cpqHeFltTolFanIndex
+      type: gauge
+  - name: cpqHeFltTolFanLocale
+    oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
+    type: EnumAsInfo
+    help: This specifies the location of the fan present in the system.
+      - 1.3.6.1.4.1.232.6.2.6.7.1.3
+    indexes:
+    - labelname: cpqHeFltTolFanChassis
+      type: gauge
+    - labelname: cpqHeFltTolFanIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: unknown
+      3: system
+      4: systemBoard
+      5: ioBoard
+      6: cpu
+      7: memory
+      8: storage
+      9: removableMedia
+      10: powerSupply
+      11: ambient
+      12: chassis
+      13: bridgeCard
+  - name: cpqHeFltTolFanCondition
+    oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
+    type: gauge
+    help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
+    indexes:
+    - labelname: cpqHeFltTolFanChassis
+      type: gauge
+    - labelname: cpqHeFltTolFanIndex
+      type: gauge
+    enum_values:
+      1: other
+      2: ok
+      3: degraded
+      4: failed
+  - name: cpqHeFltTolPowerSupplyChassis
+    oid: 1.3.6.1.4.1.232.6.2.9.3.1.1
+    type: gauge
+    help: The system chassis number. - 1.3.6.1.4.1.232.6.2.9.3.1.1
+    indexes:
+    - labelname: cpqHeFltTolPowerSupplyChassis
+      type: gauge
+    - labelname: cpqHeFltTolPowerSupplyBay
+      type: gauge
+  - name: cpqHeFltTolPowerSupplyBay
+    oid: 1.3.6.1.4.1.232.6.2.9.3.1.2
+    type: gauge
+    help: The bay number to index within this chassis. - 1.3.6.1.4.1.232.6.2.9.3.1.2
+    indexes:
+    - labelname: cpqHeFltTolPowerSupplyChassis
+      type: gauge
+    - labelname: cpqHeFltTolPowerSupplyBay
+      type: gauge
+  - name: cpqHeFltTolPowerSupplyStatus
+    oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
+    type: gauge
+    help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
+    indexes:
+    - labelname: cpqHeFltTolPowerSupplyChassis
+      type: gauge
+    - labelname: cpqHeFltTolPowerSupplyBay
+      type: gauge
+    enum_values:
+      1: noError
+      2: generalFailure
+      3: bistFailure
+      4: fanFailure
+      5: tempFailure
+      6: interlockOpen
+      7: epromFailed
+      8: vrefFailed
+      9: dacFailed
+      10: ramTestFailed
+      11: voltageChannelFailed
+      12: orringdiodeFailed
+      13: brownOut
+      14: giveupOnStartup
+      15: nvramInvalid
+      16: calibrationTableInvalid
+      17: noPowerInput
+  - name: cpqSm2CntlrInterfaceStatus
+    oid: 1.3.6.1.4.1.232.9.2.2.17
+    type: gauge
+    help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
+    enum_values:
+      1: other
+      2: ok
+      3: notResponding
+  - name: cpqSm2CntlriLOSecurityOverrideSwitchState
+    oid: 1.3.6.1.4.1.232.9.2.2.27
+    type: gauge
+    help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
+    enum_values:
+      1: notSupported
+      2: set
+      3: notSet
+  - name: cpqSm2CntlrLicenseActive
+    oid: 1.3.6.1.4.1.232.9.2.2.30
+    type: gauge
+    help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
+    enum_values:
+      1: none
+      2: iloAdvanced
+      3: iloLight
+      4: iloAdvancedBlade
+      5: iloStandard
+      6: iloEssentials
+      7: iloScaleOut
+      8: iloAdvancedPremiumSecurity
+  - name: cpqSm2CntlrServerPowerState
+    oid: 1.3.6.1.4.1.232.9.2.2.32
+    type: gauge
+    help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
+    enum_values:
+      1: unknown
+      2: poweredOff
+      3: poweredOn
+      4: insufficientPowerOrPowerOnDenied
+  version: 3
+  auth:  # credentials are rendered with to_json so YAML-special characters in them stay literal
+    security_level: authPriv
+    username: {{ snmp_ilo_user | to_json }}
+    password: {{ snmp_ilo_auth | to_json }}
+    auth_protocol: SHA
+    priv_protocol: AES
+    priv_password: {{ snmp_ilo_priv | to_json }}
+
 ...