Monitor iLO using SNMP #89

Merged
erdnaxe merged 8 commits from prometheus_ilo into master 2021-12-31 18:40:20 +01:00
4 changed files with 270 additions and 0 deletions
Showing only changes of commit 8c7031d059 - Show all commits

3
hosts
View file

@ -92,6 +92,9 @@ unifi-fleming.adm.auro.re
routeur-fleming.adm.auro.re
routeur-fleming-backup.adm.auro.re
[fleming_ilo]
marki-ilo.adm.auro.re
[fleming_unifi]
fa-0-1.borne.auro.re
fa-1-1.borne.auro.re

View file

@ -17,6 +17,8 @@
{{ groups['fleming_unifi'] | list | sort }}
prometheus_ipmi_targets: |
{{ groups['fleming_pve'] + groups['fleming_server'] | list | sort }}
prometheus_ilo_targets: |
{{ groups['fleming_ilo'] | list | sort }}
update_motd:
prometheus: >-

View file

@ -158,4 +158,21 @@ scrape_configs:
replacement: '$1:9290'
{% endif %}
{% if prometheus_ilo_targets is defined %}
- job_name: ilo
static_configs:
- targets: {{ prometheus_ilo_targets | to_json }}
metrics_path: /snmp
params:
module:
- ilo
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9116
{% endif %}
...

View file

@ -482,4 +482,252 @@ apc_pdu:
priv_protocol: AES
priv_password: {{ snmp_pdu_password }}
ilo:
walk:
- 1.3.6.1.4.1.232.6.2.15 # Power meter
- 1.3.6.1.4.1.232.6.2.16 # POST tests
- 1.3.6.1.4.1.232.6.2.6.8.1 # Temperature sensors
- 1.3.6.1.4.1.232.6.2.6.7.1 # Fans
- 1.3.6.1.4.1.232.6.2.9.3.1 # Power supply
- 1.3.6.1.4.1.232.9.2.2 # iLO
metrics:
- name: cpqHePowerMeterCurrReading
oid: 1.3.6.1.4.1.232.6.2.15.3
type: gauge
help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
- name: cpqHeHWBiosCondition
oid: 1.3.6.1.4.1.232.6.2.16.1
type: gauge
help: This value indicates an error has been detected during Pre-OS Test (POST)
or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
enum_values:
1: other
2: ok
3: degraded
4: failed
- name: cpqHeTemperatureChassis
oid: 1.3.6.1.4.1.232.6.2.6.8.1.1
type: gauge
help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.8.1.1
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
- name: cpqHeTemperatureIndex
oid: 1.3.6.1.4.1.232.6.2.6.8.1.2
type: gauge
help: A number that uniquely specifies this temperature sensor description. -
1.3.6.1.4.1.232.6.2.6.8.1.2
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
- name: cpqHeTemperatureLocale
oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
type: EnumAsInfo
help: This specifies the location of the temperature sensor present in the system.
- 1.3.6.1.4.1.232.6.2.6.8.1.3
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
enum_values:
1: other
2: unknown
3: system
4: systemBoard
5: ioBoard
6: cpu
7: memory
8: storage
9: removableMedia
10: powerSupply
11: ambient
12: chassis
13: bridgeCard
- name: cpqHeTemperatureCelsius
oid: 1.3.6.1.4.1.232.6.2.6.8.1.4
type: gauge
help: This is the current temperature sensor reading in degrees celsius - 1.3.6.1.4.1.232.6.2.6.8.1.4
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
- name: cpqHeTemperatureThreshold
oid: 1.3.6.1.4.1.232.6.2.6.8.1.5
type: gauge
help: This is the shutdown threshold temperature sensor setting in degrees celsius
- 1.3.6.1.4.1.232.6.2.6.8.1.5
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
- name: cpqHeTemperatureCondition
oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
type: gauge
help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
enum_values:
1: other
2: ok
3: degraded
4: failed
- name: cpqHeFltTolFanChassis
oid: 1.3.6.1.4.1.232.6.2.6.7.1.1
type: gauge
help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.7.1.1
indexes:
- labelname: cpqHeFltTolFanChassis
type: gauge
- labelname: cpqHeFltTolFanIndex
type: gauge
- name: cpqHeFltTolFanIndex
oid: 1.3.6.1.4.1.232.6.2.6.7.1.2
type: gauge
help: A number that uniquely specifies this fan description. -
1.3.6.1.4.1.232.6.2.6.7.1.2
indexes:
- labelname: cpqHeFltTolFanChassis
type: gauge
- labelname: cpqHeFltTolFanIndex
type: gauge
- name: cpqHeFltTolFanLocale
oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
type: EnumAsInfo
help: This specifies the location of the fan present in the system.
- 1.3.6.1.4.1.232.6.2.6.7.1.3
indexes:
- labelname: cpqHeFltTolFanChassis
type: gauge
- labelname: cpqHeFltTolFanIndex
type: gauge
enum_values:
1: other
2: unknown
3: system
4: systemBoard
5: ioBoard
6: cpu
7: memory
8: storage
9: removableMedia
10: powerSupply
11: ambient
12: chassis
13: bridgeCard
- name: cpqHeFltTolFanCondition
oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
type: gauge
help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
indexes:
- labelname: cpqHeFltTolFanChassis
type: gauge
- labelname: cpqHeFltTolFanIndex
type: gauge
enum_values:
1: other
2: ok
3: degraded
4: failed
- name: cpqHeFltTolPowerSupplyChassis
oid: 1.3.6.1.4.1.232.6.2.9.3.1.1
type: gauge
help: The system chassis number. - 1.3.6.1.4.1.232.6.2.9.3.1.1
indexes:
- labelname: cpqHeFltTolPowerSupplyChassis
type: gauge
- labelname: cpqHeFltTolPowerSupplyBay
type: gauge
- name: cpqHeFltTolPowerSupplyBay
oid: 1.3.6.1.4.1.232.6.2.9.3.1.2
type: gauge
help: The bay number to index within this chassis. - 1.3.6.1.4.1.232.6.2.9.3.1.2
indexes:
- labelname: cpqHeFltTolPowerSupplyChassis
type: gauge
- labelname: cpqHeFltTolPowerSupplyBay
type: gauge
- name: cpqHeFltTolPowerSupplyStatus
oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
type: gauge
help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
indexes:
- labelname: cpqHeFltTolPowerSupplyChassis
type: gauge
- labelname: cpqHeFltTolPowerSupplyBay
type: gauge
enum_values:
1: noError
2: generalFailure
3: bistFailure
4: fanFailure
5: tempFailure
6: interlockOpen
7: epromFailed
8: vrefFailed
9: dacFailed
10: ramTestFailed
11: voltageChannelFailed
12: orringdiodeFailed
13: brownOut
14: giveupOnStartup
15: nvramInvalid
16: calibrationTableInvalid
17: noPowerInput
- name: cpqSm2CntlrInterfaceStatus
oid: 1.3.6.1.4.1.232.9.2.2.17
type: gauge
help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
enum_values:
1: other
2: ok
3: notResponding
- name: cpqSm2CntlriLOSecurityOverrideSwitchState
oid: 1.3.6.1.4.1.232.9.2.2.27
type: gauge
help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
enum_values:
1: notSupported
2: set
3: notSet
- name: cpqSm2CntlrLicenseActive
oid: 1.3.6.1.4.1.232.9.2.2.30
type: gauge
help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
enum_values:
1: none
2: iloAdvanced
3: iloLight
4: iloAdvancedBlade
5: iloStandard
6: iloEssentials
7: iloScaleOut
8: iloAdvancedPremiumSecurity
- name: cpqSm2CntlrServerPowerState
oid: 1.3.6.1.4.1.232.9.2.2.32
type: gauge
help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
enum_values:
1: unknown
2: poweredOff
3: poweredOn
4: insufficientPowerOrPowerOnDenied
version: 3
auth:
security_level: authPriv
username: {{ snmp_ilo_user }}
password: {{ snmp_ilo_auth }}
auth_protocol: SHA
priv_protocol: AES
priv_password: {{ snmp_ilo_priv }}
...