prometheus: monitor iLO resilient mem and battery

This commit is contained in:
Alexandre Iooss 2022-01-01 16:45:10 +01:00
parent 5ab3dcdac2
commit 70c8e0ebe0
2 changed files with 54 additions and 5 deletions

View file

@ -10,6 +10,16 @@ groups:
- name: ilo.rules
rules:
- alert: IloResilientMemoryDegraded
expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
for: 3m
labels:
severity: warning
annotations:
summary: >-
La mémoire vive n'est plus résilente
({{ raw('$labels.cpqHeResilientMemCondition') }})
- alert: IloBiosSelfTestDegraded
expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
for: 3m
@ -20,6 +30,16 @@ groups:
Une erreur a été détectée lors du POST du serveur
({{ raw('$labels.cpqHeHWBiosCondition') }})
- alert: IloBatteryDegraded
expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
for: 3m
labels:
severity: warning
annotations:
summary: >-
La batterie est dégradée
({{ raw('$labels.cpqHeSysBatteryCondition') }})
- alert: IloTemperatureSensorDegraded
expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
for: 3m

View file

@ -484,13 +484,28 @@ apc_pdu:
ilo:
walk:
- 1.3.6.1.4.1.232.6.2.15 # Power meter
- 1.3.6.1.4.1.232.6.2.16 # POST tests
- 1.3.6.1.4.1.232.6.2.6.8.1 # Temperature sensors
- 1.3.6.1.4.1.232.6.2.6.7.1 # Fans
- 1.3.6.1.4.1.232.6.2.9.3.1 # Power supply
- 1.3.6.1.4.1.232.6.2.14.4 # Resilient memory
- 1.3.6.1.4.1.232.6.2.15.3 # Power meter
- 1.3.6.1.4.1.232.6.2.16.1 # POST tests
- 1.3.6.1.4.1.232.6.2.17.1 # Battery
- 1.3.6.1.4.1.232.6.2.6.8.1.3 # Temperature sensors location
- 1.3.6.1.4.1.232.6.2.6.8.1.4 # Temperature sensors value
- 1.3.6.1.4.1.232.6.2.6.8.1.5 # Temperature sensors limit
- 1.3.6.1.4.1.232.6.2.6.8.1.6 # Temperature sensors condition
- 1.3.6.1.4.1.232.6.2.6.7.1.3 # Fans location
- 1.3.6.1.4.1.232.6.2.6.7.1.9 # Fans condition
- 1.3.6.1.4.1.232.6.2.9.3.1.5 # Power supply
- 1.3.6.1.4.1.232.9.2.2 # iLO
metrics:
- name: cpqHeResilientMemCondition
oid: 1.3.6.1.4.1.232.6.2.14.4
type: EnumAsStateSet
help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4
enum_values:
1: other
2: ok
3: degraded
4: failed
- name: cpqHePowerMeterCurrReading
oid: 1.3.6.1.4.1.232.6.2.15.3
type: gauge
@ -505,6 +520,20 @@ ilo:
2: ok
3: degraded
4: failed
- name: cpqHeSysBatteryCondition
oid: 1.3.6.1.4.1.232.6.2.17.1
type: EnumAsStateSet
help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1
indexes:
- labelname: cpqHeSysBatteryChassis
type: gauge
- labelname: cpqHeSysBatteryIndex
type: gauge
enum_values:
1: other
2: ok
3: degraded
4: failed
- name: cpqHeTemperatureLocale
oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
type: EnumAsInfo