Prometheus: cleanup #90
2 changed files with 54 additions and 5 deletions
|
@ -10,6 +10,16 @@ groups:
|
|||
- name: ilo.rules
|
||||
rules:
|
||||
|
||||
- alert: IloResilientMemoryDegraded
|
||||
expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: >-
|
||||
La mémoire vive n'est plus résiliente
|
||||
({{ raw('$labels.cpqHeResilientMemCondition') }})
|
||||
|
||||
- alert: IloBiosSelfTestDegraded
|
||||
expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
|
||||
jeltz marked this conversation as resolved
|
||||
for: 3m
|
||||
|
@ -20,6 +30,16 @@ groups:
|
|||
Une erreur a été détectée lors du POST du serveur
|
||||
({{ raw('$labels.cpqHeHWBiosCondition') }})
|
||||
|
||||
- alert: IloBatteryDegraded
|
||||
expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: >-
|
||||
La batterie est dégradée
|
||||
({{ raw('$labels.cpqHeSysBatteryCondition') }})
|
||||
|
||||
- alert: IloTemperatureSensorDegraded
|
||||
expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
|
||||
for: 3m
|
||||
|
|
|
@ -484,13 +484,28 @@ apc_pdu:
|
|||
|
||||
ilo:
|
||||
walk:
|
||||
- 1.3.6.1.4.1.232.6.2.15 # Power meter
|
||||
- 1.3.6.1.4.1.232.6.2.16 # POST tests
|
||||
- 1.3.6.1.4.1.232.6.2.6.8.1 # Temperature sensors
|
||||
- 1.3.6.1.4.1.232.6.2.6.7.1 # Fans
|
||||
- 1.3.6.1.4.1.232.6.2.9.3.1 # Power supply
|
||||
- 1.3.6.1.4.1.232.6.2.14.4 # Resilient memory
|
||||
- 1.3.6.1.4.1.232.6.2.15.3 # Power meter
|
||||
- 1.3.6.1.4.1.232.6.2.16.1 # POST tests
|
||||
- 1.3.6.1.4.1.232.6.2.17.1 # Battery
|
||||
- 1.3.6.1.4.1.232.6.2.6.8.1.3 # Temperature sensors location
|
||||
- 1.3.6.1.4.1.232.6.2.6.8.1.4 # Temperature sensors value
|
||||
- 1.3.6.1.4.1.232.6.2.6.8.1.5 # Temperature sensors limit
|
||||
- 1.3.6.1.4.1.232.6.2.6.8.1.6 # Temperature sensors condition
|
||||
- 1.3.6.1.4.1.232.6.2.6.7.1.3 # Fans location
|
||||
- 1.3.6.1.4.1.232.6.2.6.7.1.9 # Fans condition
|
||||
- 1.3.6.1.4.1.232.6.2.9.3.1.5 # Power supply
|
||||
- 1.3.6.1.4.1.232.9.2.2 # iLO
|
||||
metrics:
|
||||
- name: cpqHeResilientMemCondition
|
||||
oid: 1.3.6.1.4.1.232.6.2.14.4
|
||||
type: EnumAsStateSet
|
||||
help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4
|
||||
enum_values:
|
||||
1: other
|
||||
2: ok
|
||||
3: degraded
|
||||
4: failed
|
||||
- name: cpqHePowerMeterCurrReading
|
||||
oid: 1.3.6.1.4.1.232.6.2.15.3
|
||||
type: gauge
|
||||
|
@ -505,6 +520,20 @@ ilo:
|
|||
2: ok
|
||||
3: degraded
|
||||
4: failed
|
||||
- name: cpqHeSysBatteryCondition
|
||||
oid: 1.3.6.1.4.1.232.6.2.17.1
|
||||
type: EnumAsStateSet
|
||||
help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1
|
||||
indexes:
|
||||
- labelname: cpqHeSysBatteryChassis
|
||||
type: gauge
|
||||
- labelname: cpqHeSysBatteryIndex
|
||||
type: gauge
|
||||
enum_values:
|
||||
1: other
|
||||
2: ok
|
||||
3: degraded
|
||||
4: failed
|
||||
- name: cpqHeTemperatureLocale
|
||||
oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
|
||||
type: EnumAsInfo
|
||||
|
|
Loading…
Reference in a new issue
À quoi correspond `other` ? Ce n'est jamais une erreur ?
other = le module est absent par exemple, on a BEAUCOUP de other
Ok