Monitor HP iLO resilient memory and battery condition.

Summary of this patch:
  * ilo.rules.yml.j2: adds two alerts, IloResilientMemoryDegraded and
    IloBatteryDegraded. Like the neighbouring rules, each fires with
    severity "warning" once the matching state-set series has reported a
    state other than "ok"/"other" for 3 minutes.
  * snmp.yml.j2 (module "ilo"): replaces the walked OIDs with more specific
    ones, adds walks for resilient memory and battery, and declares the
    cpqHeResilientMemCondition and cpqHeSysBatteryCondition metrics.

NOTE(review): leading whitespace and blank lines in the hunks below were
reconstructed from the hunk line counts; verify the context lines against
the target templates before applying (or apply with --ignore-whitespace).

NOTE(review): the metric named cpqHeSysBatteryCondition uses OID
1.3.6.1.4.1.232.6.2.17.1 together with two table indexes. In CPQHLTH-MIB
that OID looks like the scalar cpqHeSysBackupBatteryCondition, while the
per-battery cpqHeSysBatteryCondition column would be ...17.2.1.4 -- TODO
confirm against the MIB and adjust the name or the OID/indexes.

diff --git a/roles/prometheus/templates/ilo.rules.yml.j2 b/roles/prometheus/templates/ilo.rules.yml.j2
index 47a3e8f..8a3f72b 100644
--- a/roles/prometheus/templates/ilo.rules.yml.j2
+++ b/roles/prometheus/templates/ilo.rules.yml.j2
@@ -10,6 +10,16 @@
 groups:
   - name: ilo.rules
     rules:
+      - alert: IloResilientMemoryDegraded
+        expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
+        for: 3m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            La mémoire vive n'est plus résiliente
+            ({{ raw('$labels.cpqHeResilientMemCondition') }})
+
       - alert: IloBiosSelfTestDegraded
         expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
         for: 3m
@@ -20,6 +30,16 @@ groups:
             Une erreur a été détectée lors du POST du serveur
             ({{ raw('$labels.cpqHeHWBiosCondition') }})
 
+      - alert: IloBatteryDegraded
+        expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
+        for: 3m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            La batterie est dégradée
+            ({{ raw('$labels.cpqHeSysBatteryCondition') }})
+
       - alert: IloTemperatureSensorDegraded
         expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
         for: 3m
diff --git a/roles/prometheus/templates/snmp.yml.j2 b/roles/prometheus/templates/snmp.yml.j2
index 1171d4c..4ecd4b3 100644
--- a/roles/prometheus/templates/snmp.yml.j2
+++ b/roles/prometheus/templates/snmp.yml.j2
@@ -484,13 +484,28 @@ apc_pdu:
 
 ilo:
   walk:
-    - 1.3.6.1.4.1.232.6.2.15 # Power meter
-    - 1.3.6.1.4.1.232.6.2.16 # POST tests
-    - 1.3.6.1.4.1.232.6.2.6.8.1 # Temperature sensors
-    - 1.3.6.1.4.1.232.6.2.6.7.1 # Fans
-    - 1.3.6.1.4.1.232.6.2.9.3.1 # Power supply
+    - 1.3.6.1.4.1.232.6.2.14.4 # Resilient memory
+    - 1.3.6.1.4.1.232.6.2.15.3 # Power meter
+    - 1.3.6.1.4.1.232.6.2.16.1 # POST tests
+    - 1.3.6.1.4.1.232.6.2.17.1 # Battery
+    - 1.3.6.1.4.1.232.6.2.6.8.1.3 # Temperature sensors location
+    - 1.3.6.1.4.1.232.6.2.6.8.1.4 # Temperature sensors value
+    - 1.3.6.1.4.1.232.6.2.6.8.1.5 # Temperature sensors limit
+    - 1.3.6.1.4.1.232.6.2.6.8.1.6 # Temperature sensors condition
+    - 1.3.6.1.4.1.232.6.2.6.7.1.3 # Fans location
+    - 1.3.6.1.4.1.232.6.2.6.7.1.9 # Fans condition
+    - 1.3.6.1.4.1.232.6.2.9.3.1.5 # Power supply
     - 1.3.6.1.4.1.232.9.2.2 # iLO
   metrics:
+    - name: cpqHeResilientMemCondition
+      oid: 1.3.6.1.4.1.232.6.2.14.4
+      type: EnumAsStateSet
+      help: The resilient memory condition - 1.3.6.1.4.1.232.6.2.14.4
+      enum_values:
+        1: other
+        2: ok
+        3: degraded
+        4: failed
     - name: cpqHePowerMeterCurrReading
       oid: 1.3.6.1.4.1.232.6.2.15.3
       type: gauge
@@ -505,6 +520,20 @@ ilo:
         2: ok
         3: degraded
         4: failed
+    - name: cpqHeSysBatteryCondition
+      oid: 1.3.6.1.4.1.232.6.2.17.1
+      type: EnumAsStateSet
+      help: The battery condition - 1.3.6.1.4.1.232.6.2.17.1
+      indexes:
+        - labelname: cpqHeSysBatteryChassis
+          type: gauge
+        - labelname: cpqHeSysBatteryIndex
+          type: gauge
+      enum_values:
+        1: other
+        2: ok
+        3: degraded
+        4: failed
     - name: cpqHeTemperatureLocale
       oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
       type: EnumAsInfo