ansible/roles/prometheus/templates/ilo.rules.yml.j2

84 lines
2.5 KiB
Plaintext
Raw Normal View History

2021-12-31 21:19:35 +01:00
---
{{ ansible_managed | comment }}
{#
  raw(string): emit `string` wrapped in a literal pair of double curly
  braces, separated from them by single spaces. The braces are produced
  from quoted string literals so that Jinja prints them verbatim instead
  of evaluating `string` as an expression of its own; the rules below use
  this to leave placeholders such as $labels.<name> in the rendered file.
-#}
{% macro raw(string) -%}
{{ '{{' }} {{ string }} {{ '}}' }}
{%- endmacro %}
# Alerting rules for the `ilo.rules` group.
#
# Every rule below follows the same shape: the expression selects series of
# one metric, filtered on a label that carries the same name as the metric,
# and keeps only samples whose value is 1. The condition must hold for
# 3 minutes (`for: 3m`) before the alert fires. Where a summary ends with a
# raw(...) call, the rendered file contains a $labels.<name> placeholder
# wrapped in double curly braces.
groups:
  - name: ilo.rules
    rules:
      # Resilient memory: label value is neither "ok" nor "other".
      - alert: IloResilientMemoryDegraded
        expr: cpqHeResilientMemCondition{cpqHeResilientMemCondition!~"ok|other"} == 1
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: >-
            La mémoire vive n'est plus résiliente
            ({{ raw('$labels.cpqHeResilientMemCondition') }})

      # BIOS condition: label value is neither "ok" nor "other".
      - alert: IloBiosSelfTestDegraded
        expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: >-
            Une erreur a été détectée lors du POST du serveur
            ({{ raw('$labels.cpqHeHWBiosCondition') }})

      # System battery: label value is neither "ok" nor "other".
      - alert: IloBatteryDegraded
        expr: cpqHeSysBatteryCondition{cpqHeSysBatteryCondition!~"ok|other"} == 1
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: >-
            La batterie est dégradée
            ({{ raw('$labels.cpqHeSysBatteryCondition') }})

      # Temperature condition: label value is neither "ok" nor "other".
      - alert: IloTemperatureSensorDegraded
        expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: >-
            Le capteur de température est dégradé
            ({{ raw('$labels.cpqHeTemperatureCondition') }})

      # Fan condition: label value is neither "ok" nor "other".
      - alert: IloFanDegraded
        expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: >-
            Le ventilateur est dégradé
            ({{ raw('$labels.cpqHeFltTolFanCondition') }})

      # Power supply status: label value is anything other than "noError".
      - alert: IloPowerSupplyDegraded
        expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: >-
            L'alimentation est dégradée
            ({{ raw('$labels.cpqHeFltTolPowerSupplyStatus') }})

      # Security override switch: label value is exactly "set". The summary
      # carries no label placeholder, unlike the other rules.
      - alert: IloOverrideSwitchState
        expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: >-
            Le switch de réinitialisation n'est pas à l'état d'origine,
            l'authentification est bypassée
...