diff --git a/group_vars/prom/prometheus.yml b/group_vars/prom/prometheus.yml
index 61cd1a4..0d6bf5c 100644
--- a/group_vars/prom/prometheus.yml
+++ b/group_vars/prom/prometheus.yml
@@ -236,10 +236,80 @@ prometheus__alert_rules_bird:
 
 prometheus__alert_rules_quanta:
   - alert: QuantaQueueOverflow
-    expr: "snAgGblQueueOverflow == 1"
+    expr: 'snAgGblQueueOverflow == 1'
     for: 0m
     labels:
       severity: critical
+  # CPU utilisation sustained for 5m: warning above 50 %, critical above 80 %.
+  - alert: QuantaCpuUsage
+    expr: 'snAgGblCpuUtil1MinAvg > 50'
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Utilisation forte du processus ({{ '$value' | interp }}%)"
+  - alert: QuantaCpuUsage
+    expr: 'snAgGblCpuUtil1MinAvg > 80'
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Utilisation intense du processus ({{ '$value' | interp }}%)"
+  # Memory in use as a percentage, derived from the free/total dynamic memory
+  # gauges: warning above 50 %, critical above 80 %.
+  - alert: QuantaMemoryUsage
+    expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50'
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Utilisation forte de la mémoire ({{ '$value' | interp }}%)"
+  - alert: QuantaMemoryUsage
+    expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80'
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Utilisation intense de la mémoire ({{ '$value' | interp }}%)"
+  # Select the non-normal state series that is currently set (value 1) so that
+  # $labels.snChasFanOperStatus in the summary names the actual state.
+  # NOTE(review): assumes one 0/1 series per state per fan - confirm.
+  - alert: QuantaFanHealth
+    expr: 'snChasFanOperStatus{snChasFanOperStatus!="normal"} == 1'
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Le ventilateur {{ '$labels.snChasFanDescription' | interp }} est
+        en mode {{ '$labels.snChasFanOperStatus' | interp }}"
+  # snAgentTempValue is halved before being compared and displayed as °C
+  # (presumably the sensor reports half-degree steps - confirm against the MIB).
+  - alert: QuantaTemp
+    expr: '(snAgentTempValue / 2) > 45'
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "La température de
+        {{ '$labels.snAgentTempSensorDescr' | interp }} est élevée
+        ({{ '$value' | interp }}°C)"
+  - alert: QuantaTemp
+    expr: '(snAgentTempValue / 2) > 60'
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: "La température de
+        {{ '$labels.snAgentTempSensorDescr' | interp }} est très élevée
+        ({{ '$value' | interp }}°C)"
+  # sum() adds the values of the "normal" state series, giving the number of
+  # supplies currently normal; count() would count the series whatever their
+  # value. NOTE(review): assumes one 0/1 series per state - confirm.
+  - alert: QuantaPowerRedundancyFailure
+    expr: 'sum by (instance) (snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"}) < 2'
+    for: 0m
+    labels:
+      severity: warning
 
 prometheus__alert_rules:
   prometheus: "{{ prometheus__alert_rules_prometheus }}"