ansible/group_vars/prom/prometheus/quanta.yml

98 lines
2.4 KiB
YAML
Raw Normal View History

2023-11-02 20:27:45 +01:00
---
prometheus__scraping_quanta:
targets: "{{ groups.quanta }}"
address: 127.0.0.1:9116
path: /snmp
timeout: 60s
2023-11-02 20:27:45 +01:00
params:
module:
- quanta
prometheus__rules_quanta:
- alert: QuantaQueueOverflow
expr:
snAgGblQueueOverflow == 1
for: 0m
labels:
severity: critical
- alert: QuantaCpuUsage
expr:
snAgGblCpuUtil1MinAvg > 50
for: 5m
labels:
severity: warning
annotations:
Usage: !unsafe "{{ $value }} %"
- alert: QuantaCpuUsage
expr:
snAgGblCpuUtil1MinAvg > 80
for: 5m
labels:
severity: critical
annotations:
Usage: !unsafe "{{ $value }} %"
- alert: QuantaMemoryUsage
expr:
100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50
for: 5m
labels:
severity: warning
annotations:
UsedMemory: !unsafe "{{ $value }} %"
- alert: QuantaMemoryUsage
expr:
100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80
for: 5m
labels:
severity: alert
annotations:
UsedMemory: !unsafe "{{ $value }} %"
- alert: QuantaFanHealth
expr:
snChasFanOperStatus{snChasFanOperStatus="normal"} == 0
for: 0m
labels:
severity: critical
annotations:
Description: !unsafe "{{ $labels.shChasFanDescription }}"
Status: !unsafe "{{ $labels.snChasFanOperStatus }}"
- alert: QuantaMissingIntakeTemp
2023-11-02 20:27:45 +01:00
expr:
count by (instance) (
snAgentTempValue
) - count by (instance) (
snAgentTempValue{snAgentTempSensorDescr=~".*Intake.*"}
) == 0
for: 0m
labels:
severity: critical
- alert: QuantaIntakeTemp
expr:
0.5 * snAgentTempValue{snAgentTempSensorDescr=~".*Intake.*"} > 60
for: 10m
keep_firing_for: 30m
2023-11-02 20:27:45 +01:00
labels:
severity: warning
annotations:
Temperature: !unsafe "{{ $value }} °C"
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
- alert: QuantaIntakeTemp
2023-11-02 20:27:45 +01:00
expr:
0.5 * snAgentTempValue{snAgentTempSensorDescr=~".*Intake.*"} > 70
for: 10m
keep_firing_for: 30m
2023-11-02 20:27:45 +01:00
labels:
severity: critical
annotations:
Temperature: !unsafe "{{ $value }} °C"
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
- alert: QuantaPowerRedundancyFailure
expr:
count by (instance) (
snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"}
) < 2
for: 0m
labels:
severity: warning
...