2023-11-02 20:27:45 +01:00
|
|
|
---
|
|
|
|
prometheus__scraping_quanta:
|
|
|
|
targets: "{{ groups.quanta }}"
|
|
|
|
address: 127.0.0.1:9116
|
|
|
|
path: /snmp
|
2024-03-09 19:29:34 +01:00
|
|
|
timeout: 60s
|
2023-11-02 20:27:45 +01:00
|
|
|
params:
|
|
|
|
module:
|
|
|
|
- quanta
|
|
|
|
|
|
|
|
prometheus__rules_quanta:
|
|
|
|
- alert: QuantaQueueOverflow
|
|
|
|
expr:
|
|
|
|
snAgGblQueueOverflow == 1
|
|
|
|
for: 0m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: QuantaCpuUsage
|
|
|
|
expr:
|
|
|
|
snAgGblCpuUtil1MinAvg > 50
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
annotations:
|
|
|
|
Usage: !unsafe "{{ $value }} %"
|
|
|
|
- alert: QuantaCpuUsage
|
|
|
|
expr:
|
|
|
|
snAgGblCpuUtil1MinAvg > 80
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
annotations:
|
|
|
|
Usage: !unsafe "{{ $value }} %"
|
|
|
|
- alert: QuantaMemoryUsage
|
|
|
|
expr:
|
|
|
|
100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
annotations:
|
|
|
|
UsedMemory: !unsafe "{{ $value }} %"
|
|
|
|
- alert: QuantaMemoryUsage
|
|
|
|
expr:
|
|
|
|
100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80
|
|
|
|
for: 5m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
annotations:
|
|
|
|
UsedMemory: !unsafe "{{ $value }} %"
|
|
|
|
- alert: QuantaFanHealth
|
|
|
|
expr:
|
|
|
|
snChasFanOperStatus{snChasFanOperStatus="normal"} == 0
|
|
|
|
for: 0m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
annotations:
|
|
|
|
Description: !unsafe "{{ $labels.snChasFanDescription }}"
|
|
|
|
Status: !unsafe "{{ $labels.snChasFanOperStatus }}"
|
2023-11-04 13:49:52 +01:00
|
|
|
- alert: QuantaMissingIntakeTemp
|
2023-11-02 20:27:45 +01:00
|
|
|
expr:
|
2023-11-04 13:49:52 +01:00
|
|
|
count by (instance) (
|
2023-11-04 14:02:06 +01:00
|
|
|
snAgentTempValue
|
|
|
|
) unless count by (instance) (
|
2023-11-04 13:49:52 +01:00
|
|
|
snAgentTempValue{snAgentTempSensorDescr=~".*Intake.*"}
|
2023-11-04 14:02:06 +01:00
|
|
|
)
|
2023-11-04 13:49:52 +01:00
|
|
|
for: 0m
|
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
- alert: QuantaIntakeTemp
|
|
|
|
expr:
|
2024-03-10 12:04:00 +01:00
|
|
|
0.5 * snAgentTempValue{snAgentTempSensorDescr=~".*Intake.*"} > 60
|
2023-11-03 20:50:56 +01:00
|
|
|
for: 10m
|
|
|
|
keep_firing_for: 30m
|
2023-11-02 20:27:45 +01:00
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
annotations:
|
|
|
|
Temperature: !unsafe "{{ $value }} °C"
|
|
|
|
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
|
2023-11-04 13:49:52 +01:00
|
|
|
- alert: QuantaIntakeTemp
|
2023-11-02 20:27:45 +01:00
|
|
|
expr:
|
2024-03-10 12:04:00 +01:00
|
|
|
0.5 * snAgentTempValue{snAgentTempSensorDescr=~".*Intake.*"} > 70
|
2023-11-03 20:50:56 +01:00
|
|
|
for: 10m
|
|
|
|
keep_firing_for: 30m
|
2023-11-02 20:27:45 +01:00
|
|
|
labels:
|
|
|
|
severity: critical
|
|
|
|
annotations:
|
|
|
|
Temperature: !unsafe "{{ $value }} °C"
|
|
|
|
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
|
|
|
|
- alert: QuantaPowerRedundancyFailure
|
|
|
|
expr:
|
|
|
|
count by (instance) (
|
|
|
|
snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"}
|
|
|
|
) < 2
|
|
|
|
for: 0m
|
|
|
|
labels:
|
|
|
|
severity: warning
|
|
|
|
...
|