prometheus: cleanup + bird alert rules
This commit is contained in:
parent
071daad994
commit
3fa998ae68
1 changed file with 236 additions and 62 deletions
|
@ -35,14 +35,16 @@ prometheus__scraping:
|
|||
|
||||
prometheus__alert_rules_prometheus:
|
||||
- alert: PrometheusTsdbCompactionFailed
|
||||
expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
|
||||
expr:
|
||||
increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
prometheus__alert_rules_common:
|
||||
- alert: CollectorDown
|
||||
expr: 'up == 0'
|
||||
expr: >
|
||||
up == 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -51,26 +53,33 @@ prometheus__alert_rules_common:
|
|||
|
||||
prometheus__alert_rules_node:
|
||||
- alert: OutOfMemory
|
||||
expr: "( node_memory_MemFree_bytes
|
||||
+ node_memory_Cached_bytes
|
||||
+ node_memory_Buffers_bytes )
|
||||
/ node_memory_MemTotal_bytes * 100 < 10"
|
||||
expr:
|
||||
100 * (
|
||||
node_memory_MemFree_bytes
|
||||
+ node_memory_Cached_bytes
|
||||
+ node_memory_Buffers_bytes
|
||||
) / node_memory_MemTotal_bytes < 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
FreeMemory: !unsafe '{{ printf "%.0f" $value }} %'
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: "( 1 - ( node_memory_SwapFree_bytes
|
||||
/ node_memory_SwapTotal_bytes ) )
|
||||
* 100 >= 50"
|
||||
expr:
|
||||
100 * (
|
||||
1 - (
|
||||
node_memory_SwapFree_bytes
|
||||
/ node_memory_SwapTotal_bytes
|
||||
)
|
||||
) >= 50
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
UsedSwap: !unsafe '{{ printf "%.0f" $value }} %'
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: "node_hwmon_temp_celsius > 79"
|
||||
expr:
|
||||
node_hwmon_temp_celsius > 79
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -79,7 +88,8 @@ prometheus__alert_rules_node:
|
|||
Chip: !unsafe "{{ $labels.chip }}"
|
||||
Sensor: !unsafe "{{ $labels.sensor }}"
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: "node_hwmon_temp_crit_alarm_celsius == 1"
|
||||
expr:
|
||||
node_hwmon_temp_crit_alarm_celsius == 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -87,45 +97,54 @@ prometheus__alert_rules_node:
|
|||
Chip: !unsafe "{{ $labels.chip }}"
|
||||
Sensor: !unsafe "{{ $labels.sensor }}"
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr: 'node_md_state{state="inactive"} > 0'
|
||||
expr:
|
||||
node_md_state{state="inactive"} > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
Device: !unsafe "{{ $labels.device }}"
|
||||
- alert: HostRaidDiskFailure
|
||||
expr: 'node_md_disks{state="failed"} > 0'
|
||||
expr:
|
||||
node_md_disks{state="failed"} > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
severity: !unsafe "{{ $labels.md_device }}"
|
||||
- alert: HostOomKillDetected
|
||||
expr: "increase(node_vmstat_oom_kill[1m]) > 0"
|
||||
expr:
|
||||
increase(node_vmstat_oom_kill[1m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
PID: !unsafe "{{ $value }}"
|
||||
- alert: HostEdacCorrectableErrorsDetected
|
||||
expr: "increase(node_edac_correctable_errors_total[1m]) > 0"
|
||||
expr:
|
||||
increase(node_edac_correctable_errors_total[1m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
CorrectedErrors: !unsafe "{{ $value }}"
|
||||
- alert: HostEdacUncorrectableErrorsDetected
|
||||
expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0"
|
||||
expr:
|
||||
increase(node_edac_uncorrectable_errors_total[1m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
DetectedErrors: !unsafe "{{ $value }}"
|
||||
- alert: OutOfDiskSpace
|
||||
expr: "( node_filesystem_free_bytes
|
||||
/ node_filesystem_size_bytes * 100 < 10 )
|
||||
and on (instance, device, mountpoint)
|
||||
node_filesystem_readonly == 0"
|
||||
expr:
|
||||
(
|
||||
100 * node_filesystem_free_bytes
|
||||
/ node_filesystem_size_bytes < 10
|
||||
)
|
||||
and on (instance, device, mountpoint) (
|
||||
node_filesystem_readonly
|
||||
) == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -133,35 +152,45 @@ prometheus__alert_rules_node:
|
|||
Mountpoint: !unsafe '{{ $labels.mountpoint }}'
|
||||
FreeSpace: !unsafe '{{ printf "%.0f" $value }} %'
|
||||
- alert: HostConntrackLimit
|
||||
expr: "( node_nf_conntrack_entries
|
||||
/ node_nf_conntrack_entries_limit ) * 100 > 80"
|
||||
expr:
|
||||
100 * (
|
||||
node_nf_conntrack_entries
|
||||
/ node_nf_conntrack_entries_limit
|
||||
) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
Filled: !unsafe '{{ printf "%.0f" $value }} %'
|
||||
- alert: HostClockSkew
|
||||
expr: "(node_timex_offset_seconds > 0.05
|
||||
and deriv(node_timex_offset_seconds[5m]) >= 0)
|
||||
or (node_timex_offset_seconds < -0.05
|
||||
and deriv(node_timex_offset_seconds[5m]) <= 0)"
|
||||
expr:
|
||||
(
|
||||
node_timex_offset_seconds > 0.05
|
||||
and deriv(node_timex_offset_seconds[5m]) >= 0
|
||||
) or (
|
||||
node_timex_offset_seconds < -0.05
|
||||
and deriv(node_timex_offset_seconds[5m]) <= 0
|
||||
)
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: "min_over_time(node_timex_sync_status[1m]) == 0
|
||||
and node_timex_maxerror_seconds >= 16"
|
||||
expr:
|
||||
min_over_time(node_timex_sync_status[1m]) == 0
|
||||
and node_timex_maxerror_seconds >= 16
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: HostRequiresReboot
|
||||
expr: "node_reboot_required > 0"
|
||||
expr:
|
||||
node_reboot_required > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: OutOfInodes
|
||||
expr: "node_filesystem_files_free
|
||||
/ node_filesystem_files * 100 < 10"
|
||||
expr:
|
||||
100 * node_filesystem_files_free
|
||||
/ node_filesystem_files < 10
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -169,39 +198,46 @@ prometheus__alert_rules_node:
|
|||
Mountpoint: !unsafe "{{ $labels.mountpoint }}"
|
||||
FreeInodes: !unsafe '{{ printf "%.0f" $value }} %'
|
||||
- alert: CpuUsage
|
||||
expr: '( 100 - avg by (instance)
|
||||
( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
|
||||
* 100 ) > 75'
|
||||
expr:
|
||||
100 * (
|
||||
1 - avg by (instance) (
|
||||
irate(node_cpu_seconds_total{mode="idle"}[5m])
|
||||
)
|
||||
) > 75
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
Usage: !unsafe '{{ printf "%.0f" $value }} %'
|
||||
- alert: SystemdServiceFailed
|
||||
expr: 'node_systemd_unit_state{state="failed"} == 1'
|
||||
expr:
|
||||
node_systemd_unit_state{state="failed"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
Service: !unsafe "{{ $labels.name }}"
|
||||
- alert: LoadUsage
|
||||
expr: "node_load1 > 5"
|
||||
expr:
|
||||
node_load1 > 5
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
Load1: !unsafe '{{ printf "%.0f" $value }}'
|
||||
- alert: UnhealthyDisk
|
||||
expr: "smartmon_device_smart_healthy < 1"
|
||||
expr:
|
||||
smartmon_device_smart_healthy < 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
Disk: !unsafe "{{ $labels.disk }}"
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: 'avg by(instance)
|
||||
(rate(node_cpu_seconds_total{mode="steal"}[5m]))
|
||||
* 100 > 10'
|
||||
expr:
|
||||
100 * avg by (instance) (
|
||||
rate(node_cpu_seconds_total{mode="steal"}[5m])
|
||||
) > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -210,16 +246,20 @@ prometheus__alert_rules_node:
|
|||
|
||||
prometheus__alert_rules_keepalived:
|
||||
- alert: KeepalivedVrrpFault
|
||||
expr: 'keepalived_vrrp_state{state="fault"} > 0'
|
||||
expr:
|
||||
keepalived_vrrp_state{state="fault"} > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
Instance: !unsafe "{{ $labels.instance }}"
|
||||
- alert: KeepalivedMasterChange
|
||||
expr: 'changes(
|
||||
keepalived_vrrp_state
|
||||
{keepalived_vvrp_state="master"}[1m]) > 1'
|
||||
expr:
|
||||
changes(
|
||||
keepalived_vrrp_state{
|
||||
keepalived_vvrp_state="master"
|
||||
}[1m]
|
||||
) > 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -227,51 +267,146 @@ prometheus__alert_rules_keepalived:
|
|||
Instance: !unsafe "{{ $labels.instance }}"
|
||||
|
||||
prometheus__alert_rules_bird:
|
||||
- alert: BirdProtocolDown
|
||||
expr: "bird_protocol_up == 0"
|
||||
- record: bird:protocol_up:bgp
|
||||
expr:
|
||||
label_replace(
|
||||
bird_protocol_up{proto="BGP"},
|
||||
"group", "$1",
|
||||
"instance", "^([^0-9\\.]+)-[0-9]+.*"
|
||||
)
|
||||
# Sessions that are deliberately not made redundant
|
||||
# within a group
|
||||
- record: bird:protocol_up:bgp:non_redundant
|
||||
expr:
|
||||
bird:protocol_up:bgp{
|
||||
group="edge",
|
||||
name=~"^(oti|crans|legacy|edge)[46]$"
|
||||
}
|
||||
# Sessions that are redundant within a group
|
||||
- record: bird:protocol_up:bgp:redundant
|
||||
expr:
|
||||
bird:protocol_up:bgp
|
||||
unless
|
||||
bird:protocol_up:bgp:non_redundant
|
||||
- alert: BirdBGPRedundancyDegraded
|
||||
expr:
|
||||
(
|
||||
count by (group, name) (
|
||||
bird:protocol_up:bgp:redundant{state="Established"}
|
||||
) or (
|
||||
count by (group, name) (
|
||||
bird:protocol_up:bgp:redundant{state!="Established"}
|
||||
) * 0
|
||||
)
|
||||
) < 2
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
Session: !unsafe "{{ $labels.name }}"
|
||||
Count: !unsafe "{{ $value }}"
|
||||
Group: !unsafe "{{ $labels.group }}"
|
||||
- alert: BirdBGPDown
|
||||
expr:
|
||||
(
|
||||
count by (group, name) (
|
||||
bird:protocol_up:bgp{state="Established"}
|
||||
) or (
|
||||
count by (group, name) (
|
||||
bird:protocol_up:bgp{state!="Established"}
|
||||
) * 0
|
||||
)
|
||||
) == 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
Protocol: !unsafe "{{ $labels.name }}"
|
||||
State: !unsafe "{{ $labels.state }}"
|
||||
Session: !unsafe "{{ $labels.name }}"
|
||||
Group: !unsafe "{{ $labels.group }}"
|
||||
- alert: BirdBGPNoExportedPrefixRedundant
|
||||
expr:
|
||||
bird_protocol_prefix_export_count{
|
||||
export_filter!="REJECT",
|
||||
} * on (instance, name) (
|
||||
bird:protocol_up:bgp:redundant{state="Established"}
|
||||
) == 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
Session: !unsafe "{{ $labels.name }}"
|
||||
- alert: BirdBGPNoExportedPrefixNonRedundant
|
||||
expr:
|
||||
sum by (group) (
|
||||
bird_protocol_prefix_export_count{
|
||||
export_filter!="REJECT",
|
||||
} * on (instance, name) group_left (group) (
|
||||
bird:protocol_up:bgp:non_redundant{state="Established"}
|
||||
)
|
||||
) == 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
Session: !unsafe "{{ $labels.name }}"
|
||||
- alert: BirdOSPFNeighboursChange
|
||||
expr:
|
||||
changes(bird_ospf_neighbor_count[5m]) > 0
|
||||
or changes(bird_ospfv3_neighbor_count[5m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: BirdOSPFDown
|
||||
expr:
|
||||
bird_ospf_running == 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
Instance: !unsafe "{{ $labels.name }}"
|
||||
|
||||
prometheus__alert_rules_quanta:
|
||||
- alert: QuantaQueueOverflow
|
||||
expr: 'snAgGblQueueOverflow == 1'
|
||||
expr:
|
||||
snAgGblQueueOverflow == 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: QuantaCpuUsage
|
||||
expr: 'snAgGblCpuUtil1MinAvg > 50'
|
||||
expr:
|
||||
snAgGblCpuUtil1MinAvg > 50
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
Usage: !unsafe "{{ $value }} %"
|
||||
- alert: QuantaCpuUsage
|
||||
expr: 'snAgGblCpuUtil1MinAvg > 80'
|
||||
expr:
|
||||
snAgGblCpuUtil1MinAvg > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
Usage: !unsafe "{{ $value }} %"
|
||||
- alert: QuantaMemoryUsage
|
||||
expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50'
|
||||
expr:
|
||||
100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
UsedMemory: !unsafe "{{ $value }} %"
|
||||
- alert: QuantaMemoryUsage
|
||||
expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80'
|
||||
expr:
|
||||
100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: alert
|
||||
annotations:
|
||||
UsedMemory: !unsafe "{{ $value }} %"
|
||||
- alert: QuantaFanHealth
|
||||
expr: 'snChasFanOperStatus{snChasFanOperStatus="normal"} == 0'
|
||||
expr:
|
||||
snChasFanOperStatus{snChasFanOperStatus="normal"} == 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -279,7 +414,8 @@ prometheus__alert_rules_quanta:
|
|||
Description: !unsafe "{{ $labels.shChasFanDescription }}"
|
||||
Status: !unsafe "{{ $labels.snChasFanOperStatus }}"
|
||||
- alert: QuantaTemp
|
||||
expr: '(snAgentTempValue / 2) > 45'
|
||||
expr:
|
||||
0.5 * snAgentTempValue > 45
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -287,7 +423,8 @@ prometheus__alert_rules_quanta:
|
|||
Temperature: !unsafe "{{ $value }} °C"
|
||||
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
|
||||
- alert: QuantaTemp
|
||||
expr: '(snAgentTempValue / 2) > 60'
|
||||
expr:
|
||||
0.5 * snAgentTempValue > 60
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -295,22 +432,59 @@ prometheus__alert_rules_quanta:
|
|||
Temperature: !unsafe "{{ $value }} °C"
|
||||
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
|
||||
- alert: QuantaPowerRedundancyFailure
|
||||
expr: 'count by (instance)
|
||||
(snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"})
|
||||
< 2'
|
||||
expr:
|
||||
count by (instance) (
|
||||
snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"}
|
||||
) < 2
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
prometheus__alert_rules_switch:
|
||||
- alert: SwitchPromiscuousChange
|
||||
expr: "changes(ifPromiscuousMode[5m]) > 0"
|
||||
expr:
|
||||
changes(ifPromiscuousMode[5m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
Interface: !unsafe "{{ $labels.ifName }}
|
||||
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
|
||||
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
|
||||
- alert: SwitchInterfaceUpChange
|
||||
expr:
|
||||
changes(ifOperStatus{ifOperStatus="up"}[5m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
Interface: !unsafe "{{ $labels.ifName }}
|
||||
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
|
||||
- alert: SwitchInErrors
|
||||
expr:
|
||||
100 * irate(ifInErrors[5m]) / (
|
||||
irate(ifInUcastPkts[5m])
|
||||
+ irate(ifInNUcastPkts[5m])
|
||||
) > 0.01
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
|
||||
Interface: !unsafe "{{ $labels.ifName }}
|
||||
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
|
||||
- alert: SwitchOutErrors
|
||||
expr:
|
||||
100 * irate(ifOutErrors[5m]) / (
|
||||
irate(ifOutUcastPkts[5m])
|
||||
+ irate(ifOutNUcastPkts[5m])
|
||||
) > 0.01
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
|
||||
Interface: !unsafe "{{ $labels.ifName }}
|
||||
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
|
||||
|
||||
prometheus__alert_rules:
|
||||
common: "{{ prometheus__alert_rules_common }}"
|
||||
|
@ -319,5 +493,5 @@ prometheus__alert_rules:
|
|||
node: "{{ prometheus__alert_rules_node }}"
|
||||
keepalived: "{{ prometheus__alert_rules_keepalived }}"
|
||||
quanta: "{{ prometheus__alert_rules_quanta }}"
|
||||
#bird: "{{ prometheus__alert_rules_bird }}"
|
||||
bird: "{{ prometheus__alert_rules_bird }}"
|
||||
...
|
||||
|
|
Loading…
Reference in a new issue