prometheus: refactoring of the config
This commit is contained in:
parent
2928d7e809
commit
16a0d95936
15 changed files with 556 additions and 530 deletions
|
@ -1,526 +0,0 @@
|
||||||
---
|
|
||||||
# host:port entries listed as Alertmanager targets.
prometheus__alertmanager_targets:
  - docker-ovh.adm.auro.re:9093

# Value passed as the TSDB retention time.
prometheus__tsdb_retention_time: 90d
|
|
||||||
|
|
||||||
# Scrape jobs, keyed by job name. Each job lists its targets (an inventory
# group expression) and how to reach the exporter.
# NOTE(review): `address` is a mapping holding `port` for most jobs and a
# plain "host:port" string for `quanta` — confirm against the role template.
prometheus__scraping:
  node:
    targets: "{{ groups.vm_network + groups.pve_network }}"
    address:
      port: 9100

  prometheus:
    targets: "{{ groups.prom }}"
    address:
      port: 9090

  kresd:
    targets: "{{ groups.dns }}"
    address:
      port: 8453

  bird:
    targets: "{{ groups.router }}"
    address:
      port: 9324

  # Scraped through a fixed local address with path /snmp and module "quanta".
  quanta:
    targets: "{{ groups.quanta }}"
    address: 127.0.0.1:9116
    path: /snmp
    params:
      module:
        - quanta

  snmp:
    targets: "{{ groups.prom }}"
    address:
      port: 9116
|
|
||||||
|
|
||||||
# Alerts about the Prometheus server itself.
prometheus__alert_rules_prometheus:
  # Fires as soon as the failed-compaction counter increases within 1 minute.
  - alert: PrometheusTsdbCompactionFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
|
|
||||||
|
|
||||||
# Alerts that apply to every scrape job.
prometheus__alert_rules_common:
  # A target whose `up` series has been 0 for 3 minutes.
  - alert: CollectorDown
    expr: >
      up == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      Job: !unsafe "{{ $labels.job }}"
|
|
||||||
|
|
||||||
# Host-level alerts built on node_* metrics (plus smartmon_* for disks).
# Annotation values are tagged !unsafe so Ansible leaves the Prometheus
# template syntax untouched.
prometheus__alert_rules_node:
  # Less than 10 % of memory is free, cached or buffered.
  - alert: OutOfMemory
    expr:
      100 * (
        node_memory_MemFree_bytes
        + node_memory_Cached_bytes
        + node_memory_Buffers_bytes
      ) / node_memory_MemTotal_bytes < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      FreeMemory: !unsafe '{{ printf "%.0f" $value }} %'

  # At least half of the swap is in use.
  - alert: HostSwapIsFillingUp
    expr:
      100 * (
        1 - (
          node_memory_SwapFree_bytes
          / node_memory_SwapTotal_bytes
        )
      ) >= 50
    for: 3m
    labels:
      severity: critical
    annotations:
      UsedSwap: !unsafe '{{ printf "%.0f" $value }} %'

  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 79
    for: 3m
    labels:
      severity: critical
    annotations:
      Temperature: !unsafe '{{ printf "%.0f" $value }} °C'
      Chip: !unsafe "{{ $labels.chip }}"
      Sensor: !unsafe "{{ $labels.sensor }}"

  - alert: HostNodeOvertemperatureAlarm
    expr: node_hwmon_temp_crit_alarm_celsius == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      Chip: !unsafe "{{ $labels.chip }}"
      Sensor: !unsafe "{{ $labels.sensor }}"

  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Device: !unsafe "{{ $labels.device }}"

  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      # Was keyed `severity` and read `md_device`; now named after what it
      # shows and reads the same `device` label as HostRaidArrayGotInactive.
      # NOTE(review): confirm no relabeling introduces `md_device`.
      Device: !unsafe "{{ $labels.device }}"

  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      # $value is the increase of the OOM-kill counter, not a process id.
      Kills: !unsafe "{{ $value }}"

  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      CorrectedErrors: !unsafe "{{ $value }}"

  - alert: HostEdacUncorrectableErrorsDetected
    expr: increase(node_edac_uncorrectable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      DetectedErrors: !unsafe "{{ $value }}"

  # Less than 10 % free space, restricted to filesystems whose
  # node_filesystem_readonly value is 0.
  - alert: OutOfDiskSpace
    expr:
      (
        100 * node_filesystem_free_bytes
        / node_filesystem_size_bytes < 10
      )
      and on (instance, device, mountpoint) (
        node_filesystem_readonly
      ) == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      Mountpoint: !unsafe '{{ $labels.mountpoint }}'
      FreeSpace: !unsafe '{{ printf "%.0f" $value }} %'

  - alert: HostConntrackLimit
    expr:
      100 * (
        node_nf_conntrack_entries
        / node_nf_conntrack_entries_limit
      ) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      Filled: !unsafe '{{ printf "%.0f" $value }} %'

  # Offset beyond ±50 ms whose derivative is not bringing it back to zero.
  - alert: HostClockSkew
    expr:
      (
        node_timex_offset_seconds > 0.05
        and deriv(node_timex_offset_seconds[5m]) >= 0
      ) or (
        node_timex_offset_seconds < -0.05
        and deriv(node_timex_offset_seconds[5m]) <= 0
      )
    for: 2m
    labels:
      severity: warning

  - alert: HostClockNotSynchronising
    expr:
      min_over_time(node_timex_sync_status[1m]) == 0
      and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning

  - alert: HostRequiresReboot
    expr: node_reboot_required > 0
    for: 5m
    labels:
      severity: warning

  - alert: OutOfInodes
    expr:
      100 * node_filesystem_files_free
      / node_filesystem_files < 10
    for: 3m
    labels:
      severity: warning
    annotations:
      Mountpoint: !unsafe "{{ $labels.mountpoint }}"
      FreeInodes: !unsafe '{{ printf "%.0f" $value }} %'

  # Average non-idle CPU share per instance above 75 %.
  - alert: CpuUsage
    expr:
      100 * (
        1 - avg by (instance) (
          irate(node_cpu_seconds_total{mode="idle"}[5m])
        )
      ) > 75
    for: 10m
    labels:
      severity: warning
    annotations:
      Usage: !unsafe '{{ printf "%.0f" $value }} %'

  - alert: SystemdServiceFailed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 10m
    labels:
      severity: warning
    annotations:
      Service: !unsafe "{{ $labels.name }}"

  - alert: LoadUsage
    expr: node_load1 > 5
    for: 2m
    labels:
      severity: warning
    annotations:
      Load1: !unsafe '{{ printf "%.0f" $value }}'

  - alert: UnhealthyDisk
    expr: smartmon_device_smart_healthy < 1
    for: 10m
    labels:
      severity: critical
    annotations:
      Disk: !unsafe "{{ $labels.disk }}"

  # Average steal-time share per instance above 10 %.
  - alert: HostCpuStealNoisyNeighbor
    expr:
      100 * avg by (instance) (
        rate(node_cpu_seconds_total{mode="steal"}[5m])
      ) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      # `avg by (instance)` keeps only the instance label, so the previous
      # `Disk: {{ $labels.disk }}` annotation was always empty; report the
      # measured steal percentage instead.
      Steal: !unsafe '{{ printf "%.0f" $value }} %'
|
|
||||||
|
|
||||||
# Alerts on keepalived VRRP instances.
prometheus__alert_rules_keepalived:
  - alert: KeepalivedVrrpFault
    expr: keepalived_vrrp_state{state="fault"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Instance: !unsafe "{{ $labels.instance }}"

  # More than one change of the "master" state series within a minute.
  - alert: KeepalivedMasterChange
    expr:
      changes(
        keepalived_vrrp_state{
          state="master"
        }[1m]
      ) > 1
    for: 0m
    labels:
      severity: warning
    annotations:
      Instance: !unsafe "{{ $labels.instance }}"
    # The matcher previously used the misspelled label `keepalived_vvrp_state`,
    # which selects no series, so the alert could never fire. It now uses the
    # same `state` label as KeepalivedVrrpFault.
    # NOTE(review): confirm the label name exposed by the exporter in use.
|
|
||||||
|
|
||||||
# Recording and alerting rules for the BIRD routing daemon.
prometheus__alert_rules_bird:
  # BGP protocol series, minus the sessions matched by the name regex, with a
  # `group` label derived from the leading part of the `instance` label.
  - record: bird:protocol_up:bgp
    expr:
      label_replace(
        bird_protocol_up{proto="BGP",}
        unless bird_protocol_up{
          proto="BGP",
          name=~"^(viarezo|isp[12]|rezel)[46]$"
        },
        "group", "$1",
        "instance", "^([^0-9\\.]+)-[0-9]+.*"
      )

  # Sessions that are deliberately not made redundant within a group
  - record: bird:protocol_up:bgp:non_redundant
    expr:
      bird:protocol_up:bgp{
        group="edge",
        name=~"^(oti|crans|legacy|edge)[46]$"
      }

  # Sessions that are redundant
  - record: bird:protocol_up:bgp:redundant
    expr:
      bird:protocol_up:bgp
      unless
      bird:protocol_up:bgp:non_redundant

  # Number of Established sessions per (group, name). The `or (...) * 0`
  # branch supplies 0 for pairs that only have non-Established sessions,
  # because count() yields no series for an empty selection.
  - alert: BirdBGPRedundancyDegraded
    expr:
      (
        count by (group, name) (
          bird:protocol_up:bgp:redundant{state="Established"}
        ) or (
          count by (group, name) (
            bird:protocol_up:bgp:redundant{state!="Established"}
          ) * 0
        )
      ) < 2
    for: 0m
    labels:
      severity: warning
    annotations:
      Session: !unsafe "{{ $labels.name }}"
      Count: !unsafe "{{ $value }}"
      Group: !unsafe "{{ $labels.group }}"

  # Same counting as above over all BGP sessions; fires when none is up.
  - alert: BirdBGPDown
    expr:
      (
        count by (group, name) (
          bird:protocol_up:bgp{state="Established"}
        ) or (
          count by (group, name) (
            bird:protocol_up:bgp{state!="Established"}
          ) * 0
        )
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Session: !unsafe "{{ $labels.name }}"
      Group: !unsafe "{{ $labels.group }}"

  - alert: BirdBGPNoExportedPrefixRedundant
    expr:
      bird_protocol_prefix_export_count{
        export_filter!="REJECT",
      } * on (instance, name) (
        bird:protocol_up:bgp:redundant{state="Established"}
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Session: !unsafe "{{ $labels.name }}"

  - alert: BirdBGPNoImportedPrefixRedundant
    expr:
      bird_protocol_prefix_import_count{
        import_filter!="REJECT",
      } * on (instance, name) (
        bird:protocol_up:bgp:redundant{state="Established"}
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      # Was a bare `annotations:` (null); filled in to match the export rule.
      Session: !unsafe "{{ $labels.name }}"

  - alert: BirdBGPNoExportedPrefixNonRedundant
    expr:
      sum by (group) (
        bird_protocol_prefix_export_count{
          export_filter!="REJECT",
        } * on (instance, name) group_left (group) (
          bird:protocol_up:bgp:non_redundant{state="Established"}
        )
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Session: !unsafe "{{ $labels.name }}"

  - alert: BirdBGPNoImportedPrefixNonRedundant
    expr:
      sum by (group) (
        bird_protocol_prefix_import_count{
          import_filter!="REJECT",
        } * on (instance, name) group_left (group) (
          bird:protocol_up:bgp:non_redundant{state="Established"}
        )
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Session: !unsafe "{{ $labels.name }}"

  - alert: BirdOSPFNeighboursChange
    expr:
      changes(bird_ospf_neighbor_count[5m]) > 0
      or changes(bird_ospfv3_neighbor_count[5m]) > 0
    for: 0m
    labels:
      severity: warning

  - alert: BirdOSPFDown
    expr: bird_ospf_running == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Instance: !unsafe "{{ $labels.name }}"
|
|
||||||
|
|
||||||
# Alerts for the Quanta switches, built on the sn* SNMP metrics.
# Several alerts exist twice under the same name: once at warning level and
# once at critical level with a higher threshold.
prometheus__alert_rules_quanta:
  - alert: QuantaQueueOverflow
    expr: snAgGblQueueOverflow == 1
    for: 0m
    labels:
      severity: critical

  - alert: QuantaCpuUsage
    expr: snAgGblCpuUtil1MinAvg > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      Usage: !unsafe "{{ $value }} %"

  - alert: QuantaCpuUsage
    expr: snAgGblCpuUtil1MinAvg > 80
    for: 5m
    labels:
      severity: critical
    annotations:
      Usage: !unsafe "{{ $value }} %"

  - alert: QuantaMemoryUsage
    expr: 100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      UsedMemory: !unsafe "{{ $value }} %"

  - alert: QuantaMemoryUsage
    expr: 100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80
    for: 5m
    labels:
      # Was `alert`, a value no other rule uses; aligned with the
      # warning/critical pair used by the other two-level alerts.
      severity: critical
    annotations:
      UsedMemory: !unsafe "{{ $value }} %"

  - alert: QuantaFanHealth
    expr: snChasFanOperStatus{snChasFanOperStatus="normal"} == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      # Label name corrected from `shChasFanDescription` to the `sn` prefix
      # shared by every other label and metric in these rules.
      # NOTE(review): confirm the snmp_exporter module exposes this label.
      Description: !unsafe "{{ $labels.snChasFanDescription }}"
      Status: !unsafe "{{ $labels.snChasFanOperStatus }}"

  # The raw value is multiplied by 0.5 before being compared and displayed.
  - alert: QuantaTemp
    expr: 0.5 * snAgentTempValue > 45
    for: 0m
    labels:
      severity: warning
    annotations:
      Temperature: !unsafe "{{ $value }} °C"
      Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"

  - alert: QuantaTemp
    expr: 0.5 * snAgentTempValue > 60
    for: 0m
    labels:
      severity: critical
    annotations:
      Temperature: !unsafe "{{ $value }} °C"
      Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"

  # Fewer than two power supplies report the "normal" status.
  - alert: QuantaPowerRedundancyFailure
    expr:
      count by (instance) (
        snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"}
      ) < 2
    for: 0m
    labels:
      severity: warning
|
|
||||||
|
|
||||||
# Alerts on switch interfaces, built on the if* SNMP metrics.
# The Interface annotation shows ifName, followed by "- <ifAlias>" when the
# ifAlias label is set.
prometheus__alert_rules_switch:
  - alert: SwitchPromiscuousChange
    expr: changes(ifPromiscuousMode[5m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      Interface: !unsafe >-
        {{ $labels.ifName }}
        {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}

  - alert: SwitchInterfaceUpChange
    expr: changes(ifOperStatus{ifOperStatus="up"}[5m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      Interface: !unsafe >-
        {{ $labels.ifName }}
        {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}

  # Inbound errors as a percentage of inbound unicast + non-unicast packets.
  - alert: SwitchInErrors
    expr:
      100 * irate(ifInErrors[5m]) / (
        irate(ifInUcastPkts[5m])
        + irate(ifInNUcastPkts[5m])
      ) > 0.01
    for: 0m
    labels:
      severity: warning
    annotations:
      ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
      Interface: !unsafe >-
        {{ $labels.ifName }}
        {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}

  # Outbound counterpart of SwitchInErrors.
  - alert: SwitchOutErrors
    expr:
      100 * irate(ifOutErrors[5m]) / (
        irate(ifOutUcastPkts[5m])
        + irate(ifOutNUcastPkts[5m])
      ) > 0.01
    for: 0m
    labels:
      severity: warning
    annotations:
      ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
      Interface: !unsafe >-
        {{ $labels.ifName }}
        {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}
|
|
||||||
|
|
||||||
# Rule groups keyed by name; each value is one of the per-topic rule lists
# defined above.
prometheus__alert_rules:
  common: "{{ prometheus__alert_rules_common }}"
  switch: "{{ prometheus__alert_rules_switch }}"
  prometheus: "{{ prometheus__alert_rules_prometheus }}"
  node: "{{ prometheus__alert_rules_node }}"
  keepalived: "{{ prometheus__alert_rules_keepalived }}"
  quanta: "{{ prometheus__alert_rules_quanta }}"
  bird: "{{ prometheus__alert_rules_bird }}"
|
|
||||||
...
|
|
139
group_vars/prom/prometheus/bird.yml
Normal file
139
group_vars/prom/prometheus/bird.yml
Normal file
|
@ -0,0 +1,139 @@
|
||||||
|
---
|
||||||
|
# Scrape job for BIRD: hosts of the `router` inventory group on port 9324.
# NOTE(review): `port` is nested under `address` — confirm against the role.
prometheus__scraping_bird:
  targets: "{{ groups.router }}"
  address:
    port: 9324
|
||||||
|
|
||||||
|
# Recording and alerting rules for the BIRD routing daemon.
prometheus__rules_bird:
  # BGP protocol series, minus the sessions matched by the name regex, with a
  # `group` label derived from the leading part of the `instance` label.
  - record: bird:protocol_up:bgp
    expr:
      label_replace(
        bird_protocol_up{proto="BGP",}
        unless bird_protocol_up{
          proto="BGP",
          name=~"^(viarezo|isp[12]|rezel)[46]$"
        },
        "group", "$1",
        "instance", "^([^0-9\\.]+)-[0-9]+.*"
      )

  # Sessions that are deliberately not made redundant within a group
  - record: bird:protocol_up:bgp:non_redundant
    expr:
      bird:protocol_up:bgp{
        group="edge",
        name=~"^(oti|crans|legacy|edge)[46]$"
      }

  # Sessions that are redundant
  - record: bird:protocol_up:bgp:redundant
    expr:
      bird:protocol_up:bgp
      unless
      bird:protocol_up:bgp:non_redundant

  # Number of Established sessions per (group, name). The `or (...) * 0`
  # branch supplies 0 for pairs that only have non-Established sessions,
  # because count() yields no series for an empty selection.
  - alert: BirdBGPRedundancyDegraded
    expr:
      (
        count by (group, name) (
          bird:protocol_up:bgp:redundant{state="Established"}
        ) or (
          count by (group, name) (
            bird:protocol_up:bgp:redundant{state!="Established"}
          ) * 0
        )
      ) < 2
    for: 0m
    labels:
      severity: warning
    annotations:
      Session: !unsafe "{{ $labels.name }}"
      Count: !unsafe "{{ $value }}"
      Group: !unsafe "{{ $labels.group }}"

  # Same counting as above over all BGP sessions; fires when none is up.
  - alert: BirdBGPDown
    expr:
      (
        count by (group, name) (
          bird:protocol_up:bgp{state="Established"}
        ) or (
          count by (group, name) (
            bird:protocol_up:bgp{state!="Established"}
          ) * 0
        )
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Session: !unsafe "{{ $labels.name }}"
      Group: !unsafe "{{ $labels.group }}"

  # group_left (group) carries the group label from the recorded series onto
  # the prefix counter so it can be shown in the annotations.
  - alert: BirdBGPNoExportedPrefixRedundant
    expr:
      bird_protocol_prefix_export_count{
        export_filter!="REJECT",
      } * on (instance, name) group_left (group) (
        bird:protocol_up:bgp:redundant{state="Established"}
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Session: !unsafe "{{ $labels.name }}"
      Group: !unsafe "{{ $labels.group }}"

  - alert: BirdBGPNoImportedPrefixRedundant
    expr:
      bird_protocol_prefix_import_count{
        import_filter!="REJECT",
      } * on (instance, name) group_left (group) (
        bird:protocol_up:bgp:redundant{state="Established"}
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Session: !unsafe "{{ $labels.name }}"
      Group: !unsafe "{{ $labels.group }}"

  # NOTE(review): `sum by (group)` keeps only the group label, so the Session
  # annotation of the two NonRedundant alerts looks like it renders empty —
  # confirm whether that is intended.
  - alert: BirdBGPNoExportedPrefixNonRedundant
    expr:
      sum by (group) (
        bird_protocol_prefix_export_count{
          export_filter!="REJECT",
        } * on (instance, name) group_left (group) (
          bird:protocol_up:bgp:non_redundant{state="Established"}
        )
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Session: !unsafe "{{ $labels.name }}"
      Group: !unsafe "{{ $labels.group }}"

  - alert: BirdBGPNoImportedPrefixNonRedundant
    expr:
      sum by (group) (
        bird_protocol_prefix_import_count{
          import_filter!="REJECT",
        } * on (instance, name) group_left (group) (
          bird:protocol_up:bgp:non_redundant{state="Established"}
        )
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Session: !unsafe "{{ $labels.name }}"
      Group: !unsafe "{{ $labels.group }}"

  - alert: BirdOSPFNeighboursChange
    expr:
      changes(bird_ospf_neighbor_count[5m]) > 0
      or changes(bird_ospfv3_neighbor_count[5m]) > 0
    for: 0m
    labels:
      severity: warning

  - alert: BirdOSPFDown
    expr: bird_ospf_running == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Instance: !unsafe "{{ $labels.name }}"
|
||||||
|
...
|
11
group_vars/prom/prometheus/common.yml
Normal file
11
group_vars/prom/prometheus/common.yml
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
---
|
||||||
|
# Alerts that apply to every scrape job.
prometheus__rules_common:
  # A target whose `up` series has been 0 for 3 minutes.
  - alert: CollectorDown
    expr: >
      up == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      Job: !unsafe "{{ $labels.job }}"
|
||||||
|
...
|
23
group_vars/prom/prometheus/keepalived.yml
Normal file
23
group_vars/prom/prometheus/keepalived.yml
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
---
|
||||||
|
# Alerts on keepalived VRRP instances.
prometheus__rules_keepalived:
  - alert: KeepalivedVrrpFault
    expr: keepalived_vrrp_state{state="fault"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Instance: !unsafe "{{ $labels.instance }}"

  # More than one change of the "master" state series within a minute.
  - alert: KeepalivedMasterChange
    expr:
      changes(
        keepalived_vrrp_state{
          state="master"
        }[1m]
      ) > 1
    for: 0m
    labels:
      severity: warning
    annotations:
      Instance: !unsafe "{{ $labels.instance }}"
    # The matcher previously used the misspelled label `keepalived_vvrp_state`,
    # which selects no series, so the alert could never fire. It now uses the
    # same `state` label as KeepalivedVrrpFault.
    # NOTE(review): confirm the label name exposed by the exporter in use.
|
||||||
|
...
|
6
group_vars/prom/prometheus/kresd.yml
Normal file
6
group_vars/prom/prometheus/kresd.yml
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
---
|
||||||
|
# Scrape job for kresd: hosts of the `dns` inventory group on port 8453.
# NOTE(review): `port` is nested under `address` — confirm against the role.
prometheus__scraping_kresd:
  targets: "{{ groups.dns }}"
  address:
    port: 8453
|
||||||
|
...
|
23
group_vars/prom/prometheus/main.yml
Normal file
23
group_vars/prom/prometheus/main.yml
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
---
|
||||||
|
# host:port entries listed as Alertmanager targets.
prometheus__alertmanager_targets:
  - docker-ovh.adm.auro.re:9093

# Value passed as the TSDB retention time.
prometheus__tsdb_retention_time: 90d
|
||||||
|
|
||||||
|
# Scrape jobs keyed by job name; each value comes from a
# prometheus__scraping_<job> variable.
prometheus__scraping:
  node: "{{ prometheus__scraping_node }}"
  prometheus: "{{ prometheus__scraping_prometheus }}"
  kresd: "{{ prometheus__scraping_kresd }}"
  bird: "{{ prometheus__scraping_bird }}"
  quanta: "{{ prometheus__scraping_quanta }}"
  snmp: "{{ prometheus__scraping_snmp }}"
|
||||||
|
|
||||||
|
# Rule groups keyed by name; each value comes from a
# prometheus__rules_<group> variable.
prometheus__rules:
  common: "{{ prometheus__rules_common }}"
  switch: "{{ prometheus__rules_switch }}"
  prometheus: "{{ prometheus__rules_prometheus }}"
  node: "{{ prometheus__rules_node }}"
  keepalived: "{{ prometheus__rules_keepalived }}"
  quanta: "{{ prometheus__rules_quanta }}"
  bird: "{{ prometheus__rules_bird }}"
|
||||||
|
...
|
199
group_vars/prom/prometheus/node.yml
Normal file
199
group_vars/prom/prometheus/node.yml
Normal file
|
@ -0,0 +1,199 @@
|
||||||
|
---
|
||||||
|
# Scrape job for the node exporter: every host of the `vm_network` and
# `pve_network` inventory groups, on port 9100.
# NOTE(review): `port` is nested under `address` — confirm against the role.
prometheus__scraping_node:
  targets: "{{ groups.vm_network + groups.pve_network }}"
  address:
    port: 9100
|
||||||
|
|
||||||
|
# Host-level alerts built on node_* metrics (plus smartmon_* for disks).
# Annotation values are tagged !unsafe so Ansible leaves the Prometheus
# template syntax untouched.
prometheus__rules_node:
  # Less than 10 % of memory is free, cached or buffered.
  - alert: OutOfMemory
    expr:
      100 * (
        node_memory_MemFree_bytes
        + node_memory_Cached_bytes
        + node_memory_Buffers_bytes
      ) / node_memory_MemTotal_bytes < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      FreeMemory: !unsafe '{{ printf "%.0f" $value }} %'

  # At least half of the swap is in use.
  - alert: HostSwapIsFillingUp
    expr:
      100 * (
        1 - (
          node_memory_SwapFree_bytes
          / node_memory_SwapTotal_bytes
        )
      ) >= 50
    for: 3m
    labels:
      severity: critical
    annotations:
      UsedSwap: !unsafe '{{ printf "%.0f" $value }} %'

  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 79
    for: 3m
    labels:
      severity: critical
    annotations:
      Temperature: !unsafe '{{ printf "%.0f" $value }} °C'
      Chip: !unsafe "{{ $labels.chip }}"
      Sensor: !unsafe "{{ $labels.sensor }}"

  - alert: HostNodeOvertemperatureAlarm
    expr: node_hwmon_temp_crit_alarm_celsius == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      Chip: !unsafe "{{ $labels.chip }}"
      Sensor: !unsafe "{{ $labels.sensor }}"

  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Device: !unsafe "{{ $labels.device }}"

  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      # Was keyed `severity` and read `md_device`; now named after what it
      # shows and reads the same `device` label as HostRaidArrayGotInactive.
      # NOTE(review): confirm no relabeling introduces `md_device`.
      Device: !unsafe "{{ $labels.device }}"

  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      # $value is the increase of the OOM-kill counter, not a process id.
      Kills: !unsafe "{{ $value }}"

  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      CorrectedErrors: !unsafe "{{ $value }}"

  - alert: HostEdacUncorrectableErrorsDetected
    expr: increase(node_edac_uncorrectable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      DetectedErrors: !unsafe "{{ $value }}"

  # Less than 10 % free space, restricted to filesystems whose
  # node_filesystem_readonly value is 0.
  - alert: OutOfDiskSpace
    expr:
      (
        100 * node_filesystem_free_bytes
        / node_filesystem_size_bytes < 10
      )
      and on (instance, device, mountpoint) (
        node_filesystem_readonly
      ) == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      Mountpoint: !unsafe '{{ $labels.mountpoint }}'
      FreeSpace: !unsafe '{{ printf "%.0f" $value }} %'

  - alert: HostConntrackLimit
    expr:
      100 * (
        node_nf_conntrack_entries
        / node_nf_conntrack_entries_limit
      ) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      Filled: !unsafe '{{ printf "%.0f" $value }} %'

  # Offset beyond ±50 ms whose derivative is not bringing it back to zero.
  - alert: HostClockSkew
    expr:
      (
        node_timex_offset_seconds > 0.05
        and deriv(node_timex_offset_seconds[5m]) >= 0
      ) or (
        node_timex_offset_seconds < -0.05
        and deriv(node_timex_offset_seconds[5m]) <= 0
      )
    for: 2m
    labels:
      severity: warning

  - alert: HostClockNotSynchronising
    expr:
      min_over_time(node_timex_sync_status[1m]) == 0
      and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning

  - alert: HostRequiresReboot
    expr: node_reboot_required > 0
    for: 5m
    labels:
      severity: warning

  - alert: OutOfInodes
    expr:
      100 * node_filesystem_files_free
      / node_filesystem_files < 10
    for: 3m
    labels:
      severity: warning
    annotations:
      Mountpoint: !unsafe "{{ $labels.mountpoint }}"
      FreeInodes: !unsafe '{{ printf "%.0f" $value }} %'

  # Average non-idle CPU share per instance above 75 %.
  - alert: CpuUsage
    expr:
      100 * (
        1 - avg by (instance) (
          irate(node_cpu_seconds_total{mode="idle"}[5m])
        )
      ) > 75
    for: 10m
    labels:
      severity: warning
    annotations:
      Usage: !unsafe '{{ printf "%.0f" $value }} %'

  - alert: SystemdServiceFailed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 10m
    labels:
      severity: warning
    annotations:
      Service: !unsafe "{{ $labels.name }}"

  - alert: LoadUsage
    expr: node_load1 > 5
    for: 2m
    labels:
      severity: warning
    annotations:
      Load1: !unsafe '{{ printf "%.0f" $value }}'

  - alert: UnhealthyDisk
    expr: smartmon_device_smart_healthy < 1
    for: 10m
    labels:
      severity: critical
    annotations:
      Disk: !unsafe "{{ $labels.disk }}"

  # Average steal-time share per instance above 10 %.
  - alert: HostCpuStealNoisyNeighbor
    expr:
      100 * avg by (instance) (
        rate(node_cpu_seconds_total{mode="steal"}[5m])
      ) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      # `avg by (instance)` keeps only the instance label, so the previous
      # `Disk: {{ $labels.disk }}` annotation was always empty; report the
      # measured steal percentage instead.
      Steal: !unsafe '{{ printf "%.0f" $value }} %'
|
||||||
|
...
|
14
group_vars/prom/prometheus/prometheus.yml
Normal file
14
group_vars/prom/prometheus/prometheus.yml
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
---
|
||||||
|
# Scrape job for Prometheus itself: hosts of the `prom` inventory group on
# port 9090.
# NOTE(review): `port` is nested under `address` — confirm against the role.
prometheus__scraping_prometheus:
  targets: "{{ groups.prom }}"
  address:
    port: 9090
|
||||||
|
|
||||||
|
# Alerts about the Prometheus server itself.
prometheus__rules_prometheus:
  # Fires as soon as the failed-compaction counter increases within 1 minute.
  - alert: PrometheusTsdbCompactionFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
|
||||||
|
...
|
84
group_vars/prom/prometheus/quanta.yml
Normal file
84
group_vars/prom/prometheus/quanta.yml
Normal file
|
@ -0,0 +1,84 @@
|
||||||
|
---
|
||||||
|
prometheus__scraping_quanta:
|
||||||
|
targets: "{{ groups.quanta }}"
|
||||||
|
address: 127.0.0.1:9116
|
||||||
|
path: /snmp
|
||||||
|
params:
|
||||||
|
module:
|
||||||
|
- quanta
|
||||||
|
|
||||||
|
prometheus__rules_quanta:
|
||||||
|
- alert: QuantaQueueOverflow
|
||||||
|
expr:
|
||||||
|
snAgGblQueueOverflow == 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: QuantaCpuUsage
|
||||||
|
expr:
|
||||||
|
snAgGblCpuUtil1MinAvg > 50
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
Usage: !unsafe "{{ $value }} %"
|
||||||
|
- alert: QuantaCpuUsage
|
||||||
|
expr:
|
||||||
|
snAgGblCpuUtil1MinAvg > 80
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
Usage: !unsafe "{{ $value }} %"
|
||||||
|
- alert: QuantaMemoryUsage
|
||||||
|
expr:
|
||||||
|
100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
UsedMemory: !unsafe "{{ $value }} %"
|
||||||
|
- alert: QuantaMemoryUsage
|
||||||
|
expr:
|
||||||
|
100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
UsedMemory: !unsafe "{{ $value }} %"
|
||||||
|
- alert: QuantaFanHealth
|
||||||
|
expr:
|
||||||
|
snChasFanOperStatus{snChasFanOperStatus="normal"} == 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
Description: !unsafe "{{ $labels.snChasFanDescription }}"
|
||||||
|
Status: !unsafe "{{ $labels.snChasFanOperStatus }}"
|
||||||
|
- alert: QuantaTemp
|
||||||
|
expr:
|
||||||
|
0.5 * snAgentTempValue > 45
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
Temperature: !unsafe "{{ $value }} °C"
|
||||||
|
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
|
||||||
|
- alert: QuantaTemp
|
||||||
|
expr:
|
||||||
|
0.5 * snAgentTempValue > 60
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
Temperature: !unsafe "{{ $value }} °C"
|
||||||
|
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
|
||||||
|
- alert: QuantaPowerRedundancyFailure
|
||||||
|
expr:
|
||||||
|
count by (instance) (
|
||||||
|
snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"}
|
||||||
|
) < 2
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
...
|
6
group_vars/prom/prometheus/snmp.yml
Normal file
6
group_vars/prom/prometheus/snmp.yml
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
---
|
||||||
|
prometheus__scraping_snmp:
|
||||||
|
targets: "{{ groups.prom }}"
|
||||||
|
address:
|
||||||
|
port: 9116
|
||||||
|
...
|
47
group_vars/prom/prometheus/switch.yml
Normal file
47
group_vars/prom/prometheus/switch.yml
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
---
|
||||||
|
prometheus__rules_switch:
|
||||||
|
- alert: SwitchPromiscuousChange
|
||||||
|
expr:
|
||||||
|
changes(ifPromiscuousMode[5m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
Interface: !unsafe "{{ $labels.ifName }}
|
||||||
|
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
|
||||||
|
- alert: SwitchInterfaceUpChange
|
||||||
|
expr:
|
||||||
|
changes(ifOperStatus{ifOperStatus="up"}[5m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
Interface: !unsafe "{{ $labels.ifName }}
|
||||||
|
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
|
||||||
|
- alert: SwitchInErrors
|
||||||
|
expr:
|
||||||
|
100 * irate(ifInErrors[5m]) / (
|
||||||
|
irate(ifInUcastPkts[5m])
|
||||||
|
+ irate(ifInNUcastPkts[5m])
|
||||||
|
) > 0.01
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
|
||||||
|
Interface: !unsafe "{{ $labels.ifName }}
|
||||||
|
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
|
||||||
|
- alert: SwitchOutErrors
|
||||||
|
expr:
|
||||||
|
100 * irate(ifOutErrors[5m]) / (
|
||||||
|
irate(ifOutUcastPkts[5m])
|
||||||
|
+ irate(ifOutNUcastPkts[5m])
|
||||||
|
) > 0.01
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
|
||||||
|
Interface: !unsafe "{{ $labels.ifName }}
|
||||||
|
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
|
||||||
|
...
|
Binary file not shown.
|
@ -1,7 +1,7 @@
|
||||||
---
|
---
|
||||||
prometheus__alertmanager_targets: []
|
prometheus__alertmanager_targets: []
|
||||||
prometheus__scraping: {}
|
prometheus__scraping: {}
|
||||||
prometheus__alert_rules: {}
|
prometheus__rules: {}
|
||||||
prometheus__tsdb_retention_time: 15d
|
prometheus__tsdb_retention_time: 15d
|
||||||
prometheus__page_title: "{{ inventory_hostname }}"
|
prometheus__page_title: "{{ inventory_hostname }}"
|
||||||
...
|
...
|
||||||
|
|
|
@ -43,8 +43,8 @@
|
||||||
mode: u=rw,g=r,o=r
|
mode: u=rw,g=r,o=r
|
||||||
validate: "promtool check rules %s"
|
validate: "promtool check rules %s"
|
||||||
vars:
|
vars:
|
||||||
prometheus__rules:
|
prometheus__rules_config:
|
||||||
groups: "{{ prometheus__alert_rules
|
groups: "{{ prometheus__rules
|
||||||
| dict2items(key_name='name', value_name='rules') }}"
|
| dict2items(key_name='name', value_name='rules') }}"
|
||||||
notify:
|
notify:
|
||||||
- Reload prometheus
|
- Reload prometheus
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
---
|
---
|
||||||
{{ ansible_managed | comment }}
|
{{ ansible_managed | comment }}
|
||||||
|
|
||||||
{{ prometheus__rules | to_nice_yaml }}
|
{{ prometheus__rules_config | to_nice_yaml }}
|
||||||
...
|
...
|
||||||
|
|
Loading…
Reference in a new issue