prometheus: refactoring of the config

This commit is contained in:
jeltz 2023-11-02 20:27:45 +01:00
parent 2928d7e809
commit 16a0d95936
Signed by: jeltz
GPG key ID: 800882B66C0C3326
15 changed files with 556 additions and 530 deletions

View file

@ -1,526 +0,0 @@
---
prometheus__alertmanager_targets:
- docker-ovh.adm.auro.re:9093
prometheus__tsdb_retention_time: 90d
prometheus__scraping:
node:
targets: "{{ groups.vm_network + groups.pve_network }}"
address:
port: 9100
prometheus:
targets: "{{ groups.prom }}"
address:
port: 9090
kresd:
targets: "{{ groups.dns }}"
address:
port: 8453
bird:
targets: "{{ groups.router }}"
address:
port: 9324
quanta:
targets: "{{ groups.quanta }}"
address: 127.0.0.1:9116
path: /snmp
params:
module:
- quanta
snmp:
targets: "{{ groups.prom }}"
address:
port: 9116
prometheus__alert_rules_prometheus:
- alert: PrometheusTsdbCompactionFailed
expr:
increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
prometheus__alert_rules_common:
- alert: CollectorDown
expr: >
up == 0
for: 3m
labels:
severity: critical
annotations:
Job: !unsafe "{{ $labels.job }}"
prometheus__alert_rules_node:
- alert: OutOfMemory
expr:
100 * (
node_memory_MemFree_bytes
+ node_memory_Cached_bytes
+ node_memory_Buffers_bytes
) / node_memory_MemTotal_bytes < 10
for: 5m
labels:
severity: warning
annotations:
FreeMemory: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostSwapIsFillingUp
expr:
100 * (
1 - (
node_memory_SwapFree_bytes
/ node_memory_SwapTotal_bytes
)
) >= 50
for: 3m
labels:
severity: critical
annotations:
UsedSwap: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostPhysicalComponentTooHot
expr:
node_hwmon_temp_celsius > 79
for: 3m
labels:
severity: critical
annotations:
Temperature: !unsafe '{{ printf "%.0f" $value }} °C'
Chip: !unsafe "{{ $labels.chip }}"
Sensor: !unsafe "{{ $labels.sensor }}"
- alert: HostNodeOvertemperatureAlarm
expr:
node_hwmon_temp_crit_alarm_celsius == 1
for: 0m
labels:
severity: critical
annotations:
Chip: !unsafe "{{ $labels.chip }}"
Sensor: !unsafe "{{ $labels.sensor }}"
- alert: HostRaidArrayGotInactive
expr:
node_md_state{state="inactive"} > 0
for: 0m
labels:
severity: critical
annotations:
Device: !unsafe "{{ $labels.device }}"
# Fires as soon as node_md_disks reports at least one failed member disk.
- alert: HostRaidDiskFailure
  expr:
    node_md_disks{state="failed"} > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    # The key used to be `severity`, a copy-paste slip that hid what the
    # value is. The label is now `device`, as in HostRaidArrayGotInactive.
    # NOTE(review): confirm the exporter in use exposes `device` on
    # node_md_disks (upstream node_exporter does).
    Device: !unsafe "{{ $labels.device }}"
# Fires when the node_vmstat_oom_kill counter increased during the last
# minute.
- alert: HostOomKillDetected
  expr:
    increase(node_vmstat_oom_kill[1m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    # $value is the counter increase, not a process id, so the annotation
    # is named after what it actually carries (it used to be `PID`).
    OomKills: !unsafe "{{ $value }}"
- alert: HostEdacCorrectableErrorsDetected
expr:
increase(node_edac_correctable_errors_total[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
CorrectedErrors: !unsafe "{{ $value }}"
- alert: HostEdacUncorrectableErrorsDetected
expr:
increase(node_edac_uncorrectable_errors_total[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
DetectedErrors: !unsafe "{{ $value }}"
- alert: OutOfDiskSpace
expr:
(
100 * node_filesystem_free_bytes
/ node_filesystem_size_bytes < 10
)
and on (instance, device, mountpoint) (
node_filesystem_readonly
) == 0
for: 5m
labels:
severity: critical
annotations:
Mountpoint: !unsafe '{{ $labels.mountpoint }}'
FreeSpace: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostConntrackLimit
expr:
100 * (
node_nf_conntrack_entries
/ node_nf_conntrack_entries_limit
) > 80
for: 5m
labels:
severity: warning
annotations:
Filled: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostClockSkew
expr:
(
node_timex_offset_seconds > 0.05
and deriv(node_timex_offset_seconds[5m]) >= 0
) or (
node_timex_offset_seconds < -0.05
and deriv(node_timex_offset_seconds[5m]) <= 0
)
for: 2m
labels:
severity: warning
- alert: HostClockNotSynchronising
expr:
min_over_time(node_timex_sync_status[1m]) == 0
and node_timex_maxerror_seconds >= 16
for: 2m
labels:
severity: warning
- alert: HostRequiresReboot
expr:
node_reboot_required > 0
for: 5m
labels:
severity: warning
- alert: OutOfInodes
expr:
100 * node_filesystem_files_free
/ node_filesystem_files < 10
for: 3m
labels:
severity: warning
annotations:
Mountpoint: !unsafe "{{ $labels.mountpoint }}"
FreeInodes: !unsafe '{{ printf "%.0f" $value }} %'
# Fires when an instance has been more than 75 % busy for 10 minutes, where
# "busy" is 100 minus the idle share averaged over the instance's series.
# rate() replaces irate(): irate only looks at the last two samples, so a
# single calm scrape would reset the 10-minute `for` timer.
- alert: CpuUsage
  expr:
    100 * (
      1 - avg by (instance) (
        rate(node_cpu_seconds_total{mode="idle"}[5m])
      )
    ) > 75
  for: 10m
  labels:
    severity: warning
  annotations:
    Usage: !unsafe '{{ printf "%.0f" $value }} %'
- alert: SystemdServiceFailed
expr:
node_systemd_unit_state{state="failed"} == 1
for: 10m
labels:
severity: warning
annotations:
Service: !unsafe "{{ $labels.name }}"
- alert: LoadUsage
expr:
node_load1 > 5
for: 2m
labels:
severity: warning
annotations:
Load1: !unsafe '{{ printf "%.0f" $value }}'
- alert: UnhealthyDisk
expr:
smartmon_device_smart_healthy < 1
for: 10m
labels:
severity: critical
annotations:
Disk: !unsafe "{{ $labels.disk }}"
# Fires when, averaged per instance, more than 10 % of CPU time was spent
# in `steal` mode over the last 5 minutes, for 5 minutes in a row.
- alert: HostCpuStealNoisyNeighbor
  expr:
    100 * avg by (instance) (
      rate(node_cpu_seconds_total{mode="steal"}[5m])
    ) > 10
  for: 5m
  labels:
    severity: warning
  annotations:
    # The aggregation keeps only `instance`, so the former `Disk`
    # annotation (built from $labels.disk) always rendered empty. Report
    # the measured percentage instead, like CpuUsage does.
    Steal: !unsafe '{{ printf "%.0f" $value }} %'
prometheus__alert_rules_keepalived:
- alert: KeepalivedVrrpFault
expr:
keepalived_vrrp_state{state="fault"} > 0
for: 0m
labels:
severity: critical
annotations:
Instance: !unsafe "{{ $labels.instance }}"
# Fires when the selected keepalived_vrrp_state series changed value more
# than once within the last minute.
# The matcher used the misspelt label `keepalived_vvrp_state`; it now uses
# `state`, the label KeepalivedVrrpFault already matches on.
# NOTE(review): confirm the label name against the exporter's output.
- alert: KeepalivedMasterChange
  expr:
    changes(
      keepalived_vrrp_state{
        state="master"
      }[1m]
    ) > 1
  for: 0m
  labels:
    severity: warning
  annotations:
    Instance: !unsafe "{{ $labels.instance }}"
prometheus__alert_rules_bird:
- record: bird:protocol_up:bgp
expr:
label_replace(
bird_protocol_up{proto="BGP",}
unless bird_protocol_up{
proto="BGP",
name=~"^(viarezo|isp[12]|rezel)[46]$"
},
"group", "$1",
"instance", "^([^0-9\\.]+)-[0-9]+.*"
)
# Sessions that are deliberately not made redundant within a group:
# group "edge", names oti/crans/legacy/edge followed by 4 or 6.
- record: bird:protocol_up:bgp:non_redundant
expr:
bird:protocol_up:bgp{
group="edge",
name=~"^(oti|crans|legacy|edge)[46]$"
}
# Sessions that are redundant: every bird:protocol_up:bgp series that has
# no counterpart in bird:protocol_up:bgp:non_redundant.
- record: bird:protocol_up:bgp:redundant
expr:
bird:protocol_up:bgp
unless
bird:protocol_up:bgp:non_redundant
- alert: BirdBGPRedundancyDegraded
expr:
(
count by (group, name) (
bird:protocol_up:bgp:redundant{state="Established"}
) or (
count by (group, name) (
bird:protocol_up:bgp:redundant{state!="Established"}
) * 0
)
) < 2
for: 0m
labels:
severity: warning
annotations:
Session: !unsafe "{{ $labels.name }}"
Count: !unsafe "{{ $value }}"
Group: !unsafe "{{ $labels.group }}"
- alert: BirdBGPDown
expr:
(
count by (group, name) (
bird:protocol_up:bgp{state="Established"}
) or (
count by (group, name) (
bird:protocol_up:bgp{state!="Established"}
) * 0
)
) == 0
for: 0m
labels:
severity: critical
annotations:
Session: !unsafe "{{ $labels.name }}"
Group: !unsafe "{{ $labels.group }}"
- alert: BirdBGPNoExportedPrefixRedundant
expr:
bird_protocol_prefix_export_count{
export_filter!="REJECT",
} * on (instance, name) (
bird:protocol_up:bgp:redundant{state="Established"}
) == 0
for: 0m
labels:
severity: critical
annotations:
Session: !unsafe "{{ $labels.name }}"
# Fires when an established redundant BGP session whose import filter is
# not REJECT imports zero prefixes.
# `annotations:` used to be left bare (null); it now names the session,
# like BirdBGPNoExportedPrefixRedundant.
- alert: BirdBGPNoImportedPrefixRedundant
  expr:
    bird_protocol_prefix_import_count{
      import_filter!="REJECT",
    } * on (instance, name) (
      bird:protocol_up:bgp:redundant{state="Established"}
    ) == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    Session: !unsafe "{{ $labels.name }}"
- alert: BirdBGPNoExportedPrefixNonRedundant
expr:
sum by (group) (
bird_protocol_prefix_export_count{
export_filter!="REJECT",
} * on (instance, name) group_left (group) (
bird:protocol_up:bgp:non_redundant{state="Established"}
)
) == 0
for: 0m
labels:
severity: critical
annotations:
Session: !unsafe "{{ $labels.name }}"
- alert: BirdBGPNoImportedPrefixNonRedundant
expr:
sum by (group) (
bird_protocol_prefix_import_count{
import_filter!="REJECT",
} * on (instance, name) group_left (group) (
bird:protocol_up:bgp:non_redundant{state="Established"}
)
) == 0
for: 0m
labels:
severity: critical
annotations:
Session: !unsafe "{{ $labels.name }}"
- alert: BirdOSPFNeighboursChange
expr:
changes(bird_ospf_neighbor_count[5m]) > 0
or changes(bird_ospfv3_neighbor_count[5m]) > 0
for: 0m
labels:
severity: warning
- alert: BirdOSPFDown
expr:
bird_ospf_running == 0
for: 0m
labels:
severity: critical
annotations:
Instance: !unsafe "{{ $labels.name }}"
prometheus__alert_rules_quanta:
- alert: QuantaQueueOverflow
expr:
snAgGblQueueOverflow == 1
for: 0m
labels:
severity: critical
- alert: QuantaCpuUsage
expr:
snAgGblCpuUtil1MinAvg > 50
for: 5m
labels:
severity: warning
annotations:
Usage: !unsafe "{{ $value }} %"
- alert: QuantaCpuUsage
expr:
snAgGblCpuUtil1MinAvg > 80
for: 5m
labels:
severity: critical
annotations:
Usage: !unsafe "{{ $value }} %"
- alert: QuantaMemoryUsage
expr:
100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50
for: 5m
labels:
severity: warning
annotations:
UsedMemory: !unsafe "{{ $value }} %"
# Upper tier of QuantaMemoryUsage: more than 80 % of dynamic memory in use
# for 5 minutes.
# `severity: alert` was the only value other than warning/critical in
# these rules; it is aligned with the 80 % tier of QuantaCpuUsage.
- alert: QuantaMemoryUsage
  expr:
    100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80
  for: 5m
  labels:
    severity: critical
  annotations:
    UsedMemory: !unsafe "{{ $value }} %"
# Fires as soon as a fan's "normal" status series reads 0.
- alert: QuantaFanHealth
  expr:
    snChasFanOperStatus{snChasFanOperStatus="normal"} == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    # Label name fixed from `shChasFanDescription` (typo) to the MIB
    # object name snChasFanDescription.
    # NOTE(review): confirm the SNMP exporter module attaches this label.
    Description: !unsafe "{{ $labels.snChasFanDescription }}"
    Status: !unsafe "{{ $labels.snChasFanOperStatus }}"
- alert: QuantaTemp
expr:
0.5 * snAgentTempValue > 45
for: 0m
labels:
severity: warning
annotations:
Temperature: !unsafe "{{ $value }} °C"
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
- alert: QuantaTemp
expr:
0.5 * snAgentTempValue > 60
for: 0m
labels:
severity: critical
annotations:
Temperature: !unsafe "{{ $value }} °C"
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
- alert: QuantaPowerRedundancyFailure
expr:
count by (instance) (
snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"}
) < 2
for: 0m
labels:
severity: warning
prometheus__alert_rules_switch:
- alert: SwitchPromiscuousChange
expr:
changes(ifPromiscuousMode[5m]) > 0
for: 0m
labels:
severity: warning
annotations:
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
- alert: SwitchInterfaceUpChange
expr:
changes(ifOperStatus{ifOperStatus="up"}[5m]) > 0
for: 0m
labels:
severity: warning
annotations:
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
- alert: SwitchInErrors
expr:
100 * irate(ifInErrors[5m]) / (
irate(ifInUcastPkts[5m])
+ irate(ifInNUcastPkts[5m])
) > 0.01
for: 0m
labels:
severity: warning
annotations:
ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
- alert: SwitchOutErrors
expr:
100 * irate(ifOutErrors[5m]) / (
irate(ifOutUcastPkts[5m])
+ irate(ifOutNUcastPkts[5m])
) > 0.01
for: 0m
labels:
severity: warning
annotations:
ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
prometheus__alert_rules:
common: "{{ prometheus__alert_rules_common }}"
switch: "{{ prometheus__alert_rules_switch }}"
prometheus: "{{ prometheus__alert_rules_prometheus }}"
node: "{{ prometheus__alert_rules_node }}"
keepalived: "{{ prometheus__alert_rules_keepalived }}"
quanta: "{{ prometheus__alert_rules_quanta }}"
bird: "{{ prometheus__alert_rules_bird }}"
...

View file

@ -0,0 +1,139 @@
---
# Scrape job for every host of the `router` inventory group, on port 9324.
prometheus__scraping_bird:
  targets: "{{ groups.router }}"
  address: {port: 9324}
prometheus__rules_bird:
# BGP sessions reported by bird_protocol_up, minus those whose name matches
# ^(viarezo|isp[12]|rezel)[46]$. label_replace then adds a `group` label
# holding the part of `instance` captured before "-<digits>".
- record: bird:protocol_up:bgp
expr:
label_replace(
bird_protocol_up{proto="BGP",}
unless bird_protocol_up{
proto="BGP",
name=~"^(viarezo|isp[12]|rezel)[46]$"
},
"group", "$1",
"instance", "^([^0-9\\.]+)-[0-9]+.*"
)
# Sessions that are deliberately not made redundant within a group:
# group "edge", names oti/crans/legacy/edge followed by 4 or 6.
- record: bird:protocol_up:bgp:non_redundant
expr:
bird:protocol_up:bgp{
group="edge",
name=~"^(oti|crans|legacy|edge)[46]$"
}
# Sessions that are redundant: every bird:protocol_up:bgp series that has
# no counterpart in bird:protocol_up:bgp:non_redundant.
- record: bird:protocol_up:bgp:redundant
expr:
bird:protocol_up:bgp
unless
bird:protocol_up:bgp:non_redundant
- alert: BirdBGPRedundancyDegraded
expr:
(
count by (group, name) (
bird:protocol_up:bgp:redundant{state="Established"}
) or (
count by (group, name) (
bird:protocol_up:bgp:redundant{state!="Established"}
) * 0
)
) < 2
for: 0m
labels:
severity: warning
annotations:
Session: !unsafe "{{ $labels.name }}"
Count: !unsafe "{{ $value }}"
Group: !unsafe "{{ $labels.group }}"
- alert: BirdBGPDown
expr:
(
count by (group, name) (
bird:protocol_up:bgp{state="Established"}
) or (
count by (group, name) (
bird:protocol_up:bgp{state!="Established"}
) * 0
)
) == 0
for: 0m
labels:
severity: critical
annotations:
Session: !unsafe "{{ $labels.name }}"
Group: !unsafe "{{ $labels.group }}"
- alert: BirdBGPNoExportedPrefixRedundant
expr:
bird_protocol_prefix_export_count{
export_filter!="REJECT",
} * on (instance, name) group_left (group) (
bird:protocol_up:bgp:redundant{state="Established"}
) == 0
for: 0m
labels:
severity: critical
annotations:
Session: !unsafe "{{ $labels.name }}"
Group: !unsafe "{{ $labels.group }}"
- alert: BirdBGPNoImportedPrefixRedundant
expr:
bird_protocol_prefix_import_count{
import_filter!="REJECT",
} * on (instance, name) group_left (group) (
bird:protocol_up:bgp:redundant{state="Established"}
) == 0
for: 0m
labels:
severity: critical
annotations:
Session: !unsafe "{{ $labels.name }}"
Group: !unsafe "{{ $labels.group }}"
- alert: BirdBGPNoExportedPrefixNonRedundant
expr:
sum by (group) (
bird_protocol_prefix_export_count{
export_filter!="REJECT",
} * on (instance, name) group_left (group) (
bird:protocol_up:bgp:non_redundant{state="Established"}
)
) == 0
for: 0m
labels:
severity: critical
annotations:
Session: !unsafe "{{ $labels.name }}"
Group: !unsafe "{{ $labels.group }}"
- alert: BirdBGPNoImportedPrefixNonRedundant
expr:
sum by (group) (
bird_protocol_prefix_import_count{
import_filter!="REJECT",
} * on (instance, name) group_left (group) (
bird:protocol_up:bgp:non_redundant{state="Established"}
)
) == 0
for: 0m
labels:
severity: critical
annotations:
Session: !unsafe "{{ $labels.name }}"
Group: !unsafe "{{ $labels.group }}"
- alert: BirdOSPFNeighboursChange
expr:
changes(bird_ospf_neighbor_count[5m]) > 0
or changes(bird_ospfv3_neighbor_count[5m]) > 0
for: 0m
labels:
severity: warning
- alert: BirdOSPFDown
expr:
bird_ospf_running == 0
for: 0m
labels:
severity: critical
annotations:
Instance: !unsafe "{{ $labels.name }}"
...

View file

@ -0,0 +1,11 @@
---
# Alerting rules of the `common` rule group.
prometheus__rules_common:
  # A target whose `up` series has read 0 for three minutes is reported,
  # together with the job it belongs to.
  - alert: CollectorDown
    expr: >
      up == 0
    for: 3m
    labels: {severity: critical}
    annotations:
      Job: !unsafe "{{ $labels.job }}"
...

View file

@ -0,0 +1,23 @@
---
prometheus__rules_keepalived:
- alert: KeepalivedVrrpFault
expr:
keepalived_vrrp_state{state="fault"} > 0
for: 0m
labels:
severity: critical
annotations:
Instance: !unsafe "{{ $labels.instance }}"
# Fires when the selected keepalived_vrrp_state series changed value more
# than once within the last minute.
# The matcher used the misspelt label `keepalived_vvrp_state`; it now uses
# `state`, the label KeepalivedVrrpFault already matches on.
# NOTE(review): confirm the label name against the exporter's output.
- alert: KeepalivedMasterChange
  expr:
    changes(
      keepalived_vrrp_state{
        state="master"
      }[1m]
    ) > 1
  for: 0m
  labels:
    severity: warning
  annotations:
    Instance: !unsafe "{{ $labels.instance }}"
...

View file

@ -0,0 +1,6 @@
---
# Scrape job for every host of the `dns` inventory group, on port 8453.
prometheus__scraping_kresd:
  targets: "{{ groups.dns }}"
  address: {port: 8453}
...

View file

@ -0,0 +1,23 @@
---
# Alertmanager endpoints, given as host:port.
prometheus__alertmanager_targets:
  - "docker-ovh.adm.auro.re:9093"

# How long samples are kept in the TSDB.
prometheus__tsdb_retention_time: "90d"

# Scrape jobs. Each value is defined in its own prometheus__scraping_*
# variable.
# NOTE(review): keys are presumably used as job names - confirm in the role.
prometheus__scraping:
  node: "{{ prometheus__scraping_node }}"
  prometheus: "{{ prometheus__scraping_prometheus }}"
  kresd: "{{ prometheus__scraping_kresd }}"
  bird: "{{ prometheus__scraping_bird }}"
  quanta: "{{ prometheus__scraping_quanta }}"
  snmp: "{{ prometheus__scraping_snmp }}"

# Rule groups: the key is the group name, the value its list of rules.
# Each list is defined in its own prometheus__rules_* variable.
prometheus__rules:
  common: "{{ prometheus__rules_common }}"
  switch: "{{ prometheus__rules_switch }}"
  prometheus: "{{ prometheus__rules_prometheus }}"
  node: "{{ prometheus__rules_node }}"
  keepalived: "{{ prometheus__rules_keepalived }}"
  quanta: "{{ prometheus__rules_quanta }}"
  bird: "{{ prometheus__rules_bird }}"
...

View file

@ -0,0 +1,199 @@
---
# Scrape job for every host of the `vm_network` and `pve_network`
# inventory groups, on port 9100.
prometheus__scraping_node:
  targets: "{{ groups.vm_network + groups.pve_network }}"
  address: {port: 9100}
prometheus__rules_node:
- alert: OutOfMemory
expr:
100 * (
node_memory_MemFree_bytes
+ node_memory_Cached_bytes
+ node_memory_Buffers_bytes
) / node_memory_MemTotal_bytes < 10
for: 5m
labels:
severity: warning
annotations:
FreeMemory: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostSwapIsFillingUp
expr:
100 * (
1 - (
node_memory_SwapFree_bytes
/ node_memory_SwapTotal_bytes
)
) >= 50
for: 3m
labels:
severity: critical
annotations:
UsedSwap: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostPhysicalComponentTooHot
expr:
node_hwmon_temp_celsius > 79
for: 3m
labels:
severity: critical
annotations:
Temperature: !unsafe '{{ printf "%.0f" $value }} °C'
Chip: !unsafe "{{ $labels.chip }}"
Sensor: !unsafe "{{ $labels.sensor }}"
- alert: HostNodeOvertemperatureAlarm
expr:
node_hwmon_temp_crit_alarm_celsius == 1
for: 0m
labels:
severity: critical
annotations:
Chip: !unsafe "{{ $labels.chip }}"
Sensor: !unsafe "{{ $labels.sensor }}"
- alert: HostRaidArrayGotInactive
expr:
node_md_state{state="inactive"} > 0
for: 0m
labels:
severity: critical
annotations:
Device: !unsafe "{{ $labels.device }}"
# Fires as soon as node_md_disks reports at least one failed member disk.
- alert: HostRaidDiskFailure
  expr:
    node_md_disks{state="failed"} > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    # The key used to be `severity`, a copy-paste slip that hid what the
    # value is. The label is now `device`, as in HostRaidArrayGotInactive.
    # NOTE(review): confirm the exporter in use exposes `device` on
    # node_md_disks (upstream node_exporter does).
    Device: !unsafe "{{ $labels.device }}"
# Fires when the node_vmstat_oom_kill counter increased during the last
# minute.
- alert: HostOomKillDetected
  expr:
    increase(node_vmstat_oom_kill[1m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    # $value is the counter increase, not a process id, so the annotation
    # is named after what it actually carries (it used to be `PID`).
    OomKills: !unsafe "{{ $value }}"
- alert: HostEdacCorrectableErrorsDetected
expr:
increase(node_edac_correctable_errors_total[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
CorrectedErrors: !unsafe "{{ $value }}"
- alert: HostEdacUncorrectableErrorsDetected
expr:
increase(node_edac_uncorrectable_errors_total[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
DetectedErrors: !unsafe "{{ $value }}"
- alert: OutOfDiskSpace
expr:
(
100 * node_filesystem_free_bytes
/ node_filesystem_size_bytes < 10
)
and on (instance, device, mountpoint) (
node_filesystem_readonly
) == 0
for: 5m
labels:
severity: critical
annotations:
Mountpoint: !unsafe '{{ $labels.mountpoint }}'
FreeSpace: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostConntrackLimit
expr:
100 * (
node_nf_conntrack_entries
/ node_nf_conntrack_entries_limit
) > 80
for: 5m
labels:
severity: warning
annotations:
Filled: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostClockSkew
expr:
(
node_timex_offset_seconds > 0.05
and deriv(node_timex_offset_seconds[5m]) >= 0
) or (
node_timex_offset_seconds < -0.05
and deriv(node_timex_offset_seconds[5m]) <= 0
)
for: 2m
labels:
severity: warning
- alert: HostClockNotSynchronising
expr:
min_over_time(node_timex_sync_status[1m]) == 0
and node_timex_maxerror_seconds >= 16
for: 2m
labels:
severity: warning
- alert: HostRequiresReboot
expr:
node_reboot_required > 0
for: 5m
labels:
severity: warning
- alert: OutOfInodes
expr:
100 * node_filesystem_files_free
/ node_filesystem_files < 10
for: 3m
labels:
severity: warning
annotations:
Mountpoint: !unsafe "{{ $labels.mountpoint }}"
FreeInodes: !unsafe '{{ printf "%.0f" $value }} %'
# Fires when an instance has been more than 75 % busy for 10 minutes, where
# "busy" is 100 minus the idle share averaged over the instance's series.
# rate() replaces irate(): irate only looks at the last two samples, so a
# single calm scrape would reset the 10-minute `for` timer.
- alert: CpuUsage
  expr:
    100 * (
      1 - avg by (instance) (
        rate(node_cpu_seconds_total{mode="idle"}[5m])
      )
    ) > 75
  for: 10m
  labels:
    severity: warning
  annotations:
    Usage: !unsafe '{{ printf "%.0f" $value }} %'
- alert: SystemdServiceFailed
expr:
node_systemd_unit_state{state="failed"} == 1
for: 10m
labels:
severity: warning
annotations:
Service: !unsafe "{{ $labels.name }}"
- alert: LoadUsage
expr:
node_load1 > 5
for: 2m
labels:
severity: warning
annotations:
Load1: !unsafe '{{ printf "%.0f" $value }}'
- alert: UnhealthyDisk
expr:
smartmon_device_smart_healthy < 1
for: 10m
labels:
severity: critical
annotations:
Disk: !unsafe "{{ $labels.disk }}"
# Fires when, averaged per instance, more than 10 % of CPU time was spent
# in `steal` mode over the last 5 minutes, for 5 minutes in a row.
- alert: HostCpuStealNoisyNeighbor
  expr:
    100 * avg by (instance) (
      rate(node_cpu_seconds_total{mode="steal"}[5m])
    ) > 10
  for: 5m
  labels:
    severity: warning
  annotations:
    # The aggregation keeps only `instance`, so the former `Disk`
    # annotation (built from $labels.disk) always rendered empty. Report
    # the measured percentage instead, like CpuUsage does.
    Steal: !unsafe '{{ printf "%.0f" $value }} %'
...

View file

@ -0,0 +1,14 @@
---
# Scrape job for every host of the `prom` inventory group, on port 9090.
prometheus__scraping_prometheus:
  targets: "{{ groups.prom }}"
  address: {port: 9090}
# Alerting rules of the `prometheus` rule group.
prometheus__rules_prometheus:
  # Fires as soon as the failed-compaction counter increased over the last
  # minute.
  - alert: PrometheusTsdbCompactionFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
    for: 0m
    labels: {severity: critical}
...

View file

@ -0,0 +1,84 @@
---
# Scrape job whose targets are the hosts of the `quanta` inventory group.
# Unlike the other jobs it sets a fixed address (127.0.0.1:9116), the
# /snmp path and the `module=quanta` query parameter.
# NOTE(review): presumably the local SNMP exporter is queried on behalf of
# each switch - confirm against the role's scrape template.
prometheus__scraping_quanta:
  targets: "{{ groups.quanta }}"
  address: "127.0.0.1:9116"
  path: /snmp
  params:
    module: [quanta]
prometheus__rules_quanta:
- alert: QuantaQueueOverflow
expr:
snAgGblQueueOverflow == 1
for: 0m
labels:
severity: critical
- alert: QuantaCpuUsage
expr:
snAgGblCpuUtil1MinAvg > 50
for: 5m
labels:
severity: warning
annotations:
Usage: !unsafe "{{ $value }} %"
- alert: QuantaCpuUsage
expr:
snAgGblCpuUtil1MinAvg > 80
for: 5m
labels:
severity: critical
annotations:
Usage: !unsafe "{{ $value }} %"
- alert: QuantaMemoryUsage
expr:
100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50
for: 5m
labels:
severity: warning
annotations:
UsedMemory: !unsafe "{{ $value }} %"
# Upper tier of QuantaMemoryUsage: more than 80 % of dynamic memory in use
# for 5 minutes.
# `severity: alert` was the only value other than warning/critical in
# these rules; it is aligned with the 80 % tier of QuantaCpuUsage.
- alert: QuantaMemoryUsage
  expr:
    100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80
  for: 5m
  labels:
    severity: critical
  annotations:
    UsedMemory: !unsafe "{{ $value }} %"
# Fires as soon as a fan's "normal" status series reads 0.
- alert: QuantaFanHealth
  expr:
    snChasFanOperStatus{snChasFanOperStatus="normal"} == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    # Label name fixed from `shChasFanDescription` (typo) to the MIB
    # object name snChasFanDescription.
    # NOTE(review): confirm the SNMP exporter module attaches this label.
    Description: !unsafe "{{ $labels.snChasFanDescription }}"
    Status: !unsafe "{{ $labels.snChasFanOperStatus }}"
- alert: QuantaTemp
expr:
0.5 * snAgentTempValue > 45
for: 0m
labels:
severity: warning
annotations:
Temperature: !unsafe "{{ $value }} °C"
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
- alert: QuantaTemp
expr:
0.5 * snAgentTempValue > 60
for: 0m
labels:
severity: critical
annotations:
Temperature: !unsafe "{{ $value }} °C"
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
- alert: QuantaPowerRedundancyFailure
expr:
count by (instance) (
snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"}
) < 2
for: 0m
labels:
severity: warning
...

View file

@ -0,0 +1,6 @@
---
# Scrape job for every host of the `prom` inventory group, on port 9116.
prometheus__scraping_snmp:
  targets: "{{ groups.prom }}"
  address: {port: 9116}
...

View file

@ -0,0 +1,47 @@
---
# Alerting rules of the `switch` rule group. They read per-interface
# metrics and describe the interface with its ifName label, followed by
# "- <ifAlias>" when the ifAlias label is non-empty.
prometheus__rules_switch:
# Fires when ifPromiscuousMode changed value within the last 5 minutes.
- alert: SwitchPromiscuousChange
expr:
changes(ifPromiscuousMode[5m]) > 0
for: 0m
labels:
severity: warning
annotations:
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
# Fires when the "up" series of ifOperStatus changed value within the last
# 5 minutes.
- alert: SwitchInterfaceUpChange
expr:
changes(ifOperStatus{ifOperStatus="up"}[5m]) > 0
for: 0m
labels:
severity: warning
annotations:
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
# Inbound errors as a percentage of inbound unicast + non-unicast packets;
# fires above 0.01 %.
- alert: SwitchInErrors
expr:
100 * irate(ifInErrors[5m]) / (
irate(ifInUcastPkts[5m])
+ irate(ifInNUcastPkts[5m])
) > 0.01
for: 0m
labels:
severity: warning
annotations:
ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
# Outbound errors as a percentage of outbound unicast + non-unicast
# packets; fires above 0.01 %.
- alert: SwitchOutErrors
expr:
100 * irate(ifOutErrors[5m]) / (
irate(ifOutUcastPkts[5m])
+ irate(ifOutNUcastPkts[5m])
) > 0.01
for: 0m
labels:
severity: warning
annotations:
ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
...

View file

@ -1,7 +1,7 @@
---
prometheus__alertmanager_targets: []
prometheus__scraping: {}
prometheus__alert_rules: {}
prometheus__rules: {}
prometheus__tsdb_retention_time: 15d
prometheus__page_title: "{{ inventory_hostname }}"
...

View file

@ -43,8 +43,8 @@
mode: u=rw,g=r,o=r
validate: "promtool check rules %s"
vars:
prometheus__rules:
groups: "{{ prometheus__alert_rules
prometheus__rules_config:
groups: "{{ prometheus__rules
| dict2items(key_name='name', value_name='rules') }}"
notify:
- Reload prometheus

View file

@ -1,5 +1,5 @@
---
{{ ansible_managed | comment }}
{{ prometheus__rules | to_nice_yaml }}
{{ prometheus__rules_config | to_nice_yaml }}
...