Compare commits

...

2 Commits

@ -7,29 +7,29 @@ prometheus__scraping_node:
prometheus__rules_node:
# OutOfMemory: (MemFree + Cached + Buffers) / MemTotal below 0.1 for 5m.
- alert: OutOfMemory
expr:
100 * (
(
node_memory_MemFree_bytes
+ node_memory_Cached_bytes
+ node_memory_Buffers_bytes
) / node_memory_MemTotal_bytes < 10
) / node_memory_MemTotal_bytes < 0.1
for: 5m
labels:
severity: warning
annotations:
FreeMemory: !unsafe '{{ printf "%.0f" $value }} %'
# humanizePercentage already renders the "%" sign; do not append another.
FreeMemory: !unsafe "{{ $value | humanizePercentage }}"
# HostSwapIsFillingUp: 1 - SwapFree / SwapTotal at or above 0.5 for 3m.
- alert: HostSwapIsFillingUp
expr:
100 * (
(
1 - (
node_memory_SwapFree_bytes
/ node_memory_SwapTotal_bytes
)
) >= 50
) >= 0.5
for: 3m
labels:
severity: critical
annotations:
UsedSwap: !unsafe '{{ printf "%.0f" $value }} %'
# humanizePercentage already renders the "%" sign; do not append another.
UsedSwap: !unsafe "{{ $value | humanizePercentage }}"
- alert: HostPhysicalComponentTooHot
expr:
node_hwmon_temp_celsius > 79
@ -37,7 +37,7 @@ prometheus__rules_node:
labels:
severity: critical
annotations:
Temperature: !unsafe '{{ printf "%.0f" $value }} °C'
Temperature: !unsafe "{{ $value | humanize }} °C"
Chip: !unsafe "{{ $labels.chip }}"
Sensor: !unsafe "{{ $labels.sensor }}"
- alert: HostNodeOvertemperatureAlarm
@ -92,8 +92,8 @@ prometheus__rules_node:
- alert: OutOfDiskSpace
expr:
(
100 * node_filesystem_free_bytes
/ node_filesystem_size_bytes < 10
node_filesystem_free_bytes
/ node_filesystem_size_bytes < 0.1
)
and on (instance, device, mountpoint) (
node_filesystem_readonly
@ -102,19 +102,19 @@ prometheus__rules_node:
labels:
severity: critical
annotations:
Mountpoint: !unsafe '{{ $labels.mountpoint }}'
FreeSpace: !unsafe '{{ printf "%.0f" $value }} %'
Mountpoint: !unsafe "{{ $labels.mountpoint }}"
# humanizePercentage already renders the "%" sign; do not append another.
FreeSpace: !unsafe "{{ $value | humanizePercentage }}"
# HostConntrackLimit: conntrack entries / entries_limit above 0.8 for 5m.
- alert: HostConntrackLimit
expr:
100 * (
(
node_nf_conntrack_entries
/ node_nf_conntrack_entries_limit
) > 80
) > 0.8
for: 5m
labels:
severity: warning
annotations:
Filled: !unsafe '{{ printf "%.0f" $value }} %'
# humanizePercentage already renders the "%" sign; do not append another.
Filled: !unsafe "{{ $value | humanizePercentage }}"
- alert: HostClockSkew
expr:
(
@ -142,26 +142,26 @@ prometheus__rules_node:
severity: warning
# OutOfInodes: files_free / files below 0.1 for 3m, per filesystem series.
- alert: OutOfInodes
expr:
100 * node_filesystem_files_free
/ node_filesystem_files < 10
node_filesystem_files_free
/ node_filesystem_files < 0.1
for: 3m
labels:
severity: warning
annotations:
Mountpoint: !unsafe "{{ $labels.mountpoint }}"
FreeInodes: !unsafe '{{ printf "%.0f" $value }} %'
# humanizePercentage already renders the "%" sign; do not append another.
FreeInodes: !unsafe "{{ $value | humanizePercentage }}"
# CpuUsage: 1 - per-instance average idle irate above 0.75 for 10m.
- alert: CpuUsage
expr:
100 * (
(
1 - avg by (instance) (
irate(node_cpu_seconds_total{mode="idle"}[5m])
)
) > 75
) > 0.75
for: 10m
labels:
severity: warning
annotations:
Usage: !unsafe '{{ printf "%.0f" $value }} %'
# humanizePercentage already renders the "%" sign; do not append another.
Usage: !unsafe "{{ $value | humanizePercentage }}"
- alert: SystemdServiceFailed
expr:
node_systemd_unit_state{state="failed"} == 1
@ -177,7 +177,7 @@ prometheus__rules_node:
labels:
severity: warning
annotations:
Load1: !unsafe '{{ printf "%.0f" $value }}'
Load1: !unsafe "{{ $value | humanize }}"
- alert: UnhealthyDisk
expr:
smartmon_device_smart_healthy < 1
@ -188,12 +188,13 @@ prometheus__rules_node:
Disk: !unsafe "{{ $labels.disk }}"
# HostCpuStealNoisyNeighbor: per-instance average steal rate above 0.1 for 5m.
- alert: HostCpuStealNoisyNeighbor
expr:
100 * avg by (instance) (
avg by (instance) (
rate(node_cpu_seconds_total{mode="steal"}[5m])
) > 10
) > 0.1
for: 5m
labels:
severity: warning
annotations:
Disk: !unsafe "{{ $labels.disk }}"
# humanizePercentage already renders the "%" sign; do not append another.
Steal: !unsafe "{{ $value | humanizePercentage }}"
...

@ -20,72 +20,72 @@ prometheus__rules_switch:
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
# SwitchInErrors: inbound error rate over inbound (unicast + non-unicast)
# packet rate above 0.0001; `for: 0m` means no pending period.
- alert: SwitchInErrors
expr:
100 * irate(ifInErrors[5m]) / (
irate(ifInErrors[5m]) / (
irate(ifInUcastPkts[5m])
+ irate(ifInNUcastPkts[5m])
) > 0.01
) > 0.0001
for: 0m
labels:
severity: warning
annotations:
ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
# humanizePercentage already renders the "%" sign; do not append another.
ErrorRate: !unsafe "{{ $value | humanizePercentage }}"
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
# SwitchOutErrors: outbound error rate over outbound (unicast + non-unicast)
# packet rate above 0.0001; `for: 0m` means no pending period.
- alert: SwitchOutErrors
expr:
100 * irate(ifOutErrors[5m]) / (
irate(ifOutErrors[5m]) / (
irate(ifOutUcastPkts[5m])
+ irate(ifOutNUcastPkts[5m])
) > 0.01
) > 0.0001
for: 0m
labels:
severity: warning
annotations:
ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
# humanizePercentage already renders the "%" sign; do not append another.
ErrorRate: !unsafe "{{ $value | humanizePercentage }}"
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
# SwitchInLinkUsage (warning tier): inbound octet rate above 0.5 of
# (ifHighSpeed * 1000000 / 8) for 5m; keeps firing 10m after it clears.
- alert: SwitchInLinkUsage
expr:
100 * rate(ifHCInOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 50
rate(ifHCInOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 0.5
for: 5m
keep_firing_for: 10m
labels:
severity: warning
annotations:
Usage: !unsafe '{{ printf "%.2f" $value }} %'
# humanizePercentage already renders the "%" sign; do not append another.
Usage: !unsafe "{{ $value | humanizePercentage }}"
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
# SwitchInLinkUsage (critical tier): inbound octet rate above 0.8 of
# (ifHighSpeed * 1000000 / 8) for 5m; keeps firing 10m after it clears.
- alert: SwitchInLinkUsage
expr:
100 * rate(ifHCInOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 80
rate(ifHCInOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 0.8
for: 5m
keep_firing_for: 10m
labels:
severity: critical
annotations:
Usage: !unsafe '{{ printf "%.2f" $value }} %'
# humanizePercentage already renders the "%" sign; do not append another.
Usage: !unsafe "{{ $value | humanizePercentage }}"
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
# SwitchOutLinkUsage (warning tier): outbound octet rate above 0.5 of
# (ifHighSpeed * 1000000 / 8) for 5m; keeps firing 10m after it clears.
- alert: SwitchOutLinkUsage
expr:
100 * rate(ifHCOutOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 50
rate(ifHCOutOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 0.5
for: 5m
keep_firing_for: 10m
labels:
severity: warning
annotations:
Usage: !unsafe '{{ printf "%.2f" $value }} %'
# humanizePercentage already renders the "%" sign; do not append another.
Usage: !unsafe "{{ $value | humanizePercentage }}"
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
# SwitchOutLinkUsage (critical tier): outbound octet rate above 0.8 of
# (ifHighSpeed * 1000000 / 8) for 5m; keeps firing 10m after it clears.
- alert: SwitchOutLinkUsage
expr:
100 * rate(ifHCOutOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 80
rate(ifHCOutOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 0.8
for: 5m
keep_firing_for: 10m
labels:
# critical, matching the 0.8 tier of SwitchInLinkUsage; with "warning" this
# rule shared both name and severity with the 0.5 tier of the same alert.
severity: critical
annotations:
Usage: !unsafe '{{ printf "%.2f" $value }} %'
# humanizePercentage already renders the "%" sign; do not append another.
Usage: !unsafe "{{ $value | humanizePercentage }}"
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
...

@ -432,6 +432,9 @@ knotd__zones:
fd-1.core.sw:
#- 2a09:6840:207::1:5
- 10.207.1.5
ff-3.core.sw:
#- 2a09:6840:207::1:6
- 10.207.1.6
gk-1.core.sw:
#- 2a09:6840:207::2:1
- 10.207.2.1

@ -6,6 +6,7 @@ eb-1.acs.sw.infra.auro.re
[quanta]
ff-1.core.sw.infra.auro.re
ff-2.core.sw.infra.auro.re
ff-3.core.sw.infra.auro.re
fl-1.core.sw.infra.auro.re
fl-2.core.sw.infra.auro.re
fd-1.core.sw.infra.auro.re

Loading…
Cancel
Save