# Prometheus scrape targets and alerting rules for node-level metrics.
---
|
|
# Scrape job definition for the node metrics endpoint.
# `targets` is resolved by Jinja from the vm_network and pve_network
# inventory groups.
prometheus__scraping_node:
  targets: "{{ groups.vm_network + groups.pve_network }}"
  # NOTE(review): indentation was lost in this copy; `port` is assumed to be
  # nested under `address` -- confirm against the consuming role.
  address:
    port: 9100
|
|
|
|
prometheus__rules_node:
|
|
# Fires when free + cached + buffer memory has stayed below 10 % of total
# memory for 5 minutes.
- alert: OutOfMemory
  expr: >-
    100 * (
    node_memory_MemFree_bytes
    + node_memory_Cached_bytes
    + node_memory_Buffers_bytes
    ) / node_memory_MemTotal_bytes < 10
  for: 5m
  labels:
    severity: warning
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    FreeMemory: !unsafe '{{ printf "%.0f" $value }} %'
|
|
# Fires when at least 50 % of swap has been in use for 3 minutes.
- alert: HostSwapIsFillingUp
  expr: >-
    100 * (
    1 - (
    node_memory_SwapFree_bytes
    / node_memory_SwapTotal_bytes
    )
    ) >= 50
  for: 3m
  labels:
    severity: critical
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    UsedSwap: !unsafe '{{ printf "%.0f" $value }} %'
|
|
# Fires when a hwmon temperature reading has been above 79 degrees Celsius
# for 3 minutes.
- alert: HostPhysicalComponentTooHot
  expr: node_hwmon_temp_celsius > 79
  for: 3m
  labels:
    severity: critical
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    Temperature: !unsafe '{{ printf "%.0f" $value }} °C'
    Chip: !unsafe '{{ $labels.chip }}'
    Sensor: !unsafe '{{ $labels.sensor }}'
|
|
# Fires immediately (for: 0m) when node_hwmon_temp_crit_alarm_celsius
# equals 1 for any chip/sensor pair.
- alert: HostNodeOvertemperatureAlarm
  expr: node_hwmon_temp_crit_alarm_celsius == 1
  for: 0m
  labels:
    severity: critical
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    Chip: !unsafe '{{ $labels.chip }}'
    Sensor: !unsafe '{{ $labels.sensor }}'
|
|
# Fires immediately (for: 0m) when a node_md_state series with
# state="inactive" is above 0.
- alert: HostRaidArrayGotInactive
  expr: 'node_md_state{state="inactive"} > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    Device: !unsafe '{{ $labels.device }}'
|
|
# Fires immediately (for: 0m) when a node_md_disks series with
# state="failed" is above 0.
- alert: HostRaidDiskFailure
  expr: 'node_md_disks{state="failed"} > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    # Names the affected array. The annotation was previously keyed
    # `severity` and read `$labels.md_device`; the md rule for inactive
    # arrays in this file reads the array name from the `device` label, so
    # the same label is used here.
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    Device: !unsafe '{{ $labels.device }}'
|
|
# Fires immediately (for: 0m) when node_vmstat_oom_kill increased within
# the last minute.
- alert: HostOomKillDetected
  expr: 'increase(node_vmstat_oom_kill[1m]) > 0'
  for: 0m
  labels:
    severity: warning
  annotations:
    # $value is the increase of the OOM-kill counter over the 1m window,
    # not a process id, so the annotation is named for what it shows.
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    OomKills: !unsafe '{{ $value }}'
|
|
# Fires immediately (for: 0m) when node_edac_correctable_errors_total
# increased within the last minute.
- alert: HostEdacCorrectableErrorsDetected
  expr: 'increase(node_edac_correctable_errors_total[1m]) > 0'
  for: 0m
  labels:
    severity: warning
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    CorrectedErrors: !unsafe '{{ $value }}'
|
|
# Fires immediately (for: 0m) when node_edac_uncorrectable_errors_total
# increased within the last minute.
- alert: HostEdacUncorrectableErrorsDetected
  expr: 'increase(node_edac_uncorrectable_errors_total[1m]) > 0'
  for: 0m
  labels:
    severity: warning
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    DetectedErrors: !unsafe '{{ $value }}'
|
|
# Fires when a filesystem has had less than 10 % free space for 5 minutes,
# restricted to series whose node_filesystem_readonly value is 0.
- alert: OutOfDiskSpace
  expr: >-
    (
    100 * node_filesystem_free_bytes
    / node_filesystem_size_bytes < 10
    )
    and on (instance, device, mountpoint) (
    node_filesystem_readonly
    ) == 0
  for: 5m
  labels:
    severity: critical
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    Mountpoint: !unsafe '{{ $labels.mountpoint }}'
    FreeSpace: !unsafe '{{ printf "%.0f" $value }} %'
|
|
# Fires when the conntrack table has been more than 80 % full (entries
# relative to the entries limit) for 5 minutes.
- alert: HostConntrackLimit
  expr: >-
    100 * (
    node_nf_conntrack_entries
    / node_nf_conntrack_entries_limit
    ) > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    Filled: !unsafe '{{ printf "%.0f" $value }} %'
|
|
# Fires when the clock offset has been beyond +/-0.05 s for 2 minutes while
# its 5m derivative shows it is not moving back towards zero (offset and
# derivative share the same sign, or the derivative is 0).
- alert: HostClockSkew
  expr: >-
    (
    node_timex_offset_seconds > 0.05
    and deriv(node_timex_offset_seconds[5m]) >= 0
    ) or (
    node_timex_offset_seconds < -0.05
    and deriv(node_timex_offset_seconds[5m]) <= 0
    )
  for: 2m
  labels:
    severity: warning
|
|
# Fires when, for 2 minutes, node_timex_sync_status had a minimum of 0 over
# the last minute and node_timex_maxerror_seconds is at least 16.
- alert: HostClockNotSynchronising
  expr: >-
    min_over_time(node_timex_sync_status[1m]) == 0
    and node_timex_maxerror_seconds >= 16
  for: 2m
  labels:
    severity: warning
|
|
# Fires when node_reboot_required has been above 0 for 5 minutes.
- alert: HostRequiresReboot
  expr: node_reboot_required > 0
  for: 5m
  labels:
    severity: warning
|
|
# Fires when a filesystem has had fewer than 10 % of its inodes free for
# 3 minutes.
- alert: OutOfInodes
  expr: >-
    100 * node_filesystem_files_free
    / node_filesystem_files < 10
  for: 3m
  labels:
    severity: warning
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    Mountpoint: !unsafe '{{ $labels.mountpoint }}'
    FreeInodes: !unsafe '{{ printf "%.0f" $value }} %'
|
|
# Fires when the per-instance average non-idle CPU share has been above
# 75 % for 10 minutes.
- alert: CpuUsage
  # rate() rather than irate(): irate() only looks at the last two samples,
  # so a single brief idle spike would drop the value below the threshold
  # and reset the 10m `for` timer. rate() averages over the whole 5m window.
  expr: >-
    100 * (
    1 - avg by (instance) (
    rate(node_cpu_seconds_total{mode="idle"}[5m])
    )
    ) > 75
  for: 10m
  labels:
    severity: warning
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    Usage: !unsafe '{{ printf "%.0f" $value }} %'
|
|
# Fires when a node_systemd_unit_state series with state="failed" has been
# equal to 1 for 10 minutes.
- alert: SystemdServiceFailed
  expr: 'node_systemd_unit_state{state="failed"} == 1'
  for: 10m
  labels:
    severity: warning
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    Service: !unsafe '{{ $labels.name }}'
|
|
# Fires when node_load1 has been above 5 for 2 minutes.
- alert: LoadUsage
  expr: node_load1 > 5
  for: 2m
  labels:
    severity: warning
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    Load1: !unsafe '{{ printf "%.0f" $value }}'
|
|
# Fires when smartmon_device_smart_healthy has been below 1 for 10 minutes.
- alert: UnhealthyDisk
  expr: smartmon_device_smart_healthy < 1
  for: 10m
  labels:
    severity: critical
  annotations:
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    Disk: !unsafe '{{ $labels.disk }}'
|
|
# Fires when the per-instance average CPU steal share has been above 10 %
# for 5 minutes.
- alert: HostCpuStealNoisyNeighbor
  expr: >-
    100 * avg by (instance) (
    rate(node_cpu_seconds_total{mode="steal"}[5m])
    ) > 10
  for: 5m
  labels:
    severity: warning
  annotations:
    # `avg by (instance)` drops every label except `instance`, so a
    # `$labels.disk` annotation would always render empty; report the
    # measured steal percentage instead.
    # !unsafe keeps Ansible from templating the Prometheus {{ ... }} markup.
    Steal: !unsafe '{{ printf "%.0f" $value }} %'
|
|
...
|