ansible/group_vars/prom/prometheus/node.yml

201 lines
4.9 KiB
YAML
Raw Normal View History

2023-11-02 20:27:45 +01:00
---
prometheus__scraping_node:
targets: "{{ groups.vm_network + groups.pve_network }}"
address:
port: 9100
prometheus__rules_node:
- alert: OutOfMemory
expr:
(
2023-11-02 20:27:45 +01:00
node_memory_MemFree_bytes
+ node_memory_Cached_bytes
+ node_memory_Buffers_bytes
) / node_memory_MemTotal_bytes < 0.1
2023-11-02 20:27:45 +01:00
for: 5m
labels:
severity: warning
annotations:
FreeMemory: !unsafe "{{ $value | humanizePercentage }}"
2023-11-02 20:27:45 +01:00
- alert: HostSwapIsFillingUp
expr:
(
2023-11-02 20:27:45 +01:00
1 - (
node_memory_SwapFree_bytes
/ node_memory_SwapTotal_bytes
)
) >= 0.5
2023-11-02 20:27:45 +01:00
for: 3m
labels:
severity: critical
annotations:
UsedSwap: !unsafe "{{ $value | humanizePercentage }}"
2023-11-02 20:27:45 +01:00
- alert: HostPhysicalComponentTooHot
expr:
node_hwmon_temp_celsius > 79
for: 3m
labels:
severity: critical
annotations:
Temperature: !unsafe "{{ $value | humanize }} °C"
2023-11-02 20:27:45 +01:00
Chip: !unsafe "{{ $labels.chip }}"
Sensor: !unsafe "{{ $labels.sensor }}"
- alert: HostNodeOvertemperatureAlarm
expr:
node_hwmon_temp_crit_alarm_celsius == 1
for: 0m
labels:
severity: critical
annotations:
Chip: !unsafe "{{ $labels.chip }}"
Sensor: !unsafe "{{ $labels.sensor }}"
- alert: HostRaidArrayGotInactive
expr:
node_md_state{state="inactive"} > 0
for: 0m
labels:
severity: critical
annotations:
Device: !unsafe "{{ $labels.device }}"
- alert: HostRaidDiskFailure
expr:
node_md_disks{state="failed"} > 0
for: 0m
labels:
severity: critical
annotations:
severity: !unsafe "{{ $labels.md_device }}"
- alert: HostOomKillDetected
expr:
increase(node_vmstat_oom_kill[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
PID: !unsafe "{{ $value }}"
- alert: HostEdacCorrectableErrorsDetected
expr:
increase(node_edac_correctable_errors_total[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
CorrectedErrors: !unsafe "{{ $value }}"
- alert: HostEdacUncorrectableErrorsDetected
expr:
increase(node_edac_uncorrectable_errors_total[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
DetectedErrors: !unsafe "{{ $value }}"
- alert: OutOfDiskSpace
expr:
(
node_filesystem_free_bytes
/ node_filesystem_size_bytes < 0.1
2023-11-02 20:27:45 +01:00
)
and on (instance, device, mountpoint) (
node_filesystem_readonly
) == 0
for: 5m
labels:
severity: critical
annotations:
Mountpoint: !unsafe "{{ $labels.mountpoint }}"
FreeSpace: !unsafe "{{ $value | humanizePercentage }}"
2023-11-02 20:27:45 +01:00
- alert: HostConntrackLimit
expr:
(
2023-11-02 20:27:45 +01:00
node_nf_conntrack_entries
/ node_nf_conntrack_entries_limit
) > 0.8
2023-11-02 20:27:45 +01:00
for: 5m
labels:
severity: warning
annotations:
Filled: !unsafe "{{ $value | humanizePercentage }}"
2023-11-02 20:27:45 +01:00
- alert: HostClockSkew
expr:
(
node_timex_offset_seconds > 0.05
and deriv(node_timex_offset_seconds[5m]) >= 0
) or (
node_timex_offset_seconds < -0.05
and deriv(node_timex_offset_seconds[5m]) <= 0
)
for: 2m
labels:
severity: warning
- alert: HostClockNotSynchronising
expr:
min_over_time(node_timex_sync_status[1m]) == 0
and node_timex_maxerror_seconds >= 16
for: 2m
labels:
severity: warning
- alert: HostRequiresReboot
expr:
node_reboot_required > 0
for: 5m
labels:
severity: warning
- alert: OutOfInodes
expr:
node_filesystem_files_free
/ node_filesystem_files < 0.1
2023-11-02 20:27:45 +01:00
for: 3m
labels:
severity: warning
annotations:
Mountpoint: !unsafe "{{ $labels.mountpoint }}"
FreeInodes: !unsafe "{{ $value | humanizePercentage }}"
2023-11-02 20:27:45 +01:00
- alert: CpuUsage
expr:
(
2023-11-02 20:27:45 +01:00
1 - avg by (instance) (
irate(node_cpu_seconds_total{mode="idle"}[5m])
)
) > 0.75
2023-11-02 20:27:45 +01:00
for: 10m
labels:
severity: warning
annotations:
Usage: !unsafe "{{ $value | humanizePercentage }}"
2023-11-02 20:27:45 +01:00
- alert: SystemdServiceFailed
expr:
node_systemd_unit_state{state="failed"} == 1
for: 10m
labels:
severity: warning
annotations:
Service: !unsafe "{{ $labels.name }}"
- alert: LoadUsage
expr:
node_load1 > 5
for: 2m
labels:
severity: warning
annotations:
Load1: !unsafe "{{ $value | humanize }}"
2023-11-02 20:27:45 +01:00
- alert: UnhealthyDisk
expr:
smartmon_device_smart_healthy < 1
for: 10m
labels:
severity: critical
annotations:
Disk: !unsafe "{{ $labels.disk }}"
- alert: HostCpuStealNoisyNeighbor
expr:
avg by (instance) (
2023-11-02 20:27:45 +01:00
rate(node_cpu_seconds_total{mode="steal"}[5m])
) > 0.1
2023-11-02 20:27:45 +01:00
for: 5m
labels:
severity: warning
annotations:
Disk: !unsafe "{{ $labels.disk }}"
Steal: !unsafe "{{ $value | humanizePercentage }}"
2023-11-02 20:27:45 +01:00
...