---
# Node exporter monitoring: scrape settings and Prometheus alerting rules.
#
# Values tagged `!unsafe` are not templated by Ansible, so the `{{ ... }}`
# inside them reaches Prometheus verbatim as alert-template syntax.

# Scrape settings for the node exporter job.
# NOTE(review): the nesting of `port` under `address` was reconstructed from
# whitespace-collapsed text; confirm it against the consuming template.
prometheus__scraping_node:
  targets: "{{ groups.vm_network + groups.pve_network }}"
  address:
    port: 9100

# Alerting rules evaluated against node exporter metrics.
prometheus__rules_node:
  # --- Memory -------------------------------------------------------------

  # Free + cached + buffer memory is below 10% of total memory.
  - alert: OutOfMemory
    expr: >-
      ( node_memory_MemFree_bytes + node_memory_Cached_bytes
      + node_memory_Buffers_bytes ) / node_memory_MemTotal_bytes < 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      FreeMemory: !unsafe "{{ $value | humanizePercentage }}"

  # At least half of the swap space is in use.
  - alert: HostSwapIsFillingUp
    expr: >-
      ( 1 - ( node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes ) )
      >= 0.5
    for: 3m
    labels:
      severity: critical
    annotations:
      UsedSwap: !unsafe "{{ $value | humanizePercentage }}"

  # --- Hardware -----------------------------------------------------------

  # A hwmon temperature sensor reads above 79 degrees Celsius.
  - alert: HostPhysicalComponentTooHot
    expr: 'node_hwmon_temp_celsius > 79'
    for: 3m
    labels:
      severity: critical
    annotations:
      Temperature: !unsafe "{{ $value | humanize }} °C"
      Chip: !unsafe "{{ $labels.chip }}"
      Sensor: !unsafe "{{ $labels.sensor }}"

  # A hwmon sensor reports its critical-temperature alarm; fires immediately.
  - alert: HostNodeOvertemperatureAlarm
    expr: 'node_hwmon_temp_crit_alarm_celsius == 1'
    for: 0m
    labels:
      severity: critical
    annotations:
      Chip: !unsafe "{{ $labels.chip }}"
      Sensor: !unsafe "{{ $labels.sensor }}"

  # An md RAID array is in the "inactive" state.
  - alert: HostRaidArrayGotInactive
    expr: 'node_md_state{state="inactive"} > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      Device: !unsafe "{{ $labels.device }}"

  # An md RAID array has at least one failed member disk.
  - alert: HostRaidDiskFailure
    expr: 'node_md_disks{state="failed"} > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      # Names the affected array via the same `device` label that
      # HostRaidArrayGotInactive uses.
      # NOTE(review): confirm node_md_disks carries `device` in this setup.
      Device: !unsafe "{{ $labels.device }}"

  # The kernel OOM killer ran within the last minute.
  - alert: HostOomKillDetected
    expr: 'increase(node_vmstat_oom_kill[1m]) > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      # $value is the increase of the OOM-kill counter, not a process id.
      OomKills: !unsafe "{{ $value }}"

  # EDAC reported correctable memory errors within the last minute.
  - alert: HostEdacCorrectableErrorsDetected
    expr: 'increase(node_edac_correctable_errors_total[1m]) > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      CorrectedErrors: !unsafe "{{ $value }}"

  # EDAC reported uncorrectable memory errors within the last minute.
  - alert: HostEdacUncorrectableErrorsDetected
    expr: 'increase(node_edac_uncorrectable_errors_total[1m]) > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      DetectedErrors: !unsafe "{{ $value }}"

  # --- Filesystems --------------------------------------------------------

  # Less than 10% of a filesystem is free; read-only filesystems are
  # excluded by joining on node_filesystem_readonly == 0.
  - alert: OutOfDiskSpace
    expr: >-
      ( node_filesystem_free_bytes / node_filesystem_size_bytes < 0.1 )
      and on (instance, device, mountpoint)
      ( node_filesystem_readonly ) == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      Mountpoint: !unsafe "{{ $labels.mountpoint }}"
      FreeSpace: !unsafe "{{ $value | humanizePercentage }}"

  # Less than 10% of a filesystem's inodes are free.
  - alert: OutOfInodes
    expr: 'node_filesystem_files_free / node_filesystem_files < 0.1'
    for: 3m
    labels:
      severity: warning
    annotations:
      Mountpoint: !unsafe "{{ $labels.mountpoint }}"
      FreeInodes: !unsafe "{{ $value | humanizePercentage }}"

  # SMART health metric is below 1 for a disk.
  # NOTE(review): smartmon_* is not a built-in node exporter collector;
  # presumably provided via the textfile collector - confirm.
  - alert: UnhealthyDisk
    expr: 'smartmon_device_smart_healthy < 1'
    for: 10m
    labels:
      severity: critical
    annotations:
      Disk: !unsafe "{{ $labels.disk }}"

  # --- Network ------------------------------------------------------------

  # The conntrack table is more than 80% full.
  - alert: HostConntrackLimit
    expr: >-
      ( node_nf_conntrack_entries / node_nf_conntrack_entries_limit ) > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      Filled: !unsafe "{{ $value | humanizePercentage }}"

  # --- Time ---------------------------------------------------------------

  # Clock offset exceeds 50 ms in either direction and is not shrinking.
  - alert: HostClockSkew
    expr: >-
      ( node_timex_offset_seconds > 0.05
      and deriv(node_timex_offset_seconds[5m]) >= 0 )
      or
      ( node_timex_offset_seconds < -0.05
      and deriv(node_timex_offset_seconds[5m]) <= 0 )
    for: 2m
    labels:
      severity: warning

  # Sync status was 0 during the last minute and the maximum error is at
  # least 16 seconds.
  - alert: HostClockNotSynchronising
    expr: >-
      min_over_time(node_timex_sync_status[1m]) == 0
      and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning

  # --- System -------------------------------------------------------------

  # node_reboot_required is positive.
  # NOTE(review): presumably a textfile-collector metric - confirm its source.
  - alert: HostRequiresReboot
    expr: 'node_reboot_required > 0'
    for: 5m
    labels:
      severity: warning

  # A systemd unit is in the "failed" state.
  - alert: SystemdServiceFailed
    expr: 'node_systemd_unit_state{state="failed"} == 1'
    for: 10m
    labels:
      severity: warning
    annotations:
      Service: !unsafe "{{ $labels.name }}"

  # --- CPU and load -------------------------------------------------------

  # Average non-idle CPU share per instance exceeds 75%.
  # rate() rather than irate(): irate only looks at the last two samples, so
  # a single quiet scrape could reset the 10m `for` window.
  - alert: CpuUsage
    expr: >-
      ( 1 - avg by (instance)
      ( rate(node_cpu_seconds_total{mode="idle"}[5m]) ) ) > 0.75
    for: 10m
    labels:
      severity: warning
    annotations:
      Usage: !unsafe "{{ $value | humanizePercentage }}"

  # The 1-minute load average is above 5.
  - alert: LoadUsage
    expr: 'node_load1 > 5'
    for: 2m
    labels:
      severity: warning
    annotations:
      Load1: !unsafe "{{ $value | humanize }}"

  # Average CPU steal share per instance exceeds 10%.
  # The aggregation keeps only the `instance` label, so there is no per-disk
  # (or other) label available to annotate with.
  - alert: HostCpuStealNoisyNeighbor
    expr: >-
      avg by (instance)
      ( rate(node_cpu_seconds_total{mode="steal"}[5m]) ) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      Steal: !unsafe "{{ $value | humanizePercentage }}"
...