ansible/group_vars/prom/prometheus/node.yml

---
# Node scrape settings: every host in the vm_network and pve_network
# inventory groups is a target, reached on port 9100.
prometheus__scraping_node:
  address:
    port: 9100
  targets: '{{ groups.vm_network + groups.pve_network }}'

prometheus__rules_node:
  # Fires when free + cached + buffer memory stays below 10 % of total memory
  # for five minutes.
  - alert: OutOfMemory
    expr: >-
      ( node_memory_MemFree_bytes
      + node_memory_Cached_bytes
      + node_memory_Buffers_bytes )
      / node_memory_MemTotal_bytes < 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      # !unsafe stops Ansible from templating the Prometheus {{ }} expression.
      FreeMemory: !unsafe '{{ $value | humanizePercentage }}'
  # Fires when at least half of the swap space has been in use for three
  # minutes (used fraction = 1 - SwapFree / SwapTotal).
  - alert: HostSwapIsFillingUp
    expr: >-
      ( 1 - ( node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes ) )
      >= 0.5
    for: 3m
    labels:
      severity: critical
    annotations:
      # !unsafe stops Ansible from templating the Prometheus {{ }} expression.
      UsedSwap: !unsafe '{{ $value | humanizePercentage }}'
  # Fires when a hwmon temperature reading stays above 79 for three minutes;
  # the annotations name the chip and sensor that reported it.
  - alert: HostPhysicalComponentTooHot
    expr: 'node_hwmon_temp_celsius > 79'
    for: 3m
    labels:
      severity: critical
    annotations:
      # !unsafe stops Ansible from templating the Prometheus {{ }} expressions.
      Temperature: !unsafe '{{ $value | humanize }} °C'
      Chip: !unsafe '{{ $labels.chip }}'
      Sensor: !unsafe '{{ $labels.sensor }}'
  # Fires immediately (for: 0m) when a hwmon critical-temperature alarm
  # metric equals 1.
  - alert: HostNodeOvertemperatureAlarm
    expr: 'node_hwmon_temp_crit_alarm_celsius == 1'
    for: 0m
    labels:
      severity: critical
    annotations:
      # !unsafe stops Ansible from templating the Prometheus {{ }} expressions.
      Chip: !unsafe '{{ $labels.chip }}'
      Sensor: !unsafe '{{ $labels.sensor }}'
  # Fires immediately (for: 0m) when an md array reports the "inactive" state.
  - alert: HostRaidArrayGotInactive
    expr: 'node_md_state{state="inactive"} > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      # !unsafe stops Ansible from templating the Prometheus {{ }} expression.
      Device: !unsafe '{{ $labels.device }}'
  # Fires immediately (for: 0m) when an md array reports at least one member
  # disk in the "failed" state.
  - alert: HostRaidDiskFailure
    expr:
      node_md_disks{state="failed"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      # The affected array is carried in the `device` label, the same label
      # the HostRaidArrayGotInactive rule reports. The annotation is named
      # Device so it is not confused with the `severity` label.
      # !unsafe stops Ansible from templating the Prometheus {{ }} expression.
      Device: !unsafe "{{ $labels.device }}"
  # Fires immediately (for: 0m) when the OOM-kill counter increased during
  # the last minute.
  - alert: HostOomKillDetected
    expr:
      increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      # $value is the counter increase over the 1m window, i.e. how many OOM
      # kills happened; it is not a process id.
      # !unsafe stops Ansible from templating the Prometheus {{ }} expression.
      OomKills: !unsafe "{{ $value }}"
  # Fires immediately (for: 0m) when the EDAC correctable-error counter
  # increased during the last minute.
  - alert: HostEdacCorrectableErrorsDetected
    expr: 'increase(node_edac_correctable_errors_total[1m]) > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      # $value is the counter increase over the 1m window.
      CorrectedErrors: !unsafe '{{ $value }}'
  # Fires immediately (for: 0m) when the EDAC uncorrectable-error counter
  # increased during the last minute.
  - alert: HostEdacUncorrectableErrorsDetected
    expr: 'increase(node_edac_uncorrectable_errors_total[1m]) > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      # $value is the counter increase over the 1m window.
      DetectedErrors: !unsafe '{{ $value }}'
  # Fires when a filesystem has had less than 10 % free space for five
  # minutes. `==` binds tighter than `and`, so the right-hand side reads
  # "node_filesystem_readonly == 0": only series with a matching
  # (instance, device, mountpoint) whose read-only flag is 0 are kept.
  - alert: OutOfDiskSpace
    expr: >-
      ( node_filesystem_free_bytes / node_filesystem_size_bytes < 0.1 )
      and on (instance, device, mountpoint)
      ( node_filesystem_readonly ) == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      # !unsafe stops Ansible from templating the Prometheus {{ }} expressions.
      Mountpoint: !unsafe '{{ $labels.mountpoint }}'
      FreeSpace: !unsafe '{{ $value | humanizePercentage }}'
  # Fires when the conntrack table has been more than 80 % full (entries
  # divided by the entries limit) for five minutes.
  - alert: HostConntrackLimit
    expr: >-
      ( node_nf_conntrack_entries / node_nf_conntrack_entries_limit ) > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      # !unsafe stops Ansible from templating the Prometheus {{ }} expression.
      Filled: !unsafe '{{ $value | humanizePercentage }}'
  # Fires when the clock offset has been beyond +/-0.05 s for two minutes and
  # is not shrinking: a positive offset with a non-negative 5m derivative, or
  # a negative offset with a non-positive 5m derivative.
  - alert: HostClockSkew
    expr: >-
      ( node_timex_offset_seconds > 0.05
      and deriv(node_timex_offset_seconds[5m]) >= 0 )
      or
      ( node_timex_offset_seconds < -0.05
      and deriv(node_timex_offset_seconds[5m]) <= 0 )
    for: 2m
    labels:
      severity: warning
  # Fires when, for two minutes, the sync status has been 0 throughout the
  # last minute and the reported maximum error is at least 16 seconds.
  - alert: HostClockNotSynchronising
    expr: >-
      min_over_time(node_timex_sync_status[1m]) == 0 and
      node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning
  # Fires when node_reboot_required has been above zero for five minutes.
  - alert: HostRequiresReboot
    expr: 'node_reboot_required > 0'
    for: 5m
    labels:
      severity: warning
  # Fires when a filesystem has had fewer than 10 % of its inodes free for
  # three minutes.
  - alert: OutOfInodes
    expr: 'node_filesystem_files_free / node_filesystem_files < 0.1'
    for: 3m
    labels:
      severity: warning
    annotations:
      # !unsafe stops Ansible from templating the Prometheus {{ }} expressions.
      Mountpoint: !unsafe '{{ $labels.mountpoint }}'
      FreeInodes: !unsafe '{{ $value | humanizePercentage }}'
  # Fires when CPU utilisation (1 minus the idle fraction, averaged per
  # instance) stays above 75 % for ten minutes.
  - alert: CpuUsage
    # rate() averages over the whole 5m window. irate() only looks at the
    # last two samples, so a single quiet scrape could drop the value below
    # the threshold and reset the 10m `for` timer.
    expr:
      (
        1 - avg by (instance) (
          rate(node_cpu_seconds_total{mode="idle"}[5m])
        )
      ) > 0.75
    for: 10m
    labels:
      severity: warning
    annotations:
      # !unsafe stops Ansible from templating the Prometheus {{ }} expression.
      Usage: !unsafe "{{ $value | humanizePercentage }}"
  # Fires when a systemd unit has been in the "failed" state for ten minutes;
  # the annotation carries the unit's `name` label.
  - alert: SystemdServiceFailed
    expr: 'node_systemd_unit_state{state="failed"} == 1'
    for: 10m
    labels:
      severity: warning
    annotations:
      # !unsafe stops Ansible from templating the Prometheus {{ }} expression.
      Service: !unsafe '{{ $labels.name }}'
  # Fires when the 1-minute load average stays above 5 for two minutes.
  - alert: LoadUsage
    expr: 'node_load1 > 5'
    for: 2m
    labels:
      severity: warning
    annotations:
      # !unsafe stops Ansible from templating the Prometheus {{ }} expression.
      Load1: !unsafe '{{ $value | humanize }}'
  # Fires when smartmon_device_smart_healthy has been below 1 for ten
  # minutes; the annotation carries the series' `disk` label.
  - alert: UnhealthyDisk
    expr: 'smartmon_device_smart_healthy < 1'
    for: 10m
    labels:
      severity: critical
    annotations:
      # !unsafe stops Ansible from templating the Prometheus {{ }} expression.
      Disk: !unsafe '{{ $labels.disk }}'
  # Fires when the per-instance average CPU steal fraction stays above 10 %
  # for five minutes.
  - alert: HostCpuStealNoisyNeighbor
    expr:
      avg by (instance) (
        rate(node_cpu_seconds_total{mode="steal"}[5m])
      ) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      # `avg by (instance)` leaves `instance` as the only label, so the
      # annotations can only use $value and $labels.instance.
      # !unsafe stops Ansible from templating the Prometheus {{ }} expression.
      Steal: !unsafe "{{ $value | humanizePercentage }}"
...
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`---`
			`prometheus__scraping_node:`
			`targets: "{{ groups.vm_network + groups.pve_network }}"`
			`address:`
			`port: 9100`

			`prometheus__rules_node:`
			`- alert: OutOfMemory`
			`expr:`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`(`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`node_memory_MemFree_bytes`
			`+ node_memory_Cached_bytes`
			`+ node_memory_Buffers_bytes`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`) / node_memory_MemTotal_bytes < 0.1`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
prometheus: remove redundant '%' symbols 2023-11-06 22:05:53 +01:00			`FreeMemory: !unsafe "{{ $value \| humanizePercentage }}"`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`- alert: HostSwapIsFillingUp`
			`expr:`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`(`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`1 - (`
			`node_memory_SwapFree_bytes`
			`/ node_memory_SwapTotal_bytes`
			`)`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`) >= 0.5`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`for: 3m`
			`labels:`
			`severity: critical`
			`annotations:`
prometheus: remove redundant '%' symbols 2023-11-06 22:05:53 +01:00			`UsedSwap: !unsafe "{{ $value \| humanizePercentage }}"`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`- alert: HostPhysicalComponentTooHot`
			`expr:`
			`node_hwmon_temp_celsius > 79`
			`for: 3m`
			`labels:`
			`severity: critical`
			`annotations:`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`Temperature: !unsafe "{{ $value \| humanize }} °C"`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`Chip: !unsafe "{{ $labels.chip }}"`
			`Sensor: !unsafe "{{ $labels.sensor }}"`
			`- alert: HostNodeOvertemperatureAlarm`
			`expr:`
			`node_hwmon_temp_crit_alarm_celsius == 1`
			`for: 0m`
			`labels:`
			`severity: critical`
			`annotations:`
			`Chip: !unsafe "{{ $labels.chip }}"`
			`Sensor: !unsafe "{{ $labels.sensor }}"`
			`- alert: HostRaidArrayGotInactive`
			`expr:`
			`node_md_state{state="inactive"} > 0`
			`for: 0m`
			`labels:`
			`severity: critical`
			`annotations:`
			`Device: !unsafe "{{ $labels.device }}"`
			`- alert: HostRaidDiskFailure`
			`expr:`
			`node_md_disks{state="failed"} > 0`
			`for: 0m`
			`labels:`
			`severity: critical`
			`annotations:`
			`severity: !unsafe "{{ $labels.md_device }}"`
			`- alert: HostOomKillDetected`
			`expr:`
			`increase(node_vmstat_oom_kill[1m]) > 0`
			`for: 0m`
			`labels:`
			`severity: warning`
			`annotations:`
			`PID: !unsafe "{{ $value }}"`
			`- alert: HostEdacCorrectableErrorsDetected`
			`expr:`
			`increase(node_edac_correctable_errors_total[1m]) > 0`
			`for: 0m`
			`labels:`
			`severity: warning`
			`annotations:`
			`CorrectedErrors: !unsafe "{{ $value }}"`
			`- alert: HostEdacUncorrectableErrorsDetected`
			`expr:`
			`increase(node_edac_uncorrectable_errors_total[1m]) > 0`
			`for: 0m`
			`labels:`
			`severity: warning`
			`annotations:`
			`DetectedErrors: !unsafe "{{ $value }}"`
			`- alert: OutOfDiskSpace`
			`expr:`
			`(`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`node_filesystem_free_bytes`
			`/ node_filesystem_size_bytes < 0.1`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`)`
			`and on (instance, device, mountpoint) (`
			`node_filesystem_readonly`
			`) == 0`
			`for: 5m`
			`labels:`
			`severity: critical`
			`annotations:`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`Mountpoint: !unsafe "{{ $labels.mountpoint }}"`
prometheus: remove redundant '%' symbols 2023-11-06 22:05:53 +01:00			`FreeSpace: !unsafe "{{ $value \| humanizePercentage }}"`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`- alert: HostConntrackLimit`
			`expr:`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`(`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`node_nf_conntrack_entries`
			`/ node_nf_conntrack_entries_limit`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`) > 0.8`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
prometheus: remove redundant '%' symbols 2023-11-06 22:05:53 +01:00			`Filled: !unsafe "{{ $value \| humanizePercentage }}"`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`- alert: HostClockSkew`
			`expr:`
			`(`
			`node_timex_offset_seconds > 0.05`
			`and deriv(node_timex_offset_seconds[5m]) >= 0`
			`) or (`
			`node_timex_offset_seconds < -0.05`
			`and deriv(node_timex_offset_seconds[5m]) <= 0`
			`)`
			`for: 2m`
			`labels:`
			`severity: warning`
			`- alert: HostClockNotSynchronising`
			`expr:`
			`min_over_time(node_timex_sync_status[1m]) == 0`
			`and node_timex_maxerror_seconds >= 16`
			`for: 2m`
			`labels:`
			`severity: warning`
			`- alert: HostRequiresReboot`
			`expr:`
			`node_reboot_required > 0`
			`for: 5m`
			`labels:`
			`severity: warning`
			`- alert: OutOfInodes`
			`expr:`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`node_filesystem_files_free`
			`/ node_filesystem_files < 0.1`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`for: 3m`
			`labels:`
			`severity: warning`
			`annotations:`
			`Mountpoint: !unsafe "{{ $labels.mountpoint }}"`
prometheus: remove redundant '%' symbols 2023-11-06 22:05:53 +01:00			`FreeInodes: !unsafe "{{ $value \| humanizePercentage }}"`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`- alert: CpuUsage`
			`expr:`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`(`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`1 - avg by (instance) (`
			`irate(node_cpu_seconds_total{mode="idle"}[5m])`
			`)`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`) > 0.75`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`for: 10m`
			`labels:`
			`severity: warning`
			`annotations:`
prometheus: remove redundant '%' symbols 2023-11-06 22:05:53 +01:00			`Usage: !unsafe "{{ $value \| humanizePercentage }}"`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`- alert: SystemdServiceFailed`
			`expr:`
			`node_systemd_unit_state{state="failed"} == 1`
			`for: 10m`
			`labels:`
			`severity: warning`
			`annotations:`
			`Service: !unsafe "{{ $labels.name }}"`
			`- alert: LoadUsage`
			`expr:`
			`node_load1 > 5`
			`for: 2m`
			`labels:`
			`severity: warning`
			`annotations:`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`Load1: !unsafe "{{ $value \| humanize }}"`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`- alert: UnhealthyDisk`
			`expr:`
			`smartmon_device_smart_healthy < 1`
			`for: 10m`
			`labels:`
			`severity: critical`
			`annotations:`
			`Disk: !unsafe "{{ $labels.disk }}"`
			`- alert: HostCpuStealNoisyNeighbor`
			`expr:`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`avg by (instance) (`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`rate(node_cpu_seconds_total{mode="steal"}[5m])`
prometheus: use humanize/humanizePercentage 2023-11-04 13:33:06 +01:00			`) > 0.1`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`Disk: !unsafe "{{ $labels.disk }}"`
prometheus: remove redundant '%' symbols 2023-11-06 22:05:53 +01:00			`Steal: !unsafe "{{ $value \| humanizePercentage }}"`
prometheus: refactoring of the config 2023-11-02 20:27:45 +01:00			`...`