diff --git a/group_vars/prom/prometheus/node.yml b/group_vars/prom/prometheus/node.yml index e1bab96..dd74aad 100644 --- a/group_vars/prom/prometheus/node.yml +++ b/group_vars/prom/prometheus/node.yml @@ -7,29 +7,29 @@ prometheus__scraping_node: prometheus__rules_node: - alert: OutOfMemory expr: - 100 * ( + ( node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes - ) / node_memory_MemTotal_bytes < 10 + ) / node_memory_MemTotal_bytes < 0.1 for: 5m labels: severity: warning annotations: - FreeMemory: !unsafe '{{ printf "%.0f" $value }} %' + FreeMemory: !unsafe "{{ $value | humanizePercentage }}" - alert: HostSwapIsFillingUp expr: - 100 * ( + ( 1 - ( node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes ) - ) >= 50 + ) >= 0.5 for: 3m labels: severity: critical annotations: - UsedSwap: !unsafe '{{ printf "%.0f" $value }} %' + UsedSwap: !unsafe "{{ $value | humanizePercentage }}" - alert: HostPhysicalComponentTooHot expr: node_hwmon_temp_celsius > 79 @@ -37,7 +37,7 @@ prometheus__rules_node: labels: severity: critical annotations: - Temperature: !unsafe '{{ printf "%.0f" $value }} °C' + Temperature: !unsafe "{{ $value | humanize }} °C" Chip: !unsafe "{{ $labels.chip }}" Sensor: !unsafe "{{ $labels.sensor }}" - alert: HostNodeOvertemperatureAlarm @@ -92,8 +92,8 @@ prometheus__rules_node: - alert: OutOfDiskSpace expr: ( - 100 * node_filesystem_free_bytes - / node_filesystem_size_bytes < 10 + node_filesystem_free_bytes + / node_filesystem_size_bytes < 0.1 ) and on (instance, device, mountpoint) ( node_filesystem_readonly @@ -102,19 +102,19 @@ prometheus__rules_node: labels: severity: critical annotations: - Mountpoint: !unsafe '{{ $labels.mountpoint }}' - FreeSpace: !unsafe '{{ printf "%.0f" $value }} %' + Mountpoint: !unsafe "{{ $labels.mountpoint }}" + FreeSpace: !unsafe "{{ $value | humanizePercentage }}" - alert: HostConntrackLimit expr: - 100 * ( + ( node_nf_conntrack_entries / node_nf_conntrack_entries_limit - ) > 80 + ) > 0.8 for: 
5m labels: severity: warning annotations: - Filled: !unsafe '{{ printf "%.0f" $value }} %' + Filled: !unsafe "{{ $value | humanizePercentage }}" - alert: HostClockSkew expr: ( @@ -142,26 +142,26 @@ prometheus__rules_node: severity: warning - alert: OutOfInodes expr: - 100 * node_filesystem_files_free - / node_filesystem_files < 10 + node_filesystem_files_free + / node_filesystem_files < 0.1 for: 3m labels: severity: warning annotations: Mountpoint: !unsafe "{{ $labels.mountpoint }}" - FreeInodes: !unsafe '{{ printf "%.0f" $value }} %' + FreeInodes: !unsafe "{{ $value | humanizePercentage }}" - alert: CpuUsage expr: - 100 * ( + ( 1 - avg by (instance) ( irate(node_cpu_seconds_total{mode="idle"}[5m]) ) - ) > 75 + ) > 0.75 for: 10m labels: severity: warning annotations: - Usage: !unsafe '{{ printf "%.0f" $value }} %' + Usage: !unsafe "{{ $value | humanizePercentage }}" - alert: SystemdServiceFailed expr: node_systemd_unit_state{state="failed"} == 1 @@ -177,7 +177,7 @@ prometheus__rules_node: labels: severity: warning annotations: - Load1: !unsafe '{{ printf "%.0f" $value }}' + Load1: !unsafe "{{ $value | humanize }}" - alert: UnhealthyDisk expr: smartmon_device_smart_healthy < 1 @@ -188,12 +188,13 @@ prometheus__rules_node: Disk: !unsafe "{{ $labels.disk }}" - alert: HostCpuStealNoisyNeighbor expr: - 100 * avg by (instance) ( + avg by (instance) ( rate(node_cpu_seconds_total{mode="steal"}[5m]) - ) > 10 + ) > 0.1 for: 5m labels: severity: warning annotations: Disk: !unsafe "{{ $labels.disk }}" + Steal: !unsafe "{{ $value | humanizePercentage }}" ... 
diff --git a/group_vars/prom/prometheus/switch.yml b/group_vars/prom/prometheus/switch.yml index 6bc510e..a6e7535 100644 --- a/group_vars/prom/prometheus/switch.yml +++ b/group_vars/prom/prometheus/switch.yml @@ -20,72 +20,72 @@ prometheus__rules_switch: {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" - alert: SwitchInErrors expr: - 100 * irate(ifInErrors[5m]) / ( + irate(ifInErrors[5m]) / ( irate(ifInUcastPkts[5m]) + irate(ifInNUcastPkts[5m]) - ) > 0.01 + ) > 0.0001 for: 0m labels: severity: warning annotations: - ErrorRate: !unsafe '{{ printf "%.2f" $value }} %' + ErrorRate: !unsafe "{{ $value | humanizePercentage }}" Interface: !unsafe "{{ $labels.ifName }} {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" - alert: SwitchOutErrors expr: - 100 * irate(ifOutErrors[5m]) / ( + irate(ifOutErrors[5m]) / ( irate(ifOutUcastPkts[5m]) + irate(ifOutNUcastPkts[5m]) - ) > 0.01 + ) > 0.0001 for: 0m labels: severity: warning annotations: - ErrorRate: !unsafe '{{ printf "%.2f" $value }} %' + ErrorRate: !unsafe "{{ $value | humanizePercentage }}" Interface: !unsafe "{{ $labels.ifName }} {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" - alert: SwitchInLinkUsage expr: - 100 * rate(ifHCInOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 50 + rate(ifHCInOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 0.5 for: 5m keep_firing_for: 10m labels: severity: warning annotations: - Usage: !unsafe '{{ printf "%.2f" $value }} %' + Usage: !unsafe "{{ $value | humanizePercentage }}" Interface: !unsafe "{{ $labels.ifName }} {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" - alert: SwitchInLinkUsage expr: - 100 * rate(ifHCInOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 80 + rate(ifHCInOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 0.8 for: 5m keep_firing_for: 10m labels: severity: critical annotations: - Usage: !unsafe '{{ printf "%.2f" $value }} %' + Usage: !unsafe "{{ $value | humanizePercentage }}" Interface: !unsafe "{{ $labels.ifName }} {{ if $labels.ifAlias }}- 
{{ $labels.ifAlias }}{{ end }}" - alert: SwitchOutLinkUsage expr: - 100 * rate(ifHCOutOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 50 + rate(ifHCOutOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 0.5 for: 5m keep_firing_for: 10m labels: severity: warning annotations: - Usage: !unsafe '{{ printf "%.2f" $value }} %' + Usage: !unsafe "{{ $value | humanizePercentage }}" Interface: !unsafe "{{ $labels.ifName }} {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" - alert: SwitchOutLinkUsage expr: - 100 * rate(ifHCOutOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 80 + rate(ifHCOutOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 0.8 for: 5m keep_firing_for: 10m labels: severity: warning annotations: - Usage: !unsafe '{{ printf "%.2f" $value }} %' + Usage: !unsafe "{{ $value | humanizePercentage }}" Interface: !unsafe "{{ $labels.ifName }} {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" ...