prometheus: use humanize/humanizePercentage

This commit is contained in:
jeltz 2023-11-04 13:33:06 +01:00
parent 19953b2951
commit c2f2c03af6
Signed by: jeltz
GPG key ID: 800882B66C0C3326
2 changed files with 38 additions and 37 deletions

View file

@ -7,29 +7,29 @@ prometheus__scraping_node:
prometheus__rules_node: prometheus__rules_node:
- alert: OutOfMemory - alert: OutOfMemory
expr: expr:
100 * ( (
node_memory_MemFree_bytes node_memory_MemFree_bytes
+ node_memory_Cached_bytes + node_memory_Cached_bytes
+ node_memory_Buffers_bytes + node_memory_Buffers_bytes
) / node_memory_MemTotal_bytes < 10 ) / node_memory_MemTotal_bytes < 0.1
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
FreeMemory: !unsafe '{{ printf "%.0f" $value }} %' FreeMemory: !unsafe "{{ $value | humanizePercentage }} %"
- alert: HostSwapIsFillingUp - alert: HostSwapIsFillingUp
expr: expr:
100 * ( (
1 - ( 1 - (
node_memory_SwapFree_bytes node_memory_SwapFree_bytes
/ node_memory_SwapTotal_bytes / node_memory_SwapTotal_bytes
) )
) >= 50 ) >= 0.5
for: 3m for: 3m
labels: labels:
severity: critical severity: critical
annotations: annotations:
UsedSwap: !unsafe '{{ printf "%.0f" $value }} %' UsedSwap: !unsafe "{{ $value | humanizePercentage }} %"
- alert: HostPhysicalComponentTooHot - alert: HostPhysicalComponentTooHot
expr: expr:
node_hwmon_temp_celsius > 79 node_hwmon_temp_celsius > 79
@ -37,7 +37,7 @@ prometheus__rules_node:
labels: labels:
severity: critical severity: critical
annotations: annotations:
Temperature: !unsafe '{{ printf "%.0f" $value }} °C' Temperature: !unsafe "{{ $value | humanize }} °C"
Chip: !unsafe "{{ $labels.chip }}" Chip: !unsafe "{{ $labels.chip }}"
Sensor: !unsafe "{{ $labels.sensor }}" Sensor: !unsafe "{{ $labels.sensor }}"
- alert: HostNodeOvertemperatureAlarm - alert: HostNodeOvertemperatureAlarm
@ -92,8 +92,8 @@ prometheus__rules_node:
- alert: OutOfDiskSpace - alert: OutOfDiskSpace
expr: expr:
( (
100 * node_filesystem_free_bytes node_filesystem_free_bytes
/ node_filesystem_size_bytes < 10 / node_filesystem_size_bytes < 0.1
) )
and on (instance, device, mountpoint) ( and on (instance, device, mountpoint) (
node_filesystem_readonly node_filesystem_readonly
@ -102,19 +102,19 @@ prometheus__rules_node:
labels: labels:
severity: critical severity: critical
annotations: annotations:
Mountpoint: !unsafe '{{ $labels.mountpoint }}' Mountpoint: !unsafe "{{ $labels.mountpoint }}"
FreeSpace: !unsafe '{{ printf "%.0f" $value }} %' FreeSpace: !unsafe "{{ $value | humanizePercentage }} %"
- alert: HostConntrackLimit - alert: HostConntrackLimit
expr: expr:
100 * ( (
node_nf_conntrack_entries node_nf_conntrack_entries
/ node_nf_conntrack_entries_limit / node_nf_conntrack_entries_limit
) > 80 ) > 0.8
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
Filled: !unsafe '{{ printf "%.0f" $value }} %' Filled: !unsafe "{{ $value | humanizePercentage }} %"
- alert: HostClockSkew - alert: HostClockSkew
expr: expr:
( (
@ -142,26 +142,26 @@ prometheus__rules_node:
severity: warning severity: warning
- alert: OutOfInodes - alert: OutOfInodes
expr: expr:
100 * node_filesystem_files_free node_filesystem_files_free
/ node_filesystem_files < 10 / node_filesystem_files < 0.1
for: 3m for: 3m
labels: labels:
severity: warning severity: warning
annotations: annotations:
Mountpoint: !unsafe "{{ $labels.mountpoint }}" Mountpoint: !unsafe "{{ $labels.mountpoint }}"
FreeInodes: !unsafe '{{ printf "%.0f" $value }} %' FreeInodes: !unsafe "{{ $value | humanizePercentage }} %"
- alert: CpuUsage - alert: CpuUsage
expr: expr:
100 * ( (
1 - avg by (instance) ( 1 - avg by (instance) (
irate(node_cpu_seconds_total{mode="idle"}[5m]) irate(node_cpu_seconds_total{mode="idle"}[5m])
) )
) > 75 ) > 0.75
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
Usage: !unsafe '{{ printf "%.0f" $value }} %' Usage: !unsafe "{{ $value | humanizePercentage }} %"
- alert: SystemdServiceFailed - alert: SystemdServiceFailed
expr: expr:
node_systemd_unit_state{state="failed"} == 1 node_systemd_unit_state{state="failed"} == 1
@ -177,7 +177,7 @@ prometheus__rules_node:
labels: labels:
severity: warning severity: warning
annotations: annotations:
Load1: !unsafe '{{ printf "%.0f" $value }}' Load1: !unsafe "{{ $value | humanize }}"
- alert: UnhealthyDisk - alert: UnhealthyDisk
expr: expr:
smartmon_device_smart_healthy < 1 smartmon_device_smart_healthy < 1
@ -188,12 +188,13 @@ prometheus__rules_node:
Disk: !unsafe "{{ $labels.disk }}" Disk: !unsafe "{{ $labels.disk }}"
- alert: HostCpuStealNoisyNeighbor - alert: HostCpuStealNoisyNeighbor
expr: expr:
100 * avg by (instance) ( avg by (instance) (
rate(node_cpu_seconds_total{mode="steal"}[5m]) rate(node_cpu_seconds_total{mode="steal"}[5m])
) > 10 ) > 0.1
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
Disk: !unsafe "{{ $labels.disk }}" Disk: !unsafe "{{ $labels.disk }}"
Steal: !unsafe "{{ $value | humanizePercentage }} %"
... ...

View file

@ -20,72 +20,72 @@ prometheus__rules_switch:
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
- alert: SwitchInErrors - alert: SwitchInErrors
expr: expr:
100 * irate(ifInErrors[5m]) / ( irate(ifInErrors[5m]) / (
irate(ifInUcastPkts[5m]) irate(ifInUcastPkts[5m])
+ irate(ifInNUcastPkts[5m]) + irate(ifInNUcastPkts[5m])
) > 0.01 ) > 0.0001
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
ErrorRate: !unsafe '{{ printf "%.2f" $value }} %' ErrorRate: !unsafe "{{ $value | humanizePercentage }} %"
Interface: !unsafe "{{ $labels.ifName }} Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
- alert: SwitchOutErrors - alert: SwitchOutErrors
expr: expr:
100 * irate(ifOutErrors[5m]) / ( irate(ifOutErrors[5m]) / (
irate(ifOutUcastPkts[5m]) irate(ifOutUcastPkts[5m])
+ irate(ifOutNUcastPkts[5m]) + irate(ifOutNUcastPkts[5m])
) > 0.01 ) > 0.0001
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
ErrorRate: !unsafe '{{ printf "%.2f" $value }} %' ErrorRate: !unsafe "{{ $value | humanizePercentage }} %"
Interface: !unsafe "{{ $labels.ifName }} Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
- alert: SwitchInLinkUsage - alert: SwitchInLinkUsage
expr: expr:
100 * rate(ifHCInOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 50 rate(ifHCInOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 0.5
for: 5m for: 5m
keep_firing_for: 10m keep_firing_for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
Usage: !unsafe '{{ printf "%.2f" $value }} %' Usage: !unsafe "{{ $value | humanizePercentage }} %"
Interface: !unsafe "{{ $labels.ifName }} Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
- alert: SwitchInLinkUsage - alert: SwitchInLinkUsage
expr: expr:
100 * rate(ifHCInOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 80 rate(ifHCInOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 0.8
for: 5m for: 5m
keep_firing_for: 10m keep_firing_for: 10m
labels: labels:
severity: critical severity: critical
annotations: annotations:
Usage: !unsafe '{{ printf "%.2f" $value }} %' Usage: !unsafe "{{ $value | humanizePercentage }} %"
Interface: !unsafe "{{ $labels.ifName }} Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
- alert: SwitchOutLinkUsage - alert: SwitchOutLinkUsage
expr: expr:
100 * rate(ifHCOutOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 50 rate(ifHCOutOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 0.5
for: 5m for: 5m
keep_firing_for: 10m keep_firing_for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
Usage: !unsafe '{{ printf "%.2f" $value }} %' Usage: !unsafe "{{ $value | humanizePercentage }} %"
Interface: !unsafe "{{ $labels.ifName }} Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
- alert: SwitchOutLinkUsage - alert: SwitchOutLinkUsage
expr: expr:
100 * rate(ifHCOutOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 80 rate(ifHCOutOctets[5m]) / (ifHighSpeed * 1000000 / 8) > 0.8
for: 5m for: 5m
keep_firing_for: 10m keep_firing_for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
Usage: !unsafe '{{ printf "%.2f" $value }} %' Usage: !unsafe "{{ $value | humanizePercentage }} %"
Interface: !unsafe "{{ $labels.ifName }} Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
... ...