Browse Source

add label 'value' to alerts

monitoring
histausse 1 year ago
parent
commit
5c4c90e501
Signed by: histausse
GPG Key ID: 67486F107F62E9E9
  1. 12
      roles/prometheus-node-exporter/files/alerts-node.yml

12
roles/prometheus-node-exporter/files/alerts-node.yml

@ -11,6 +11,7 @@ groups:
description: >-
{{ $labels.instance }} has been down for more than 5 minutes.
labels:
value: "{{ $value }}"
severity: critical
- alert: OutOfDiskSpace
@ -22,6 +23,7 @@ groups:
Partition `{{ $labels.mountpoint }}` (`{{ $labels.device }}`) of {{ $labels.instance }}
uses {{ $value | printf "%.1f" }}% of its capacity.
labels:
value: "{{ $value }}"
severity: warning
- alert: OutOfMemory
@ -38,6 +40,7 @@ groups:
description: >-
{{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its memory capacity.
labels:
value: "{{ $value }}"
severity: warning
- alert: OutOfInode
@ -53,6 +56,7 @@ groups:
Partition {{ $labels.mountpoint }} ({{ $labels.device }}) of {{ $labels.instance }}
uses {{ $value | printf "%.1f" }}% of its Inodes.
labels:
value: "{{ $value }}"
severity: warning
- alert: Swapping
@ -67,6 +71,7 @@ groups:
description: >-
{{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its memory capacity.
labels:
value: "{{ $value }}"
severity: warning
- alert: PhysicalComponentTooHot
@ -77,6 +82,7 @@ groups:
description: >-
The internal temperature of {{ $labels.instance }} is {{ $value }}°C!
labels:
value: "{{ $value }}"
severity: critical
- alert: PhysicalComponentHeatAlarm
@ -87,6 +93,7 @@ groups:
description: >-
Do something!
labels:
value: "{{ $value }}"
severity: critical
- alert: OOMKill
@ -97,6 +104,7 @@ groups:
description: >-
The kernel killed {{ $value }} processes (OOM killer)
labels:
value: "{{ $value }}"
severity: warning
- alert: CorrectableErrorDetected
@ -107,6 +115,7 @@ groups:
description: >-
{{ $value | printf "%.1f" }} error(s) have been corrected (EDAC)
labels:
value: "{{ $value }}"
severity: warning
- alert: UncorrectableErrorDetected
@ -117,6 +126,7 @@ groups:
description: >-
{{ $value | printf "%.1f" }} error(s) could not be corrected (EDAC)
labels:
value: "{{ $value }}"
severity: warning
- alert: UnhealthyDisk
@ -133,6 +143,7 @@ groups:
Smartools detected that `{{ $labels.disk }}` on {{ $labels.instance }} is unhealthy
and will probably need to be changed.
labels:
value: "{{ $value }}"
severity: critical
- alert: ServiceFailed
@ -143,5 +154,6 @@ groups:
description: >-
The systemd service {{ $labels.name }} failed on {{ $labels.instance }}
labels:
value: "{{ $value }}"
severity: warning
...

Loading…
Cancel
Save