ansible/roles/prometheus-node-exporter/files/alerts-node.yml

182 lines
5.4 KiB
YAML
Raw Normal View History

2021-10-08 22:20:08 +02:00
---
groups:
2021-10-10 01:33:08 +02:00
- name: NodeAllInstances
2021-10-08 22:20:08 +02:00
rules:
# Fires once a target of the 'node' job has reported up == 0 for five
# minutes in a row.
- alert: InstanceDown
  expr: "up{job='node'} == 0"
  for: 5m
  labels:
    severity: critical
    value: '{{ $value }}'
  annotations:
    title: "Instance {{ $labels.instance }} down"
    description: '{{ $labels.instance }} has been down for more than 5 minutes.'
# Fires when a filesystem has been more than 80% full for one minute.
#
# The measured percentage is published as an annotation, not a label.
# Labels are part of an alert's identity: a label whose content changes
# between evaluations makes Prometheus see a brand-new alert each time,
# so the `for` timer keeps restarting and the alert may never fire.
- alert: OutOfDiskSpace
  expr: (100 - node_filesystem_avail_bytes * 100 / node_filesystem_size_bytes) > 80
  for: 1m
  labels:
    severity: warning
  annotations:
    title: '`{{ $labels.instance }}:{{ $labels.mountpoint }}` is out of space'
    description: >-
      Partition `{{ $labels.mountpoint }}` (`{{ $labels.device }}`)
      of {{ $labels.instance }}
      uses {{ $value | printf "%.1f" }}% of its capacity.
    value: '{{ $value }}'
# Fires when used memory (total minus free, page cache and buffers) has
# exceeded 80% of total memory for one minute.
#
# The measured percentage is published as an annotation, not a label.
# Labels are part of an alert's identity: a label whose content changes
# between evaluations makes Prometheus see a brand-new alert each time,
# so the `for` timer keeps restarting and the alert may never fire.
- alert: OutOfMemory
  expr: >-
    (
      node_memory_MemTotal_bytes
      - node_memory_MemFree_bytes
      - node_memory_Cached_bytes
      - node_memory_Buffers_bytes
    ) / node_memory_MemTotal_bytes * 100 > 80
  for: 1m
  labels:
    severity: warning
  annotations:
    title: '{{ $labels.instance }} is out of memory'
    description: >-
      {{ $labels.instance }} uses {{ $value | printf "%.1f" }}%
      of its memory capacity.
    value: '{{ $value }}'
# Fires when a filesystem has had 90% or more of its inodes in use for
# five minutes.
#
# The measured percentage is published as an annotation, not a label.
# Labels are part of an alert's identity: a label whose content changes
# between evaluations makes Prometheus see a brand-new alert each time,
# so the `for` timer keeps restarting and the alert may never fire.
- alert: OutOfInode
  expr: >-
    (
      node_filesystem_files
      - node_filesystem_files_free
    ) / node_filesystem_files * 100 >= 90
  for: 5m
  labels:
    severity: warning
  annotations:
    title: '`{{ $labels.instance }}:{{ $labels.mountpoint }}` is out of Inodes'
    description: >-
      Partition {{ $labels.mountpoint }} ({{ $labels.device }})
      of {{ $labels.instance }}
      uses {{ $value | printf "%.1f" }}% of its Inodes.
    value: '{{ $value }}'
# Fires when 50% or more of the configured swap has been in use for five
# minutes.
#
# The description reports swap usage (the expression only reads the
# SwapTotal/SwapFree gauges), so it says "swap space", not memory.
#
# The measured percentage is published as an annotation, not a label.
# Labels are part of an alert's identity: a label whose content changes
# between evaluations makes Prometheus see a brand-new alert each time,
# so the `for` timer keeps restarting and the alert may never fire.
- alert: Swapping
  expr: >-
    (
      node_memory_SwapTotal_bytes
      - node_memory_SwapFree_bytes
    ) / node_memory_SwapTotal_bytes * 100 >= 50
  for: 5m
  labels:
    severity: warning
  annotations:
    title: '{{ $labels.instance }} is using a lot of swap'
    description: >-
      {{ $labels.instance }} uses {{ $value | printf "%.1f" }}%
      of its swap space.
    value: '{{ $value }}'
# Fires when a hardware-monitoring temperature sensor has read above 79
# degrees Celsius for five minutes.
#
# The reading is published as an annotation, not a label. Labels are part
# of an alert's identity: a label whose content changes between
# evaluations makes Prometheus see a brand-new alert each time, so the
# `for` timer keeps restarting and the alert may never fire.
- alert: PhysicalComponentTooHot
  expr: node_hwmon_temp_celsius > 79
  for: 5m
  labels:
    severity: critical
  annotations:
    title: '{{ $labels.instance }} is heating up'
    description: >-
      The internal temperature of {{ $labels.instance }} is {{ $value }}°C!
    value: '{{ $value }}'
# Fires immediately (no hold time) when a sensor's critical-temperature
# alarm flag equals 1.
- alert: PhysicalComponentHeatAlarm
  expr: 'node_hwmon_temp_crit_alarm_celsius == 1'
  for: 0m
  labels:
    severity: critical
    value: '{{ $value }}'
  annotations:
    title: "The temperature alarm of {{ $labels.instance }} is up"
    description: 'Do something!'
# Fires immediately when the OOM-kill counter increased during the last
# minute.
#
# The computed increase is published as an annotation, not a label.
# Labels are part of an alert's identity: a label whose content changes
# between evaluations makes Prometheus see a brand-new alert each time,
# which with `for: 0m` shows up as a stream of distinct alerts.
#
# NOTE(review): increase() over [1m] needs at least two samples inside the
# window — confirm the scrape interval for this job is short enough.
- alert: OOMKill
  expr: increase(node_vmstat_oom_kill[1m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    title: 'The kernel is killing processes'
    description: >-
      The kernel killed {{ $value }} processes (OOM killer)
    value: '{{ $value }}'
# Fires immediately when the EDAC correctable-error counter increased
# during the last minute.
#
# The computed increase is published as an annotation, not a label.
# Labels are part of an alert's identity: a label whose content changes
# between evaluations makes Prometheus see a brand-new alert each time,
# which with `for: 0m` shows up as a stream of distinct alerts.
#
# NOTE(review): increase() over [1m] needs at least two samples inside the
# window — confirm the scrape interval for this job is short enough.
- alert: CorrectableErrorDetected
  expr: increase(node_edac_correctable_errors_total[1m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    title: 'Memory errors have been corrected'
    description: >-
      {{ $value | printf "%.1f" }} error(s) have been corrected (EDAC)
    value: '{{ $value }}'
# Fires immediately when the EDAC uncorrectable-error counter increased
# during the last minute.
#
# The computed increase is published as an annotation, not a label.
# Labels are part of an alert's identity: a label whose content changes
# between evaluations makes Prometheus see a brand-new alert each time,
# which with `for: 0m` shows up as a stream of distinct alerts.
#
# NOTE(review): increase() over [1m] needs at least two samples inside the
# window — confirm the scrape interval for this job is short enough.
- alert: UncorrectableErrorDetected
  expr: increase(node_edac_uncorrectable_errors_total[1m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    title: 'Memory errors could not be corrected'
    description: >-
      {{ $value | printf "%.1f" }} error(s) could not be corrected (EDAC)
    value: '{{ $value }}'
# Fires when a disk's SMART health gauge has been below 1 for ten minutes.
# Only disks that have a matching smartmon_device_info series (joined on
# instance and disk) whose product is not "QEMU HARDDISK" are considered.
- alert: UnhealthyDisk
  expr: '( smartmon_device_smart_healthy and on (instance, disk) smartmon_device_info{product!="QEMU HARDDISK"} ) < 1'
  for: 10m
  labels:
    severity: critical
    value: '{{ $value }}'
  annotations:
    title: "`{{ $labels.instance }}:{{ $labels.disk }}` is unhealthy"
    description: >-
      Smartools detected that `{{ $labels.disk }}`
      on {{ $labels.instance }} is unhealthy
      and will probably need to be changed.
# Fires when a systemd unit has been reporting the "failed" state for ten
# minutes.
- alert: ServiceFailed
  expr: 'node_systemd_unit_state{state="failed"}==1'
  for: 10m
  labels:
    severity: warning
    value: '{{ $value }}'
  annotations:
    title: "{{ $labels.name }} failed"
    description: "The systemd service {{ $labels.name }} failed on {{ $labels.instance }}"
2021-10-12 22:28:47 +02:00
# Fires when a certificate's expiry timestamp is less than 2592000 seconds
# (30 days) away from the current time.
#
# The remaining lifetime is published as an annotation, not a label. It is
# computed from time(), so it differs on every evaluation; as a label it
# would change the alert's identity each cycle and Prometheus would keep
# creating new alerts instead of tracking a single one.
- alert: CertExpLess30days
  expr: (local_x509_expiry_date{job="blackbox internal tls"} - time()) < 2592000
  labels:
    severity: warning
  annotations:
    title: '{{ $labels.cname }} will expire soon'
    description: >-
      The certificate {{ $labels.cname }} on {{ $labels.instance }}
      at {{ $labels.file }}
      will expire in {{ $value | humanizeDuration }}, it's time to renew it.
    value: '{{ $value }}'
# Fires when a certificate's expiry timestamp is less than 864000 seconds
# (10 days) away from the current time.
#
# The remaining lifetime is published as an annotation, not a label. It is
# computed from time(), so it differs on every evaluation; as a label it
# would change the alert's identity each cycle and Prometheus would keep
# creating new alerts instead of tracking a single one.
- alert: CertExpLess10days
  expr: (local_x509_expiry_date{job="blackbox internal tls"} - time()) < 864000
  labels:
    severity: critical
  annotations:
    title: '{{ $labels.cname }} expiry is imminent!'
    description: >-
      The certificate {{ $labels.cname }} on {{ $labels.instance }}
      at {{ $labels.file }}
      will expire in {{ $value | humanizeDuration }}, RENEW IT!!!
    value: '{{ $value }}'
2021-10-08 22:20:08 +02:00
...