add alerts for node
This commit is contained in:
parent
8b6986d955
commit
70831ce0d1
3 changed files with 163 additions and 2 deletions
147
roles/prometheus-node-exporter/files/alerts-node.yml
Normal file
147
roles/prometheus-node-exporter/files/alerts-node.yml
Normal file
|
@ -0,0 +1,147 @@
|
||||||
|
---
|
||||||
|
groups:
|
||||||
|
- name: AllInstances
|
||||||
|
rules:
|
||||||
|
|
||||||
|
# Fires once `up{job='node'}` has been 0 for 5 minutes.
- alert: InstanceDown
  expr: up{job='node'} == 0
  for: 5m
  labels:
    severity: critical
  annotations:
    title: "Instance {{ $labels.instance }} down"
    description: "{{ $labels.instance }} has been down for more than 5 minutes."
|
||||||
|
|
||||||
|
# Fires when the used percentage of a filesystem
# (100 - avail * 100 / size) stays above 80 for 1 minute.
- alert: OutOfDiskSpace
  expr: (100 - node_filesystem_avail_bytes{} *100 / node_filesystem_size_bytes{}) > 80
  for: 1m
  labels:
    severity: warning
  annotations:
    title: "`{{ $labels.instance }}:{{ $labels.mountpoint }}` is out of space"
    # Folded scalar: the two lines below are joined with a single space.
    description: >-
      Partition `{{ $labels.mountpoint }}` (`{{ $labels.device }}`)
      of {{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its capacity.
|
||||||
|
|
||||||
|
# Fires when (MemTotal - MemFree - Cached - Buffers) / MemTotal * 100
# stays above 80 for 1 minute.
- alert: OutOfMemory
  expr: >-
    (
      node_memory_MemTotal_bytes
      - node_memory_MemFree_bytes
      - node_memory_Cached_bytes
      - node_memory_Buffers_bytes
    ) / node_memory_MemTotal_bytes * 100 > 80
  for: 1m
  labels:
    severity: warning
  annotations:
    title: "{{ $labels.instance }} is out of memory"
    description: '{{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its memory capacity.'
|
||||||
|
|
||||||
|
# Fires when (files - files_free) / files * 100 is at or above 90
# for 5 minutes on a filesystem.
- alert: OutOfInode
  expr: >-
    (
      node_filesystem_files
      - node_filesystem_files_free
    ) / node_filesystem_files * 100 >= 90
  for: 5m
  labels:
    severity: warning
  annotations:
    title: "`{{ $labels.instance }}:{{ $labels.mountpoint }}` is out of Inodes"
    # Folded scalar: the two lines below are joined with a single space.
    description: >-
      Partition {{ $labels.mountpoint }} ({{ $labels.device }})
      of {{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its Inodes.
|
||||||
|
|
||||||
|
# Fires when (SwapTotal - SwapFree) / SwapTotal * 100 is at or above 50
# for 5 minutes.
- alert: Swapping
  expr: >-
    (
      node_memory_SwapTotal_bytes
      - node_memory_SwapFree_bytes
    ) / node_memory_SwapTotal_bytes * 100 >= 50
  for: 5m
  annotations:
    title: '{{ $labels.instance }} is using a lot of swap'
    # $value is the swap usage percentage computed by `expr`, so the message
    # must name swap rather than memory.
    description: >-
      {{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its swap capacity.
  labels:
    severity: warning
|
||||||
|
|
||||||
|
# Fires when a `node_hwmon_temp_celsius` series stays above 79 for 5 minutes.
- alert: PhysicalComponentTooHot
  expr: node_hwmon_temp_celsius > 79
  for: 5m
  labels:
    severity: critical
  annotations:
    title: "{{ $labels.instance }} is heating up"
    description: "The internal temperature of {{ $labels.instance }} is {{ $value }}°C!"
|
||||||
|
|
||||||
|
# Fires immediately (`for: 0m`) when `node_hwmon_temp_crit_alarm_celsius`
# equals 1.
# NOTE(review): presumably this series is a 0/1 alarm flag - confirm against
# the values the exporter actually reports.
- alert: PhysicalComponentHeatAlarm
  expr: node_hwmon_temp_crit_alarm_celsius == 1
  for: 0m
  labels:
    severity: critical
  annotations:
    title: "The temperature alarm of {{ $labels.instance }} is up"
    description: "Do something!"
|
||||||
|
|
||||||
|
# Fires immediately (`for: 0m`) when `node_vmstat_oom_kill` increased
# over the last minute.
- alert: OOMKill
  expr: increase(node_vmstat_oom_kill[1m]) > 0
  for: 0m
  annotations:
    title: 'The kernel is killing processes'
    # Spelling fixed: "proccesses" -> "processes".
    description: >-
      The kernel killed {{ $value }} processes (OOM killer)
  labels:
    severity: warning
|
||||||
|
|
||||||
|
# Fires immediately (`for: 0m`) when `node_edac_correctable_errors_total`
# increased over the last minute.
- alert: CorrectableErrorDetected
  expr: increase(node_edac_correctable_errors_total[1m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    title: "Memory errors have been corrected"
    description: '{{ $value | printf "%.1f" }} error(s) have been corrected (EDAC)'
|
||||||
|
|
||||||
|
# Fires immediately (`for: 0m`) when
# `node_edac_csrow_uncorrectable_errors_total` increased over the last minute.
- alert: UncorrectableErrorDetected
  expr: increase(node_edac_csrow_uncorrectable_errors_total[1m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    title: "Memory errors could not be corrected"
    description: '{{ $value | printf "%.1f" }} error(s) could not be corrected (EDAC)'
|
||||||
|
|
||||||
|
# Fires when `smartmon_device_smart_healthy` is below 1 for 10 minutes.
# The `and on (instance, disk)` join keeps only disks that have a matching
# `smartmon_device_info` series whose product is not "QEMU HARDDISK".
- alert: UnhealthyDisk
  expr: >-
    (
      smartmon_device_smart_healthy
      and on (instance, disk)
      smartmon_device_info{product!="QEMU HARDDISK"}
    ) < 1
  for: 10m
  annotations:
    title: '`{{ $labels.instance }}:{{ $labels.disk }}` is unhealthy'
    # Tool name spelled out: "Smartools" -> "Smartmontools".
    description: >-
      Smartmontools detected that `{{ $labels.disk }}` on {{ $labels.instance }} is unhealthy
      and will probably need to be changed.
  labels:
    severity: critical
|
||||||
|
|
||||||
|
# Fires when `node_systemd_unit_state{state="failed"}` equals 1
# for 10 minutes.
- alert: ServiceFailed
  expr: node_systemd_unit_state{state="failed"}==1
  for: 10m
  labels:
    severity: warning
  annotations:
    title: "{{ $labels.name }} failed"
    description: "The systemd service {{ $labels.name }} failed on {{ $labels.instance }}"
|
||||||
|
...
|
|
@ -3,3 +3,8 @@
|
||||||
systemd:
|
systemd:
|
||||||
name: prometheus-node-exporter
|
name: prometheus-node-exporter
|
||||||
state: restarted
|
state: restarted
|
||||||
|
# Handler: restarts the `prometheus` systemd unit, delegated to the host named
# by `appointed_prometheus_server`.
- name: Restart appointed_prometheus_server
  delegate_to: "{{ appointed_prometheus_server }}"
  systemd:
    name: prometheus
    state: restarted
|
||||||
|
|
|
@ -89,7 +89,7 @@
|
||||||
|
|
||||||
- name: Get the list of targets of the server
|
- name: Get the list of targets of the server
|
||||||
slurp:
|
slurp:
|
||||||
src: /etc/prometheus/node-targets.json
|
src: /etc/prometheus/targets/node-targets.json
|
||||||
register: server_target_file
|
register: server_target_file
|
||||||
delegate_to: "{{ appointed_prometheus_server }}"
|
delegate_to: "{{ appointed_prometheus_server }}"
|
||||||
|
|
||||||
|
@ -110,3 +110,12 @@
|
||||||
delegate_to: "{{ appointed_prometheus_server }}"
|
delegate_to: "{{ appointed_prometheus_server }}"
|
||||||
when: (lan_address + '/' + ansible_facts['nodename']) not in server_target.0.targets
|
when: (lan_address + '/' + ansible_facts['nodename']) not in server_target.0.targets
|
||||||
|
|
||||||
|
# Copies the role file `alerts-node.yml` to /etc/prometheus/alerts/node.yml on
# the host named by `appointed_prometheus_server`, then notifies the
# "Restart appointed_prometheus_server" handler.
- name: Add alert rules for node on the prometheus server
  delegate_to: "{{ appointed_prometheus_server }}"
  copy:
    src: alerts-node.yml
    dest: /etc/prometheus/alerts/node.yml
    owner: prometheus
    group: prometheus
    # Symbolic mode, quoted so it is always read as a string.
    mode: "u=rw,g=r,o=r"
  notify: Restart appointed_prometheus_server
|
||||||
|
|
Loading…
Reference in a new issue