From 70831ce0d1d724432c79c15d4a14602542e5aa9f Mon Sep 17 00:00:00 2001
From: Jean-Marie Mineau
Date: Fri, 8 Oct 2021 22:20:08 +0200
Subject: [PATCH] add alerts for node

---
Review notes (free text placed here, before the diffstat, is not part of
the commit message or of the diff):
 * Line structure of the patch restored; the two-space YAML indentation
   of alerts-node.yml is reconstructed and should be checked against the
   author's tree.
 * Swapping: the description reported the value as memory usage although
   the expression measures swap usage.
 * Typos fixed in alert descriptions: "proccesses", "Smartools".
 * Inline comments added on each "- alert:" line; the added line count
   (147) is unchanged. The blob id on the "index" line still refers to
   the pre-review contents.
 * NOTE(review): the tasks/main.yml hunk at line 109 keeps
   "dest: /etc/prometheus/node-targets.json" while the slurp task now
   reads /etc/prometheus/targets/node-targets.json -- confirm that the
   write path should not move to targets/ as well.

 .../files/alerts-node.yml                     | 147 ++++++++++++++++++
 .../handlers/main.yml                         |   5 +
 roles/prometheus-node-exporter/tasks/main.yml |  13 +-
 3 files changed, 163 insertions(+), 2 deletions(-)
 create mode 100644 roles/prometheus-node-exporter/files/alerts-node.yml

diff --git a/roles/prometheus-node-exporter/files/alerts-node.yml b/roles/prometheus-node-exporter/files/alerts-node.yml
new file mode 100644
index 0000000..387dcc1
--- /dev/null
+++ b/roles/prometheus-node-exporter/files/alerts-node.yml
@@ -0,0 +1,147 @@
+---
+groups:
+- name: AllInstances
+  rules:
+
+  - alert: InstanceDown  # scrape of a job='node' target failed
+    expr: up{job='node'} == 0
+    for: 5m
+    annotations:
+      title: 'Instance {{ $labels.instance }} down'
+      description: >-
+        {{ $labels.instance }} has been down for more than 5 minutes.
+    labels:
+      severity: critical
+
+  - alert: OutOfDiskSpace  # more than 80% of a filesystem is used
+    expr: (100 - node_filesystem_avail_bytes{} *100 / node_filesystem_size_bytes{}) > 80
+    for: 1m
+    annotations:
+      title: '`{{ $labels.instance }}:{{ $labels.mountpoint }}` is out of space'
+      description: >-
+        Partition `{{ $labels.mountpoint }}` (`{{ $labels.device }}`) of {{ $labels.instance }}
+        uses {{ $value | printf "%.1f" }}% of its capacity.
+    labels:
+      severity: warning
+
+  - alert: OutOfMemory  # (total - free - cached - buffers) above 80%
+    expr: >-
+      (
+        node_memory_MemTotal_bytes
+        - node_memory_MemFree_bytes
+        - node_memory_Cached_bytes
+        - node_memory_Buffers_bytes
+      ) / node_memory_MemTotal_bytes * 100 > 80
+    for: 1m
+    annotations:
+      title: '{{ $labels.instance }} is out of memory'
+      description: >-
+        {{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its memory capacity.
+    labels:
+      severity: warning
+
+  - alert: OutOfInode  # 90% or more of a filesystem's inodes are used
+    expr: >-
+      (
+        node_filesystem_files
+        - node_filesystem_files_free
+      ) / node_filesystem_files * 100 >= 90
+    for: 5m
+    annotations:
+      title: '`{{ $labels.instance }}:{{ $labels.mountpoint }}` is out of Inodes'
+      description: >-
+        Partition {{ $labels.mountpoint }} ({{ $labels.device }}) of {{ $labels.instance }}
+        uses {{ $value | printf "%.1f" }}% of its Inodes.
+    labels:
+      severity: warning
+
+  - alert: Swapping  # 50% or more of the swap space is in use
+    expr: >-
+      (
+        node_memory_SwapTotal_bytes
+        - node_memory_SwapFree_bytes
+      ) / node_memory_SwapTotal_bytes * 100 >= 50
+    for: 5m
+    annotations:
+      title: '{{ $labels.instance }} is using a lot of swap'
+      description: >-
+        {{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its swap capacity.
+    labels:
+      severity: warning
+
+  - alert: PhysicalComponentTooHot  # a hwmon sensor reads above 79°C
+    expr: node_hwmon_temp_celsius > 79
+    for: 5m
+    annotations:
+      title: '{{ $labels.instance }} is heating up'
+      description: >-
+        The internal temperature of {{ $labels.instance }} is {{ $value }}°C!
+    labels:
+      severity: critical
+
+  - alert: PhysicalComponentHeatAlarm  # hwmon crit-alarm metric equals 1
+    expr: node_hwmon_temp_crit_alarm_celsius == 1
+    for: 0m
+    annotations:
+      title: 'The temperature alarm of {{ $labels.instance }} is up'
+      description: >-
+        Do something!
+    labels:
+      severity: critical
+
+  - alert: OOMKill  # oom_kill counter increased within the last minute
+    expr: increase(node_vmstat_oom_kill[1m]) > 0
+    for: 0m
+    annotations:
+      title: 'The kernel is killing processes'
+      description: >-
+        The kernel killed {{ $value }} processes (OOM killer)
+    labels:
+      severity: warning
+
+  - alert: CorrectableErrorDetected  # EDAC correctable counter increased
+    expr: increase(node_edac_correctable_errors_total[1m]) > 0
+    for: 0m
+    annotations:
+      title: 'Memory errors have been corrected'
+      description: >-
+        {{ $value | printf "%.1f" }} error(s) have been corrected (EDAC)
+    labels:
+      severity: warning
+
+  - alert: UncorrectableErrorDetected  # EDAC uncorrectable counter increased
+    expr: increase(node_edac_csrow_uncorrectable_errors_total[1m]) > 0
+    for: 0m
+    annotations:
+      title: 'Memory errors could not be corrected'
+      description: >-
+        {{ $value | printf "%.1f" }} error(s) could not be corrected (EDAC)
+    labels:
+      severity: warning
+
+  - alert: UnhealthyDisk  # SMART health below 1; "QEMU HARDDISK" excluded
+    expr: >-
+      (
+        smartmon_device_smart_healthy
+        and on (instance, disk)
+        smartmon_device_info{product!="QEMU HARDDISK"}
+      ) < 1
+    for: 10m
+    annotations:
+      title: '`{{ $labels.instance }}:{{ $labels.disk }}` is unhealthy'
+      description: >-
+        Smartmontools detected that `{{ $labels.disk }}` on {{ $labels.instance }} is unhealthy
+        and will probably need to be changed.
+    labels:
+      severity: critical
+
+  - alert: ServiceFailed  # a systemd unit is in the "failed" state
+    expr: node_systemd_unit_state{state="failed"}==1
+    for: 10m
+    annotations:
+      title: '{{ $labels.name }} failed'
+      description: >-
+        The systemd service {{ $labels.name }} failed on {{ $labels.instance }}
+    labels:
+      severity: warning
+...
diff --git a/roles/prometheus-node-exporter/handlers/main.yml b/roles/prometheus-node-exporter/handlers/main.yml
index f55aedb..e2be9c1 100644
--- a/roles/prometheus-node-exporter/handlers/main.yml
+++ b/roles/prometheus-node-exporter/handlers/main.yml
@@ -3,3 +3,8 @@
   systemd:
     name: prometheus-node-exporter
     state: restarted
+- name: Restart appointed_prometheus_server
+  systemd:
+    name: prometheus
+    state: restarted
+  delegate_to: "{{ appointed_prometheus_server }}"  # runs on the delegated host
diff --git a/roles/prometheus-node-exporter/tasks/main.yml b/roles/prometheus-node-exporter/tasks/main.yml
index c8cad53..26d5aa1 100644
--- a/roles/prometheus-node-exporter/tasks/main.yml
+++ b/roles/prometheus-node-exporter/tasks/main.yml
@@ -89,7 +89,7 @@
 
 - name: Get the list of targets of the server
   slurp:
-    src: /etc/prometheus/node-targets.json
+    src: /etc/prometheus/targets/node-targets.json
   register: server_target_file
   delegate_to: "{{ appointed_prometheus_server }}"
 
@@ -109,4 +109,13 @@
     dest: /etc/prometheus/node-targets.json
   delegate_to: "{{ appointed_prometheus_server }}"
   when: (lan_address + '/' + ansible_facts['nodename']) not in server_target.0.targets
-  
+
+- name: Add alert rules for node on the prometheus server
+  copy:
+    src: alerts-node.yml  # the file added under this role's files/ directory
+    dest: /etc/prometheus/alerts/node.yml
+    owner: prometheus
+    group: prometheus
+    mode: u=rw,g=r,o=r
+  delegate_to: "{{ appointed_prometheus_server }}"
+  notify: Restart appointed_prometheus_server