add alerts for node

2021-10-08 22:20:08 +02:00 · 2021-10-08 22:20:08 +02:00 · 70831ce0d1
commit 70831ce0d1
parent 8b6986d955
3 changed files with 163 additions and 2 deletions
--- a/roles/prometheus-node-exporter/files/alerts-node.yml
+++ b/roles/prometheus-node-exporter/files/alerts-node.yml
@ -0,0 +1,147 @@
+---
+groups:
+- name: AllInstances
+  rules:
+
+  # Fires when a target of the `node` job has reported `up == 0`
+  # continuously for five minutes.
+  - alert: InstanceDown
+    expr: "up{job='node'} == 0"
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      title: "Instance {{ $labels.instance }} down"
+      description: "{{ $labels.instance }} has been down for more than 5 minutes."
+
+  # Fires when the non-available share of a filesystem
+  # (100 - avail * 100 / size) stays above 80 % for one minute.
+  - alert: OutOfDiskSpace
+    expr: '(100 - node_filesystem_avail_bytes{} *100 / node_filesystem_size_bytes{}) > 80'
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      title: "`{{ $labels.instance }}:{{ $labels.mountpoint }}` is out of space"
+      description: >-
+        Partition `{{ $labels.mountpoint }}` (`{{ $labels.device }}`)
+        of {{ $labels.instance }}
+        uses {{ $value | printf "%.1f" }}% of its capacity.
+
+  # Fires when (MemTotal - MemFree - Cached - Buffers) / MemTotal stays
+  # above 80 % for one minute.
+  - alert: OutOfMemory
+    # Literal block: every continuation line is indented, so the parsed
+    # string keeps the same line breaks as a folded scalar would.
+    expr: |-
+      (
+          node_memory_MemTotal_bytes
+          - node_memory_MemFree_bytes
+          - node_memory_Cached_bytes
+          - node_memory_Buffers_bytes
+        ) / node_memory_MemTotal_bytes * 100 > 80
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      title: "{{ $labels.instance }} is out of memory"
+      description: '{{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its memory capacity.'
+
+  # Fires when the used share of a filesystem's inodes
+  # ((files - files_free) / files) is at or above 90 % for five minutes.
+  - alert: OutOfInode
+    # Literal block: every continuation line is indented, so the parsed
+    # string keeps the same line breaks as a folded scalar would.
+    expr: |-
+      (
+         node_filesystem_files
+         - node_filesystem_files_free
+       ) / node_filesystem_files * 100 >= 90
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      title: "`{{ $labels.instance }}:{{ $labels.mountpoint }}` is out of Inodes"
+      description: >-
+        Partition {{ $labels.mountpoint }} ({{ $labels.device }})
+        of {{ $labels.instance }}
+        uses {{ $value | printf "%.1f" }}% of its Inodes.
+
+  # Fires when the used share of swap ((SwapTotal - SwapFree) / SwapTotal)
+  # is at or above 50 % for five minutes.
+  - alert: Swapping
+    expr: >-
+      (
+         node_memory_SwapTotal_bytes
+         - node_memory_SwapFree_bytes
+       ) / node_memory_SwapTotal_bytes * 100 >= 50
+    for: 5m
+    annotations:
+      title: '{{ $labels.instance }} is using a lot of swap'
+      # $value is the swap usage percentage computed by expr, so the
+      # message names swap rather than memory.
+      description: >-
+         {{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its swap capacity.
+    labels:
+      severity: warning
+
+  # Fires when a hwmon temperature reading stays above 79 for five minutes.
+  - alert: PhysicalComponentTooHot
+    expr: "node_hwmon_temp_celsius > 79"
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      title: "{{ $labels.instance }} is heating up"
+      description: "The internal temperature of {{ $labels.instance }} is {{ $value }}°C!"
+
+  # Fires immediately (for: 0m) when the hwmon critical-temperature alarm
+  # series equals 1.
+  - alert: PhysicalComponentHeatAlarm
+    expr: "node_hwmon_temp_crit_alarm_celsius == 1"
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      title: "The temperature alarm of {{ $labels.instance }} is up"
+      description: "Do something!"
+
+  # Fires immediately (for: 0m) when the OOM-kill counter increased within
+  # the last minute.
+  # NOTE(review): a 1m range needs at least two samples inside it; confirm
+  # the scrape interval of the node job is well below one minute.
+  - alert: OOMKill
+    expr: increase(node_vmstat_oom_kill[1m]) > 0
+    for: 0m
+    annotations:
+      title: 'The kernel is killing processes'
+      description: >-
+        The kernel killed {{ $value }} processes (OOM killer)
+    labels:
+      severity: warning
+
+  # Fires immediately (for: 0m) when the EDAC correctable-error counter
+  # increased within the last minute.
+  - alert: CorrectableErrorDetected
+    expr: "increase(node_edac_correctable_errors_total[1m]) > 0"
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      title: "Memory errors have been corrected"
+      description: '{{ $value | printf "%.1f" }} error(s) have been corrected (EDAC)'
+
+  # Fires immediately (for: 0m) when the per-csrow EDAC uncorrectable-error
+  # counter increased within the last minute.
+  - alert: UncorrectableErrorDetected
+    expr: "increase(node_edac_csrow_uncorrectable_errors_total[1m]) > 0"
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      title: "Memory errors could not be corrected"
+      description: '{{ $value | printf "%.1f" }} error(s) could not be corrected (EDAC)'
+
+  # Fires when a disk's SMART health series has been below 1 for ten
+  # minutes. The `and on (instance, disk)` join keeps only disks that have
+  # a device_info series whose product is not "QEMU HARDDISK".
+  - alert: UnhealthyDisk
+    expr: >-
+      (
+        smartmon_device_smart_healthy
+          and on (instance, disk)
+        smartmon_device_info{product!="QEMU HARDDISK"}
+      ) < 1
+    for: 10m
+    annotations:
+      title: '`{{ $labels.instance }}:{{ $labels.disk }}` is unhealthy'
+      description: >-
+        smartmontools detected that `{{ $labels.disk }}` on {{ $labels.instance }} is unhealthy
+        and will probably need to be changed.
+    labels:
+      severity: critical
+
+  # Fires when a systemd unit has been in the `failed` state for ten minutes.
+  - alert: ServiceFailed
+    expr: 'node_systemd_unit_state{state="failed"}==1'
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      title: "{{ $labels.name }} failed"
+      description: "The systemd service {{ $labels.name }} failed on {{ $labels.instance }}"
+...
--- a/roles/prometheus-node-exporter/handlers/main.yml
+++ b/roles/prometheus-node-exporter/handlers/main.yml
@ -3,3 +3,8 @@
  systemd:
    name: prometheus-node-exporter
    state: restarted
+# Restarts the `prometheus` systemd unit on the host named by
+# `appointed_prometheus_server` rather than on the current play host.
+- name: Restart appointed_prometheus_server
+  delegate_to: "{{ appointed_prometheus_server }}"
+  systemd:
+    state: restarted
+    name: prometheus
--- a/roles/prometheus-node-exporter/tasks/main.yml
+++ b/roles/prometheus-node-exporter/tasks/main.yml
@ -89,7 +89,7 @@

 - name: Get the list of targets of the server
  slurp:
-    src: /etc/prometheus/node-targets.json
+    src: /etc/prometheus/targets/node-targets.json
  register: server_target_file
  delegate_to: "{{ appointed_prometheus_server }}"

@ -110,3 +110,12 @@
      delegate_to: "{{ appointed_prometheus_server }}"
  when: (lan_address + '/' + ansible_facts['nodename']) not in server_target.0.targets

+# Copies the role's alerts-node.yml to the Prometheus server (delegated)
+# and notifies the "Restart appointed_prometheus_server" handler on change.
+- name: Add alert rules for node on the prometheus server
+  delegate_to: "{{ appointed_prometheus_server }}"
+  notify: Restart appointed_prometheus_server
+  copy:
+    src: alerts-node.yml
+    dest: /etc/prometheus/alerts/node.yml
+    mode: "u=rw,g=r,o=r"
+    owner: prometheus
+    group: prometheus