---
# Prometheus alerting rules for host-level health checks.
#
# All rules live in a single group and attach two labels to each alert:
#   severity - `warning` or `critical`.
#   value    - the sample value that triggered the alert.
#
# NOTE(review): templating `$value` into a *label* makes the value part of the
# alert's identity. For expressions whose value fluctuates (disk, memory,
# inode, swap, temperature) this can restart the `for:` timer on every
# evaluation and keep the alert from ever firing. The value is already
# rendered in the annotations; consider dropping the `value` label once it is
# confirmed that no Alertmanager route or template depends on it.
groups:
  - name: AllInstances
    rules:
      # The scrape target of job `node` has been unreachable for 5 minutes.
      - alert: InstanceDown
        expr: up{job='node'} == 0
        for: 5m
        annotations:
          title: 'Instance {{ $labels.instance }} down'
          description: >-
            {{ $labels.instance }} has been down for more than 5 minutes.
        labels:
          value: "{{ $value }}"
          severity: critical

      # A filesystem has more than 80% of its bytes in use.
      - alert: OutOfDiskSpace
        expr: >-
          (
            100 - node_filesystem_avail_bytes * 100 / node_filesystem_size_bytes
          ) > 80
        for: 1m
        annotations:
          title: '`{{ $labels.instance }}:{{ $labels.mountpoint }}` is out of space'
          description: >-
            Partition `{{ $labels.mountpoint }}` (`{{ $labels.device }}`) of
            {{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its
            capacity.
        labels:
          value: "{{ $value }}"
          severity: warning

      # Memory in use (total minus free, page cache and buffers) exceeds 80%.
      - alert: OutOfMemory
        expr: >-
          (
            node_memory_MemTotal_bytes
            - node_memory_MemFree_bytes
            - node_memory_Cached_bytes
            - node_memory_Buffers_bytes
          ) / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        annotations:
          title: '{{ $labels.instance }} is out of memory'
          description: >-
            {{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its
            memory capacity.
        labels:
          value: "{{ $value }}"
          severity: warning

      # A filesystem has at least 90% of its inodes in use.
      - alert: OutOfInode
        expr: >-
          (
            node_filesystem_files - node_filesystem_files_free
          ) / node_filesystem_files * 100 >= 90
        for: 5m
        annotations:
          title: '`{{ $labels.instance }}:{{ $labels.mountpoint }}` is out of Inodes'
          description: >-
            Partition {{ $labels.mountpoint }} ({{ $labels.device }}) of
            {{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its
            Inodes.
        labels:
          value: "{{ $value }}"
          severity: warning

      # At least half of the configured swap is in use.
      - alert: Swapping
        expr: >-
          (
            node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes
          ) / node_memory_SwapTotal_bytes * 100 >= 50
        for: 5m
        annotations:
          title: '{{ $labels.instance }} is using a lot of swap'
          # The percentage computed above is swap usage, not memory usage.
          description: >-
            {{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its
            swap space.
        labels:
          value: "{{ $value }}"
          severity: warning

      # A hardware temperature sensor has read above 79 °C for 5 minutes.
      - alert: PhysicalComponentTooHot
        expr: node_hwmon_temp_celsius > 79
        for: 5m
        annotations:
          title: '{{ $labels.instance }} is heating up'
          description: >-
            The internal temperature of {{ $labels.instance }} is
            {{ $value }}°C!
        labels:
          value: "{{ $value }}"
          severity: critical

      # A sensor's critical-temperature alarm flag is raised; fires at once.
      - alert: PhysicalComponentHeatAlarm
        expr: node_hwmon_temp_crit_alarm_celsius == 1
        for: 0m
        annotations:
          title: 'The temperature alarm of {{ $labels.instance }} is up'
          description: >-
            Do something!
        labels:
          value: "{{ $value }}"
          severity: critical

      # The OOM-kill counter increased during the last minute.
      - alert: OOMKill
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        annotations:
          title: 'The kernel is killing processes'
          description: >-
            The kernel killed {{ $value }} processes (OOM killer)
        labels:
          value: "{{ $value }}"
          severity: warning

      # The EDAC correctable-error counter increased during the last minute.
      - alert: CorrectableErrorDetected
        expr: increase(node_edac_correctable_errors_total[1m]) > 0
        for: 0m
        annotations:
          title: 'Memory errors have been corrected'
          description: >-
            {{ $value | printf "%.1f" }} error(s) have been corrected (EDAC)
        labels:
          value: "{{ $value }}"
          severity: warning

      # The EDAC uncorrectable-error counter increased during the last minute.
      - alert: UncorrectableErrorDetected
        expr: increase(node_edac_uncorrectable_errors_total[1m]) > 0
        for: 0m
        annotations:
          title: 'Memory errors could not be corrected'
          description: >-
            {{ $value | printf "%.1f" }} error(s) could not be corrected (EDAC)
        labels:
          value: "{{ $value }}"
          severity: warning

      # SMART reports a disk as not healthy. The `and on (instance, disk)`
      # join keeps only disks whose product is not "QEMU HARDDISK".
      - alert: UnhealthyDisk
        expr: >-
          (
            smartmon_device_smart_healthy
            and on (instance, disk)
            smartmon_device_info{product!="QEMU HARDDISK"}
          ) < 1
        for: 10m
        annotations:
          title: '`{{ $labels.instance }}:{{ $labels.disk }}` is unhealthy'
          description: >-
            Smartmontools detected that `{{ $labels.disk }}` on
            {{ $labels.instance }} is unhealthy and will probably need to be
            changed.
        labels:
          value: "{{ $value }}"
          severity: critical

      # A systemd unit has been in the `failed` state for 10 minutes.
      - alert: ServiceFailed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 10m
        annotations:
          title: '{{ $labels.name }} failed'
          description: >-
            The systemd service {{ $labels.name }} failed on
            {{ $labels.instance }}
        labels:
          value: "{{ $value }}"
          severity: warning
...