---
# Alerting rule groups (layout: groups -> rules -> one entry per alert).
groups:
# The only group visible in this file; every alert below belongs to it.
- name: AllInstances
  rules:

  # InstanceDown: the `up` series of a target in job 'node' has stayed at 0
  # for five minutes.
  - alert: InstanceDown
    expr: "up{job='node'} == 0"
    for: 5m
    labels:
      severity: critical
      # The expression only matches samples equal to 0, so this renders the
      # same text on every evaluation.
      value: '{{ $value }}'
    annotations:
      title: 'Instance {{ $labels.instance }} down'
      description: >-
        {{ $labels.instance }} has been down
        for more than 5 minutes.

  # OutOfDiskSpace: a filesystem has used more than 80% of its size for at
  # least one minute.
  - alert: OutOfDiskSpace
    # Used percentage = 100 - (available * 100 / size).
    expr: |-
      (
        100 - node_filesystem_avail_bytes * 100 / node_filesystem_size_bytes
      ) > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      title: '`{{ $labels.instance }}:{{ $labels.mountpoint }}` is out of space'
      description: >-
        Partition `{{ $labels.mountpoint }}` (`{{ $labels.device }}`) of {{ $labels.instance }}
        uses {{ $value | printf "%.1f" }}% of its capacity.
      # The raw sample is exposed as an annotation, not a label: a label
      # whose text changes between evaluations changes the alert's identity,
      # which restarts the `for` timer and yields a new alert per value.
      value: '{{ $value }}'

  # OutOfMemory: more than 80% of total memory has been in use for at least
  # one minute.
  - alert: OutOfMemory
    # In-use memory is computed as total minus free, cached and buffers.
    expr: |-
      (
        node_memory_MemTotal_bytes
        - node_memory_MemFree_bytes
        - node_memory_Cached_bytes
        - node_memory_Buffers_bytes
      ) / node_memory_MemTotal_bytes * 100 > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      title: '{{ $labels.instance }} is out of memory'
      description: >-
        {{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its memory capacity.
      # The raw sample is exposed as an annotation, not a label: a label
      # whose text changes between evaluations changes the alert's identity,
      # which restarts the `for` timer and yields a new alert per value.
      value: '{{ $value }}'

  # OutOfInode: a filesystem has had at least 90% of its inodes in use for
  # five minutes.
  - alert: OutOfInode
    # Used inodes (total - free) as a percentage of the total.
    expr: |-
      (
        node_filesystem_files
        - node_filesystem_files_free
      ) / node_filesystem_files * 100 >= 90
    for: 5m
    labels:
      severity: warning
    annotations:
      title: '`{{ $labels.instance }}:{{ $labels.mountpoint }}` is out of Inodes'
      description: >-
        Partition {{ $labels.mountpoint }} ({{ $labels.device }}) of {{ $labels.instance }}
        uses {{ $value | printf "%.1f" }}% of its Inodes.
      # The raw sample is exposed as an annotation, not a label: a label
      # whose text changes between evaluations changes the alert's identity,
      # which restarts the `for` timer and yields a new alert per value.
      value: '{{ $value }}'

  # Swapping: at least 50% of the configured swap has been in use for five
  # minutes.
  - alert: Swapping
    # Used swap (total - free) as a percentage of total swap.
    expr: |-
      (
        node_memory_SwapTotal_bytes
        - node_memory_SwapFree_bytes
      ) / node_memory_SwapTotal_bytes * 100 >= 50
    for: 5m
    labels:
      severity: warning
    annotations:
      title: '{{ $labels.instance }} is using a lot of swap'
      # The percentage is relative to swap, not to memory; the wording now
      # matches what the expression measures.
      description: >-
        {{ $labels.instance }} uses {{ $value | printf "%.1f" }}% of its swap capacity.
      # The raw sample is exposed as an annotation, not a label: a label
      # whose text changes between evaluations changes the alert's identity,
      # which restarts the `for` timer and yields a new alert per value.
      value: '{{ $value }}'

  # PhysicalComponentTooHot: a node_hwmon_temp_celsius reading has stayed
  # above 79 for five minutes.
  - alert: PhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 79
    for: 5m
    labels:
      severity: critical
    annotations:
      title: '{{ $labels.instance }} is heating up'
      description: >-
        The internal temperature of {{ $labels.instance }} is {{ $value }}°C!
      # The raw sample is exposed as an annotation, not a label: a label
      # whose text changes between evaluations changes the alert's identity,
      # which restarts the `for` timer and yields a new alert per value.
      value: '{{ $value }}'

  # PhysicalComponentHeatAlarm: fires with no pending period as soon as
  # node_hwmon_temp_crit_alarm_celsius equals 1.
  - alert: PhysicalComponentHeatAlarm
    expr: 'node_hwmon_temp_crit_alarm_celsius == 1'
    for: 0m
    labels:
      severity: critical
      # The expression only matches samples equal to 1, so this renders the
      # same text on every evaluation.
      value: '{{ $value }}'
    annotations:
      title: 'The temperature alarm of {{ $labels.instance }} is up'
      # NOTE(review): this text gives the responder no guidance; consider
      # stating the expected action.
      description: 'Do something!'

  # OOMKill: the node_vmstat_oom_kill counter increased during the last
  # minute; fires with no pending period.
  - alert: OOMKill
    # NOTE(review): a 1m range needs at least two samples inside the window;
    # confirm the scrape interval is short enough for that.
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'The kernel is killing processes'
      description: >-
        The kernel killed {{ $value }} processes (OOM killer)
      # The raw sample is exposed as an annotation, not a label: a label
      # whose text changes between evaluations changes the alert's identity,
      # so every distinct increase() result would become a separate alert.
      value: '{{ $value }}'

  # CorrectableErrorDetected: node_edac_correctable_errors_total increased
  # during the last minute; fires with no pending period.
  - alert: CorrectableErrorDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Memory errors have been corrected'
      description: >-
        {{ $value | printf "%.1f" }} error(s) have been corrected (EDAC)
      # The raw sample is exposed as an annotation, not a label: a label
      # whose text changes between evaluations changes the alert's identity,
      # so every distinct increase() result would become a separate alert.
      value: '{{ $value }}'

  # UncorrectableErrorDetected: node_edac_uncorrectable_errors_total
  # increased during the last minute; fires with no pending period.
  - alert: UncorrectableErrorDetected
    expr: increase(node_edac_uncorrectable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Memory errors could not be corrected'
      description: >-
        {{ $value | printf "%.1f" }} error(s) could not be corrected (EDAC)
      # The raw sample is exposed as an annotation, not a label: a label
      # whose text changes between evaluations changes the alert's identity,
      # so every distinct increase() result would become a separate alert.
      value: '{{ $value }}'

  # UnhealthyDisk: smartmon_device_smart_healthy has been below 1 for ten
  # minutes on a disk that is not a QEMU virtual disk.
  - alert: UnhealthyDisk
    # The health series is matched on (instance, disk) against the device
    # info series so that devices whose product is "QEMU HARDDISK" are
    # left out.
    expr: |-
      (
        smartmon_device_smart_healthy
          and on (instance, disk)
        smartmon_device_info{product!="QEMU HARDDISK"}
      ) < 1
    for: 10m
    labels:
      severity: critical
      value: '{{ $value }}'
    annotations:
      title: '`{{ $labels.instance }}:{{ $labels.disk }}` is unhealthy'
      description: >-
        Smartmontools detected that `{{ $labels.disk }}` on {{ $labels.instance }} is unhealthy
        and will probably need to be changed.

  # ServiceFailed: a systemd unit has reported state="failed" for ten
  # minutes.
  - alert: ServiceFailed
    expr: 'node_systemd_unit_state{state="failed"}==1'
    for: 10m
    labels:
      severity: warning
      # The expression only matches samples equal to 1, so this renders the
      # same text on every evaluation.
      value: '{{ $value }}'
    annotations:
      title: '{{ $labels.name }} failed'
      description: >-
        The systemd service {{ $labels.name }} failed
        on {{ $labels.instance }}
...