From e247aa3f70fe089c36d174f45a8d198ae36afb4c Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 05:21:08 +0200 Subject: [PATCH 01/11] Uniform labels for alerts --- roles/prometheus/handlers/main.yml | 1 + .../templates/prometheus/alert.rules.yml.j2 | 269 ++++++++++-------- .../templates/prometheus/prometheus.yml.j2 | 4 +- 3 files changed, 150 insertions(+), 124 deletions(-) diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml index 670847b..d501c14 100644 --- a/roles/prometheus/handlers/main.yml +++ b/roles/prometheus/handlers/main.yml @@ -8,3 +8,4 @@ service: name: prometheus-snmp-exporter state: restarted +... diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 index bf4127b..1de7e24 100644 --- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 @@ -1,138 +1,161 @@ -# {{ ansible_managed }} -{# As this is also Jinja2 it will conflict without a raw block #} -{# Depending of Prometheus Node exporter version, rules can change depending of version #} -{% raw %} +--- +{{ ansible_managed | comment }} + +{% macro raw(string) -%} +{{ string }} +{%- endmacro %} + +{% set instance = '[{{ raw("$label.instance") }}]' %} + groups: -- name: alert.rules - rules: + - name: alert.rules + rules: - # Alert for any instance that is unreachable for >3 minutes. - - alert: InstanceDown - expr: up == 0 - for: 3m - labels: - severity: critical - annotations: - summary: "{{ $labels.instance }} est invisible depuis plus de 3 minutes !" + - alert: InstanceDown + expr: up == 0 + for: 3m + labels: + severity: critical + annotations: + summary: >- + {{ instance }} Invisible depuis plus de 3 minutes - # Alert for out of memory - - alert: OutOfMemory - expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 10 - for: 5m - labels: - severity: warning - annotations: - summary: "Mémoire libre de {{ $labels.instance }} à {{ humanize $value }}%." + - alert: OutOfMemory + expr: >- + ( + node_memory_MemFree_bytes + + node_memory_Cached_bytes + + node_memory_Buffers_bytes + ) / node_memory_MemTotal_bytes * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} {{ raw("$value | round") }}% de mémoire libre - # Alert for out of disk space - - alert: OutOfDiskSpace - expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10 - for: 5m - labels: - severity: warning - annotations: - summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ humanize $value }}%." + - alert: OutOfDiskSpace + expr: >- + node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} Espace libre de {{ raw("$labels.mountpoint") }} sur + à {{ raw("$value | round") }}% - # Alert for out of inode space on disk - - alert: OutOfInodes - expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10 - for: 5m - labels: - severity: warning - annotations: - summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}." + - alert: OutOfInodes + expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} {{ raw("$value | round") }}% d'inodes restants + pour {{ raw("$labels.mountpoint") }} - # Alert for high CPU usage - - alert: CpuUsage - expr: (100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 75 - for: 10m - labels: - severity: warning - annotations: - summary: "CPU sur {{ $labels.instance }} à {{ humanize $value }}%." + - alert: CpuUsage + expr: >- + ( + 100 - avg by (instance) ( + irate(node_cpu_seconds_total{mode="idle"}[5m]) + ) * 100 + ) > 75 + for: 10m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} CPU à {{ raw("$value | round") }}% - # Check systemd unit (> buster) - - alert: SystemdServiceFailed - expr: node_systemd_unit_state{state="failed"} == 1 - for: 10m - labels: - severity: warning - annotations: - summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}" + - alert: SystemdServiceFailed + expr: node_systemd_unit_state{state="failed"} == 1 + for: 10m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} {{ raw("$label.name") }} a échoué - # Check load of instance - - alert: LoadUsage - expr: node_load1 > 5 - for: 2m - labels: - severity: warning - annotations: - summary: "La charge de {{ $labels.instance }} est à {{ $value }} !" - - # Check UPS - - alert: UpsOutputSourceChanged - expr: upsOutputSource != 3 - for: 1m - labels: - severity: warning - annotations: - summary: "La source d'alimentation de {{ $labels.instance }} a changé !" + - alert: LoadUsage + expr: node_load1 > 5 + for: 2m + labels: + severity: warning + annotations: + summary: > + {{ instance }} Charge à {{ raw("$value") }} + + - alert: UpsOutputSourceChanged + expr: upsOutputSource != 3 + for: 1m + labels: + severity: critical + annotations: + summary: >- + {{ instance }} Source d'alimentation changée - - alert: UpsBatteryStatusWarning - expr: upsBatteryStatus == 3 - for: 2m - labels: - severity: warning - annotations: - summary: "L'état de la batterie de {{ $labels.instance }} est faible !" + - alert: UpsBatteryStatusWarning + expr: upsBatteryStatus == 3 + for: 2m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} État de la batterie faible - - alert: UpsBatteryStatusCritical - expr: upsBatteryStatus == 4 - for: 10m - labels: - severity: warning - annotations: - summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !" + - alert: UpsBatteryStatusCritical + expr: upsBatteryStatus == 4 + for: 10m + labels: + severity: critical + annotations: + summary: >- + {{ instance }} État de la batterie critique - - alert: UpsHighLoad - expr: upsOutputPercentLoad > 70 - for: 5m - labels: - severity: critical - annotations: - summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !" + - alert: UpsHighLoad + expr: upsOutputPercentLoad > 70 + for: 5m + labels: + severity: critical + annotations: + summary: >- + {{ instance }} Charge de {{ raw("$value | round") }}% - - alert: UpsWrongInputVoltage - expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) - for: 10m - labels: - severity: warning - annotations: - summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V." + - alert: UpsWrongInputVoltage + expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) + for: 10m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} Tension d'entrée de {{ raw("$value") }}V - - alert: UpsWrongOutputVoltage - expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) - for: 10m - labels: - severity: warning - annotations: - summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V." + - alert: UpsWrongOutputVoltage + expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) + for: 10m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} Tension de sortie de {{ raw("$value") }}V - - alert: UpsTimeRemainingWarning - expr: upsEstimatedMinutesRemaining < 8 - for: 1m - labels: - severity: warning - annotations: - summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min." + - alert: UpsTimeRemainingWarning + expr: upsEstimatedMinutesRemaining < 8 + for: 1m + labels: + severity: warning + annotations: + summary: >- + {{ instance }} Autonomie restante de {{ raw("$value") }} min - - alert: UpsTimeRemainingCritical - expr: upsEstimatedMinutesRemaining < 5 - for: 1m - labels: - severity: critical - annotations: - summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min." - - -{% endraw %} + - alert: UpsTimeRemainingCritical + expr: upsEstimatedMinutesRemaining < 5 + for: 1m + labels: + severity: critical + annotations: + summary: >- + {{ instance }} Autonomie restante de {{ raw("$value") }} min +... diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2 index 7399f48..e97e986 100644 --- a/roles/prometheus/templates/prometheus/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2 @@ -1,4 +1,5 @@ -# {{ ansible_managed }} +--- +{{ ansible_managed | comment }} global: # scrape_interval is set to the global default (60s) @@ -100,3 +101,4 @@ scrape_configs: file_sd_configs: - files: - '/etc/prometheus/targets_docker.json' +... From eeaf0f8486141e741643c3da24e78f7e0cbe76e2 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 06:00:47 +0200 Subject: [PATCH 02/11] Fix syntax errors --- .../templates/prometheus/alert.rules.yml.j2 | 37 ++++++++++--------- .../templates/prometheus/alert.rules.yml.j2 | 9 ++++- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 index 1de7e24..c958bac 100644 --- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 @@ -2,10 +2,10 @@ {{ ansible_managed | comment }} {% macro raw(string) -%} -{{ string }} +{{ "{{" }} {{ string }} {{ "}}" }} {%- endmacro %} -{% set instance = '[{{ raw("$label.instance") }}]' %} +{% set instance = "[{{ $labels.instance }}]" %} groups: - name: alert.rules @@ -15,10 +15,10 @@ groups: expr: up == 0 for: 3m labels: - severity: critical + severity: critical annotations: - summary: >- - {{ instance }} Invisible depuis plus de 3 minutes + summary: >- + {{ instance }} Invisible depuis plus de 3 minutes - alert: OutOfMemory expr: >- @@ -32,7 +32,8 @@ groups: severity: warning annotations: summary: >- - {{ instance }} {{ raw("$value | round") }}% de mémoire libre + {{ instance }} {{ raw('$value | printf "%.1f"') }}% de mémoire + libre - alert: OutOfDiskSpace expr: >- @@ -42,8 +43,8 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Espace libre de {{ raw("$labels.mountpoint") }} sur - à {{ raw("$value | round") }}% + {{ instance }} Espace libre de {{ raw('$labels.mountpoint') }} sur + à {{ raw('$value | printf "%.1f"') }}% - alert: OutOfInodes expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 @@ -52,8 +53,8 @@ groups: severity: warning annotations: summary: >- - {{ instance }} {{ raw("$value | round") }}% d'inodes restants - pour {{ raw("$labels.mountpoint") }} + {{ instance }} {{ raw('$value | printf "%.1f"') }}% d'inodes + restants pour {{ raw('$labels.mountpoint') }} - alert: CpuUsage expr: >- @@ -67,7 +68,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} CPU à {{ raw("$value | round") }}% + {{ instance }} CPU à {{ raw('$value | printf "%.1f"') }}% - alert: SystemdServiceFailed expr: node_systemd_unit_state{state="failed"} == 1 @@ -76,7 +77,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} {{ raw("$label.name") }} a échoué + {{ instance }} {{ raw('$labels.name') }} a échoué - alert: LoadUsage expr: node_load1 > 5 @@ -85,7 +86,7 @@ groups: severity: warning annotations: summary: > - {{ instance }} Charge à {{ raw("$value") }} + {{ instance }} Charge à {{ raw('$value') }} - alert: UpsOutputSourceChanged expr: upsOutputSource != 3 @@ -121,7 +122,7 @@ groups: severity: critical annotations: summary: >- - {{ instance }} Charge de {{ raw("$value | round") }}% + {{ instance }} Charge de {{ raw('$value | printf "%.1f"') }}% - alert: UpsWrongInputVoltage expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) @@ -130,7 +131,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Tension d'entrée de {{ raw("$value") }}V + {{ instance }} Tension d'entrée de {{ raw('$value') }}V - alert: UpsWrongOutputVoltage expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) @@ -139,7 +140,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Tension de sortie de {{ raw("$value") }}V + {{ instance }} Tension de sortie de {{ raw('$value') }}V - alert: UpsTimeRemainingWarning expr: upsEstimatedMinutesRemaining < 8 @@ -148,7 +149,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Autonomie restante de {{ raw("$value") }} min + {{ instance }} Autonomie restante de {{ raw('$value') }} min - alert: UpsTimeRemainingCritical expr: upsEstimatedMinutesRemaining < 5 @@ -157,5 +158,5 @@ groups: severity: critical annotations: summary: >- - {{ instance }} Autonomie restante de {{ raw("$value") }} min + {{ instance }} Autonomie restante de {{ raw('$value') }} min ... diff --git a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 index d30511f..030e418 100644 --- a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 @@ -1,6 +1,12 @@ --- {{ ansible_managed | comment }} +{% macro raw(string) -%} +{{ "{{" }} {{ string }} {{ "}}" }} +{%- endmacro %} + +{% set instance = "[{{ $labels.instance }}]" %} + groups: - name: alert.rules rules: @@ -11,6 +17,5 @@ groups: severity: critical annotations: summary: >- - Federate : {{ "{{" }} $labels.instance {{ "}}" }} est invisible - depuis plus de 3 minutes ! + {{ instance }} Invisible depuis plus de 3 minutes ... From 5bcc42889550705126b286fa932bf4d15b90c665 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 09:36:11 +0200 Subject: [PATCH 03/11] Remove 'instance' from description and fix typos --- .../templates/prometheus/alert.rules.yml.j2 | 40 +++++++++---------- .../templates/prometheus/alert.rules.yml.j2 | 4 +- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 index c958bac..275f0a1 100644 --- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 @@ -5,8 +5,6 @@ {{ "{{" }} {{ string }} {{ "}}" }} {%- endmacro %} -{% set instance = "[{{ $labels.instance }}]" %} - groups: - name: alert.rules rules: @@ -18,7 +16,7 @@ groups: severity: critical annotations: summary: >- - {{ instance }} Invisible depuis plus de 3 minutes + Invisible depuis plus de 3 minutes - alert: OutOfMemory expr: >- @@ -32,7 +30,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} {{ raw('$value | printf "%.1f"') }}% de mémoire + {{ raw('$value | printf "%.1f"') }}% de mémoire libre - alert: OutOfDiskSpace @@ -43,7 +41,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Espace libre de {{ raw('$labels.mountpoint') }} sur + Espace libre de {{ raw('$labels.mountpoint') }} sur à {{ raw('$value | printf "%.1f"') }}% - alert: OutOfInodes @@ -53,7 +51,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} {{ raw('$value | printf "%.1f"') }}% d'inodes + {{ raw('$value | printf "%.1f"') }}% d'inodes restants pour {{ raw('$labels.mountpoint') }} - alert: CpuUsage @@ -68,7 +66,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} CPU à {{ raw('$value | printf "%.1f"') }}% + CPU à {{ raw('$value | printf "%.1f"') }}% - alert: SystemdServiceFailed expr: node_systemd_unit_state{state="failed"} == 1 @@ -77,7 +75,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} {{ raw('$labels.name') }} a échoué + {{ raw('$labels.name') }} a échoué - alert: LoadUsage expr: node_load1 > 5 @@ -86,7 +84,7 @@ groups: severity: warning annotations: summary: > - {{ instance }} Charge à {{ raw('$value') }} + Charge à {{ raw('$value') }} - alert: UpsOutputSourceChanged expr: upsOutputSource != 3 @@ -95,25 +93,25 @@ groups: severity: critical annotations: summary: >- - {{ instance }} Source d'alimentation changée + Source d'alimentation changée - - alert: UpsBatteryStatusWarning + - alert: UpsBatteryStatus expr: upsBatteryStatus == 3 for: 2m labels: severity: warning annotations: summary: >- - {{ instance }} État de la batterie faible + État de la batterie faible - - alert: UpsBatteryStatusCritical + - alert: UpsBatteryStatus expr: upsBatteryStatus == 4 for: 10m labels: severity: critical annotations: summary: >- - {{ instance }} État de la batterie critique + État de la batterie critique - alert: UpsHighLoad expr: upsOutputPercentLoad > 70 @@ -122,7 +120,7 @@ groups: severity: critical annotations: summary: >- - {{ instance }} Charge de {{ raw('$value | printf "%.1f"') }}% + Charge de {{ raw('$value | printf "%.1f"') }}% - alert: UpsWrongInputVoltage expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) @@ -131,7 +129,7 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Tension d'entrée de {{ raw('$value') }}V + Tension d'entrée de {{ raw('$value') }}V - alert: UpsWrongOutputVoltage expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) @@ -140,23 +138,23 @@ groups: severity: warning annotations: summary: >- - {{ instance }} Tension de sortie de {{ raw('$value') }}V + Tension de sortie de {{ raw('$value') }}V - - alert: UpsTimeRemainingWarning + - alert: UpsTimeRemaining expr: upsEstimatedMinutesRemaining < 8 for: 1m labels: severity: warning annotations: summary: >- - {{ instance }} Autonomie restante de {{ raw('$value') }} min + Autonomie restante de {{ raw('$value') }} min - - alert: UpsTimeRemainingCritical + - alert: UpsTimeRemaining expr: upsEstimatedMinutesRemaining < 5 for: 1m labels: severity: critical annotations: summary: >- - {{ instance }} Autonomie restante de {{ raw('$value') }} min + Autonomie restante de {{ raw('$value') }} min ... diff --git a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 index 030e418..95f457e 100644 --- a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 @@ -5,8 +5,6 @@ {{ "{{" }} {{ string }} {{ "}}" }} {%- endmacro %} -{% set instance = "[{{ $labels.instance }}]" %} - groups: - name: alert.rules rules: @@ -17,5 +15,5 @@ groups: severity: critical annotations: summary: >- - {{ instance }} Invisible depuis plus de 3 minutes + Invisible depuis plus de 3 minutes ... From bc35cd8e908bbb9d5d2b9093aaac8ca3b175ce70 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 09:40:22 +0200 Subject: [PATCH 04/11] Move templates of the prometheus role --- roles/prometheus/tasks/main.yml | 23 ++++++++----------- .../{prometheus => }/alert.rules.yml.j2 | 0 .../{prometheus => }/django.rules.yml.j2 | 0 .../{prometheus => }/prometheus.yml.j2 | 0 .../templates/{prometheus => }/snmp.yml.j2 | 0 5 files changed, 10 insertions(+), 13 deletions(-) rename roles/prometheus/templates/{prometheus => }/alert.rules.yml.j2 (100%) rename roles/prometheus/templates/{prometheus => }/django.rules.yml.j2 (100%) rename roles/prometheus/templates/{prometheus => }/prometheus.yml.j2 (100%) rename roles/prometheus/templates/{prometheus => }/snmp.yml.j2 (100%) diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index a3d2063..3a590c9 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -11,20 +11,16 @@ - name: Configure Prometheus template: - src: prometheus/prometheus.yml.j2 + src: prometheus.yml.j2 dest: /etc/prometheus/prometheus.yml - mode: 0644 - notify: Restart Prometheus - -- name: Configure Prometheus alert rules - template: - src: "prometheus/{{ item }}.j2" - dest: "/etc/prometheus/{{ item }}" - mode: 0644 - notify: Restart Prometheus + owner: prometheus + group: prometheus + mode: u=r,g=r,o= loop: + - prometheus.yml - alert.rules.yml - django.rules.yml + notify: Restart Prometheus - name: Make Prometheus snmp-exporter listen on localhost only lineinfile: @@ -36,10 +32,11 @@ # This file store SNMP OIDs - name: Configure Prometheus snmp-exporter template: - src: "prometheus/snmp.yml.j2" - dest: "/etc/prometheus/snmp.yml" - mode: 0600 + src: snmp.yml.j2 + dest: /etc/prometheus/snmp.yml owner: prometheus + group: prometheus + mode: u=r,g=r,o= notify: Restart prometheus-snmp-exporter # We don't need to restart Prometheus when updating nodes diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 similarity index 100% rename from roles/prometheus/templates/prometheus/alert.rules.yml.j2 rename to roles/prometheus/templates/alert.rules.yml.j2 diff --git a/roles/prometheus/templates/prometheus/django.rules.yml.j2 b/roles/prometheus/templates/django.rules.yml.j2 similarity index 100% rename from roles/prometheus/templates/prometheus/django.rules.yml.j2 rename to roles/prometheus/templates/django.rules.yml.j2 diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 similarity index 100% rename from roles/prometheus/templates/prometheus/prometheus.yml.j2 rename to roles/prometheus/templates/prometheus.yml.j2 diff --git a/roles/prometheus/templates/prometheus/snmp.yml.j2 b/roles/prometheus/templates/snmp.yml.j2 similarity index 100% rename from roles/prometheus/templates/prometheus/snmp.yml.j2 rename to roles/prometheus/templates/snmp.yml.j2 From a743ce09fb29fbfe9436f6384135abd95a69de14 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 09:42:31 +0200 Subject: [PATCH 05/11] Move templates of the prometheus_federate role --- roles/prometheus_federate/tasks/main.yml | 16 ++++++---------- .../{prometheus => }/alert.rules.yml.j2 | 0 .../templates/{prometheus => }/prometheus.yml.j2 | 0 3 files changed, 6 insertions(+), 10 deletions(-) rename roles/prometheus_federate/templates/{prometheus => }/alert.rules.yml.j2 (100%) rename roles/prometheus_federate/templates/{prometheus => }/prometheus.yml.j2 (100%) diff --git a/roles/prometheus_federate/tasks/main.yml b/roles/prometheus_federate/tasks/main.yml index 73ae803..c5d81bf 100644 --- a/roles/prometheus_federate/tasks/main.yml +++ b/roles/prometheus_federate/tasks/main.yml @@ -10,19 +10,15 @@ - name: Configure Prometheus template: - src: prometheus/prometheus.yml.j2 - dest: /etc/prometheus/prometheus.yml - mode: 0644 - notify: Restart Prometheus - -- name: Configure Prometheus alert rules - template: - src: "prometheus/{{ item }}.j2" + src: "{{ item }}.j2" dest: "/etc/prometheus/{{ item }}" - mode: 0644 - notify: Restart Prometheus + owner: root + group: root + mode: u=r,g=r,o= loop: + - prometheus.yml - alert.rules.yml + notify: Restart Prometheus # We don't need to restart Prometheus when updating nodes - name: Configure Prometheus Federate devices diff --git a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus_federate/templates/alert.rules.yml.j2 similarity index 100% rename from roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 rename to roles/prometheus_federate/templates/alert.rules.yml.j2 diff --git a/roles/prometheus_federate/templates/prometheus/prometheus.yml.j2 b/roles/prometheus_federate/templates/prometheus.yml.j2 similarity index 100% rename from roles/prometheus_federate/templates/prometheus/prometheus.yml.j2 rename to roles/prometheus_federate/templates/prometheus.yml.j2 From 5d681a95ea4643b96d17efcacc1626c16b141e22 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 17:31:21 +0200 Subject: [PATCH 06/11] Remove unmanaged hosts from inventory --- hosts | 2 -- 1 file changed, 2 deletions(-) diff --git a/hosts b/hosts index f3ec2af..22ea7aa 100644 --- a/hosts +++ b/hosts @@ -63,7 +63,6 @@ matrix-services.adm.auro.re [ovh_vm] serge.adm.auro.re passbolt.adm.auro.re -vpn-ovh.adm.auro.re docker-ovh.adm.auro.re switchs-manager.adm.auro.re ldap-replica-ovh.adm.auro.re @@ -252,7 +251,6 @@ perceval.adm.auro.re [edc_pve] chapalux.adm.auro.re -escalope.adm.auro.re [edc_vm] routeur-edc.adm.auro.re From f69dfd87994f073caa1696ec85eaceb6ff772992 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 17:41:22 +0200 Subject: [PATCH 07/11] Remove other unmanaged hosts --- hosts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hosts b/hosts index 22ea7aa..79a2d38 100644 --- a/hosts +++ b/hosts @@ -29,7 +29,6 @@ stream.adm.auro.re re2o-server.adm.auro.re re2o-ldap.adm.auro.re re2o-db.adm.auro.re -backup.adm.auro.re mail.adm.auro.re wikijs.adm.auro.re prometheus-aurore.adm.auro.re @@ -39,7 +38,6 @@ log.adm.auro.re bdd.adm.auro.re bdd-ovh.adm.auro.re litl.adm.auro.re -services-bdd-local.adm.auro.re log.adm.auro.re [aurore_testing_vm] @@ -53,7 +51,6 @@ horus.adm.auro.re [ovh_container] synapse.adm.auro.re -#services-bdd.adm.auro.re phabricator.adm.auro.re wiki.adm.auro.re www.adm.auro.re @@ -514,5 +511,4 @@ proxy.adm.auro.re [bdd] bdd.adm.auro.re bdd-ovh.adm.auro.re -services-bdd-local.adm.auro.re re2o-db.adm.auro.re From 083fc4da9acaaf102cdd79f4f0dd8e9600630936 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 18:15:09 +0200 Subject: [PATCH 08/11] Fix permissions on prometheus.yml --- roles/prometheus_federate/tasks/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/prometheus_federate/tasks/main.yml b/roles/prometheus_federate/tasks/main.yml index c5d81bf..02ae85e 100644 --- a/roles/prometheus_federate/tasks/main.yml +++ b/roles/prometheus_federate/tasks/main.yml @@ -12,8 +12,8 @@ template: src: "{{ item }}.j2" dest: "/etc/prometheus/{{ item }}" - owner: root - group: root + owner: prometheus + group: prometheus mode: u=r,g=r,o= loop: - prometheus.yml From 11335a6077c01e1fcb5d282aefc00efefccc1569 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 18:15:22 +0200 Subject: [PATCH 09/11] Fix typo in alert description --- roles/prometheus/templates/alert.rules.yml.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 index 275f0a1..3528823 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/alert.rules.yml.j2 @@ -41,8 +41,8 @@ groups: severity: warning annotations: summary: >- - Espace libre de {{ raw('$labels.mountpoint') }} sur - à {{ raw('$value | printf "%.1f"') }}% + Espace libre de {{ raw('$labels.mountpoint') }} à + {{ raw('$value | printf "%.1f"') }}% - alert: OutOfInodes expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 From 35286a661ab169647c83e039d5b1cefead6b1f9e Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 18:24:03 +0200 Subject: [PATCH 10/11] Change an alert description --- roles/prometheus/templates/alert.rules.yml.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2 index 3528823..c5ba1c2 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/alert.rules.yml.j2 @@ -41,8 +41,8 @@ groups: severity: warning annotations: summary: >- - Espace libre de {{ raw('$labels.mountpoint') }} à - {{ raw('$value | printf "%.1f"') }}% + {{ raw('$value | printf "%.1f"') }}% d'espace libre pour + {{ raw('$labels.mountpoint') }} - alert: OutOfInodes expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 From 83f5b35e59ea427fe146eb739f6a0c1b96ac0ee8 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 1 Apr 2021 18:24:21 +0200 Subject: [PATCH 11/11] Fix a filename typo --- roles/prometheus/tasks/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 3a590c9..4dc518b 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -11,8 +11,8 @@ - name: Configure Prometheus template: - src: prometheus.yml.j2 - dest: /etc/prometheus/prometheus.yml + src: "{{ item }}.j2" + dest: "/etc/prometheus/{{ item }}" owner: prometheus group: prometheus mode: u=r,g=r,o=