diff --git a/hosts b/hosts
index f3ec2af..79a2d38 100644
--- a/hosts
+++ b/hosts
@@ -29,7 +29,6 @@ stream.adm.auro.re
 re2o-server.adm.auro.re
 re2o-ldap.adm.auro.re
 re2o-db.adm.auro.re
-backup.adm.auro.re
 mail.adm.auro.re
 wikijs.adm.auro.re
 prometheus-aurore.adm.auro.re
@@ -39,7 +38,6 @@ log.adm.auro.re
 bdd.adm.auro.re
 bdd-ovh.adm.auro.re
 litl.adm.auro.re
-services-bdd-local.adm.auro.re
 log.adm.auro.re
 
 [aurore_testing_vm]
@@ -53,7 +51,6 @@ horus.adm.auro.re
 
 [ovh_container]
 synapse.adm.auro.re
-#services-bdd.adm.auro.re
 phabricator.adm.auro.re
 wiki.adm.auro.re
 www.adm.auro.re
@@ -63,7 +60,6 @@ matrix-services.adm.auro.re
 [ovh_vm]
 serge.adm.auro.re
 passbolt.adm.auro.re
-vpn-ovh.adm.auro.re
 docker-ovh.adm.auro.re
 switchs-manager.adm.auro.re
 ldap-replica-ovh.adm.auro.re
@@ -252,7 +248,6 @@ perceval.adm.auro.re
 
 [edc_pve]
 chapalux.adm.auro.re
-escalope.adm.auro.re
 
 [edc_vm]
 routeur-edc.adm.auro.re
@@ -516,5 +511,4 @@ proxy.adm.auro.re
 [bdd]
 bdd.adm.auro.re
 bdd-ovh.adm.auro.re
-services-bdd-local.adm.auro.re
 re2o-db.adm.auro.re
diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml
index 670847b..d501c14 100644
--- a/roles/prometheus/handlers/main.yml
+++ b/roles/prometheus/handlers/main.yml
@@ -8,3 +8,4 @@
   service:
     name: prometheus-snmp-exporter
     state: restarted
+...
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index a3d2063..4dc518b 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -11,20 +11,16 @@
 
 - name: Configure Prometheus
   template:
-    src: prometheus/prometheus.yml.j2
-    dest: /etc/prometheus/prometheus.yml
-    mode: 0644
-  notify: Restart Prometheus
-
-- name: Configure Prometheus alert rules
-  template:
-    src: "prometheus/{{ item }}.j2"
+    src: "{{ item }}.j2"
     dest: "/etc/prometheus/{{ item }}"
-    mode: 0644
-  notify: Restart Prometheus
+    owner: prometheus
+    group: prometheus
+    mode: u=r,g=r,o=
   loop:
+    - prometheus.yml
     - alert.rules.yml
     - django.rules.yml
+  notify: Restart Prometheus
 
 - name: Make Prometheus snmp-exporter listen on localhost only
   lineinfile:
@@ -36,10 +32,11 @@
 # This file store SNMP OIDs
 - name: Configure Prometheus snmp-exporter
   template:
-    src: "prometheus/snmp.yml.j2"
-    dest: "/etc/prometheus/snmp.yml"
-    mode: 0600
+    src: snmp.yml.j2
+    dest: /etc/prometheus/snmp.yml
     owner: prometheus
+    group: prometheus
+    mode: u=r,g=r,o=
   notify: Restart prometheus-snmp-exporter
 
 # We don't need to restart Prometheus when updating nodes
diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2
new file mode 100644
index 0000000..c5ba1c2
--- /dev/null
+++ b/roles/prometheus/templates/alert.rules.yml.j2
@@ -0,0 +1,161 @@
+---
+{{ ansible_managed | comment }}
+
+{# raw(): wrap a Prometheus template expression in literal double braces so Jinja2 leaves it unrendered #}
+{% macro raw(string) -%}
+{{ "{{" }} {{ string }} {{ "}}" }}
+{%- endmacro %}
+
+groups:
+  - name: alert.rules
+    rules:
+
+      - alert: InstanceDown
+        expr: up == 0
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Invisible depuis plus de 3 minutes
+
+      - alert: OutOfMemory
+        expr: >-
+          (
+            node_memory_MemFree_bytes
+            + node_memory_Cached_bytes
+            + node_memory_Buffers_bytes
+          ) / node_memory_MemTotal_bytes * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ raw('$value | printf "%.1f"') }}% de mémoire
+            libre
+
+      - alert: OutOfDiskSpace
+        expr: >-
+          node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ raw('$value | printf "%.1f"') }}% d'espace libre pour
+            {{ raw('$labels.mountpoint') }}
+
+      - alert: OutOfInodes
+        expr: node_filesystem_files_free / node_filesystem_files * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ raw('$value | printf "%.1f"') }}% d'inodes
+            restants pour {{ raw('$labels.mountpoint') }}
+
+      - alert: CpuUsage
+        expr: >-
+          (
+            100 - avg by (instance) (
+              irate(node_cpu_seconds_total{mode="idle"}[5m])
+            ) * 100
+          ) > 75
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            CPU à {{ raw('$value | printf "%.1f"') }}%
+
+      - alert: SystemdServiceFailed
+        expr: node_systemd_unit_state{state="failed"} == 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ raw('$labels.name') }} a échoué
+
+      - alert: LoadUsage
+        expr: node_load1 > 5
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Charge à {{ raw('$value') }}
+
+      - alert: UpsOutputSourceChanged
+        expr: upsOutputSource != 3
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Source d'alimentation changée
+
+      - alert: UpsBatteryStatus
+        expr: upsBatteryStatus == 3
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            État de la batterie faible
+
+      - alert: UpsBatteryStatus
+        expr: upsBatteryStatus == 4
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            État de la batterie critique
+
+      - alert: UpsHighLoad
+        expr: upsOutputPercentLoad > 70
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Charge de {{ raw('$value | printf "%.1f"') }}%
+
+      - alert: UpsWrongInputVoltage
+        expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Tension d'entrée de {{ raw('$value') }}V
+
+      - alert: UpsWrongOutputVoltage
+        expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Tension de sortie de {{ raw('$value') }}V
+
+      - alert: UpsTimeRemaining
+        expr: upsEstimatedMinutesRemaining < 8
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Autonomie restante de {{ raw('$value') }} min
+
+      - alert: UpsTimeRemaining
+        expr: upsEstimatedMinutesRemaining < 5
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Autonomie restante de {{ raw('$value') }} min
+...
diff --git a/roles/prometheus/templates/prometheus/django.rules.yml.j2 b/roles/prometheus/templates/django.rules.yml.j2
similarity index 100%
rename from roles/prometheus/templates/prometheus/django.rules.yml.j2
rename to roles/prometheus/templates/django.rules.yml.j2
diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2
similarity index 98%
rename from roles/prometheus/templates/prometheus/prometheus.yml.j2
rename to roles/prometheus/templates/prometheus.yml.j2
index 7399f48..e97e986 100644
--- a/roles/prometheus/templates/prometheus/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@@ -1,4 +1,5 @@
-# {{ ansible_managed }}
+---
+{{ ansible_managed | comment }}
 
 global:
   # scrape_interval is set to the global default (60s)
@@ -100,3 +101,4 @@ scrape_configs:
     file_sd_configs:
       - files:
           - '/etc/prometheus/targets_docker.json'
+...
diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
deleted file mode 100644
index bf4127b..0000000
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ /dev/null
@@ -1,138 +0,0 @@
-# {{ ansible_managed }}
-{# As this is also Jinja2 it will conflict without a raw block #}
-{# Depending of Prometheus Node exporter version, rules can change depending of version #}
-{% raw %}
-groups:
-- name: alert.rules
-  rules:
-
-  # Alert for any instance that is unreachable for >3 minutes.
-  - alert: InstanceDown
-    expr: up == 0
-    for: 3m
-    labels:
-      severity: critical
-    annotations:
-      summary: "{{ $labels.instance }} est invisible depuis plus de 3 minutes !"
-
-  # Alert for out of memory
-  - alert: OutOfMemory
-    expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Mémoire libre de {{ $labels.instance }} à {{ humanize $value }}%."
-
-  # Alert for out of disk space
-  - alert: OutOfDiskSpace
-    expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ humanize $value }}%."
-
-  # Alert for out of inode space on disk
-  - alert: OutOfInodes
-    expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."
-
-  # Alert for high CPU usage
-  - alert: CpuUsage
-    expr: (100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 75
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "CPU sur {{ $labels.instance }} à {{ humanize $value }}%."
-
-  # Check systemd unit (> buster)
-  - alert: SystemdServiceFailed
-    expr: node_systemd_unit_state{state="failed"} == 1
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
-
-  # Check load of instance
-  - alert: LoadUsage
-    expr: node_load1 > 5
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: "La charge de {{ $labels.instance }} est à {{ $value }} !"
-
-  # Check UPS
-  - alert: UpsOutputSourceChanged
-    expr: upsOutputSource != 3
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
-
-  - alert: UpsBatteryStatusWarning
-    expr: upsBatteryStatus == 3
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: "L'état de la batterie de {{ $labels.instance }} est faible !"
-
-  - alert: UpsBatteryStatusCritical
-    expr: upsBatteryStatus == 4
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !"
-
-  - alert: UpsHighLoad
-    expr: upsOutputPercentLoad > 70
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
-
-  - alert: UpsWrongInputVoltage
-    expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
-
-  - alert: UpsWrongOutputVoltage
-    expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
-
-  - alert: UpsTimeRemainingWarning
-    expr: upsEstimatedMinutesRemaining < 8
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
-
-  - alert: UpsTimeRemainingCritical
-    expr: upsEstimatedMinutesRemaining < 5
-    for: 1m
-    labels:
-      severity: critical
-    annotations:
-      summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
-
-
-{% endraw %}
diff --git a/roles/prometheus/templates/prometheus/snmp.yml.j2 b/roles/prometheus/templates/snmp.yml.j2
similarity index 100%
rename from roles/prometheus/templates/prometheus/snmp.yml.j2
rename to roles/prometheus/templates/snmp.yml.j2
diff --git a/roles/prometheus_federate/tasks/main.yml b/roles/prometheus_federate/tasks/main.yml
index 73ae803..02ae85e 100644
--- a/roles/prometheus_federate/tasks/main.yml
+++ b/roles/prometheus_federate/tasks/main.yml
@@ -10,19 +10,15 @@
 
 - name: Configure Prometheus
   template:
-    src: prometheus/prometheus.yml.j2
-    dest: /etc/prometheus/prometheus.yml
-    mode: 0644
-  notify: Restart Prometheus
-
-- name: Configure Prometheus alert rules
-  template:
-    src: "prometheus/{{ item }}.j2"
+    src: "{{ item }}.j2"
     dest: "/etc/prometheus/{{ item }}"
-    mode: 0644
-  notify: Restart Prometheus
+    owner: prometheus
+    group: prometheus
+    mode: u=r,g=r,o=
   loop:
+    - prometheus.yml
     - alert.rules.yml
+  notify: Restart Prometheus
 
 # We don't need to restart Prometheus when updating nodes
 - name: Configure Prometheus Federate devices
diff --git a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus_federate/templates/alert.rules.yml.j2
similarity index 67%
rename from roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2
rename to roles/prometheus_federate/templates/alert.rules.yml.j2
index d30511f..95f457e 100644
--- a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus_federate/templates/alert.rules.yml.j2
@@ -1,6 +1,10 @@
 ---
 {{ ansible_managed | comment }}
 
+{% macro raw(string) -%}
+{{ "{{" }} {{ string }} {{ "}}" }}
+{%- endmacro %}
+
 groups:
   - name: alert.rules
     rules:
@@ -11,6 +15,5 @@ groups:
           severity: critical
         annotations:
           summary: >-
-            Federate : {{ "{{" }} $labels.instance {{ "}}" }} est invisible
-            depuis plus de 3 minutes !
+            Invisible depuis plus de 3 minutes
 ...
diff --git a/roles/prometheus_federate/templates/prometheus/prometheus.yml.j2 b/roles/prometheus_federate/templates/prometheus.yml.j2
similarity index 100%
rename from roles/prometheus_federate/templates/prometheus/prometheus.yml.j2
rename to roles/prometheus_federate/templates/prometheus.yml.j2