diff --git a/hosts b/hosts
index 636874e..4b5ac3f 100644
--- a/hosts
+++ b/hosts
@@ -29,7 +29,6 @@ stream.adm.auro.re
 re2o-server.adm.auro.re
 re2o-ldap.adm.auro.re
 re2o-db.adm.auro.re
-backup.adm.auro.re
 mail.adm.auro.re
 wikijs.adm.auro.re
 prometheus-aurore.adm.auro.re
@@ -39,7 +38,6 @@ log.adm.auro.re
 bdd.adm.auro.re
 bdd-ovh.adm.auro.re
 litl.adm.auro.re
-services-bdd-local.adm.auro.re
 log.adm.auro.re
 
 [aurore_testing_vm]
@@ -53,7 +51,6 @@ horus.adm.auro.re
 
 [ovh_container]
 synapse.adm.auro.re
-#services-bdd.adm.auro.re
 phabricator.adm.auro.re
 wiki.adm.auro.re
 www.adm.auro.re
@@ -63,7 +60,6 @@ matrix-services.adm.auro.re
 [ovh_vm]
 serge.adm.auro.re
 passbolt.adm.auro.re
-vpn-ovh.adm.auro.re
 docker-ovh.adm.auro.re
 switchs-manager.adm.auro.re
 ldap-replica-ovh.adm.auro.re
@@ -252,7 +248,6 @@ perceval.adm.auro.re
 
 [edc_pve]
 chapalux.adm.auro.re
-escalope.adm.auro.re
 
 [edc_vm]
 routeur-edc.adm.auro.re
@@ -524,5 +519,4 @@ proxy.adm.auro.re
 [bdd]
 bdd.adm.auro.re
 bdd-ovh.adm.auro.re
-services-bdd-local.adm.auro.re
 re2o-db.adm.auro.re
diff --git a/log.yml b/log.yml
index fb8db3d..e63f59a 100644
--- a/log.yml
+++ b/log.yml
@@ -3,7 +3,7 @@
   roles:
     - rsyslog_collector
 
-- hosts: all
+- hosts: all,!unifi
   roles:
     - rsyslog_common
 ...
diff --git a/roles/baseconfig/tasks/main.yml b/roles/baseconfig/tasks/main.yml
index cdedd37..77cb834 100644
--- a/roles/baseconfig/tasks/main.yml
+++ b/roles/baseconfig/tasks/main.yml
@@ -32,11 +32,6 @@
 - include_role:
     name: update_motd
 
-- name: Remove Debian warranty motd
-  file:
-    path: /etc/motd
-    state: absent
-
 # Configure APT mirrors on Debian Stretch
 - name: Configure APT mirrors
   when:
diff --git a/roles/docker/tasks/main.yml b/roles/docker/tasks/main.yml
index 66eed8c..5484f88 100644
--- a/roles/docker/tasks/main.yml
+++ b/roles/docker/tasks/main.yml
@@ -51,8 +51,11 @@
     dest: /usr/local/bin/docker-compose
     mode: "0755"
 
-- name: Indicate role in motd
-  template:
-    src: update-motd.d/05-service.j2
-    dest: /etc/update-motd.d/05-docker
-    mode: 0755
+- include_role:
+    name: update_motd
+  vars:
+    motd_messages:
+      - key: 10-docker
+        message: >-
+          Docker est installé sur ce serveur.
+...
diff --git a/roles/docker/templates/update-motd.d/05-service.j2 b/roles/docker/templates/update-motd.d/05-service.j2
deleted file mode 100755
index ce5faaa..0000000
--- a/roles/docker/templates/update-motd.d/05-service.j2
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-# {{ ansible_managed }}
-echo "> Les recettes Docker-compose se trouvent dans /var/local/ansible-docker"
diff --git a/roles/nginx/tasks/main.yml b/roles/nginx/tasks/main.yml
index a8fb885..146b93c 100644
--- a/roles/nginx/tasks/main.yml
+++ b/roles/nginx/tasks/main.yml
@@ -148,11 +148,13 @@
     group: www-data
     mode: 0644
 
-- name: Indicate role in motd
-  template:
-    src: update-motd.d/05-service.j2
-    dest: /etc/update-motd.d/05-nginx
-    mode: 0755
+- include_role:
+    name: update_motd
+  vars:
+    motd_messages:
+      - key: 10-nginx
+        message: >-
+          NGinx est installé sur ce serveur. Voir /etc/nginx.
 
 - name: Clean old files
   file:
@@ -162,3 +164,4 @@
     - "/etc/nginx/snippets/options-ssl.conf"
     - "/var/www/custom_401.html"
     - "/var/www/robots.txt"
+...
diff --git a/roles/nginx/templates/update-motd.d/05-service.j2 b/roles/nginx/templates/update-motd.d/05-service.j2
deleted file mode 100755
index c52c655..0000000
--- a/roles/nginx/templates/update-motd.d/05-service.j2
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/usr/bin/tail +14
-{{ ansible_managed | comment }}
-> NGINX a été déployé sur cette machine. Voir /etc/nginx/.
diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml
index 670847b..d501c14 100644
--- a/roles/prometheus/handlers/main.yml
+++ b/roles/prometheus/handlers/main.yml
@@ -8,3 +8,4 @@
   service:
     name: prometheus-snmp-exporter
     state: restarted
+...
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index a3d2063..4dc518b 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -11,20 +11,16 @@
 
 - name: Configure Prometheus
   template:
-    src: prometheus/prometheus.yml.j2
-    dest: /etc/prometheus/prometheus.yml
-    mode: 0644
-  notify: Restart Prometheus
-
-- name: Configure Prometheus alert rules
-  template:
-    src: "prometheus/{{ item }}.j2"
+    src: "{{ item }}.j2"
     dest: "/etc/prometheus/{{ item }}"
-    mode: 0644
-  notify: Restart Prometheus
+    owner: prometheus
+    group: prometheus
+    mode: u=r,g=r,o=
   loop:
+    - prometheus.yml
     - alert.rules.yml
     - django.rules.yml
+  notify: Restart Prometheus
 
 - name: Make Prometheus snmp-exporter listen on localhost only
   lineinfile:
@@ -36,10 +32,11 @@
 # This file store SNMP OIDs
 - name: Configure Prometheus snmp-exporter
   template:
-    src: "prometheus/snmp.yml.j2"
-    dest: "/etc/prometheus/snmp.yml"
-    mode: 0600
+    src: snmp.yml.j2
+    dest: /etc/prometheus/snmp.yml
     owner: prometheus
+    group: prometheus
+    mode: u=r,g=r,o=
   notify: Restart prometheus-snmp-exporter
 
 # We don't need to restart Prometheus when updating nodes
diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/alert.rules.yml.j2
new file mode 100644
index 0000000..84d8aa2
--- /dev/null
+++ b/roles/prometheus/templates/alert.rules.yml.j2
@@ -0,0 +1,231 @@
+---
+{{ ansible_managed | comment }}
+
+# Alert annotations are Go templates expanded by Prometheus, with the same
+# double-curly delimiters as Jinja2. The raw() macro emits such an expression
+# literally so that Jinja2 does not evaluate it at deploy time.
+{% macro raw(string) -%}
+{{ "{{" }} {{ string }} {{ "}}" }}
+{%- endmacro %}
+
+groups:
+
+  - name: alert.rules
+    rules:
+
+      - alert: InstanceDown
+        expr: up{instance!~".*.borne.auro.re$"} == 0
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Invisible depuis plus de 3 minutes
+
+      - alert: AccessPointDown
+        expr: up{instance=~".*.borne.auro.re$"} == 0
+        for: 3m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Invisible depuis plus de 3 minutes
+
+      - alert: OutOfMemory
+        expr: >-
+          (
+            node_memory_MemFree_bytes
+            + node_memory_Cached_bytes
+            + node_memory_Buffers_bytes
+          ) / node_memory_MemTotal_bytes * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ raw('$value | printf "%.1f"') }}% de mémoire
+            libre
+
+      - alert: HostSwapIsFillingUp
+        expr: >-
+          (
+            1 - (
+              node_memory_SwapFree_bytes
+              / node_memory_SwapTotal_bytes
+            )
+          ) * 100 > 10
+        for: 3m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            La mémoire swap est utilisée à {{ raw('$value | printf "%.1f"') }}%
+
+      - alert: HostPhysicalComponentTooHot
+        expr: node_hwmon_temp_celsius > 79
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            La température de l'hôte est de {{ raw('$value') }}°C
+            ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
+
+      - alert: HostNodeOvertemperatureAlarm
+        expr: node_hwmon_temp_crit_alarm_celsius == 1
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            L'alarme de température de l'hôte est active
+            ({{ raw('$labels.chip') }}, {{ raw('$labels.sensor') }})
+
+      - alert: HostOomKillDetected
+        expr: increase(node_vmstat_oom_kill[1m]) > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Le noyau a tué {{ raw('$value') }} processus (OOM killer)
+
+      - alert: HostEdacCorrectableErrorsDetected
+        expr: increase(node_edac_correctable_errors_total[1m]) > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ raw('$value | printf "%.1f"') }} erreur(s) ont été
+            corrigée(s) (EDAC)
+
+      - alert: OutOfDiskSpace
+        expr: >-
+          node_filesystem_free_bytes / node_filesystem_size_bytes * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ raw('$value | printf "%.1f"') }}% d'espace libre pour
+            {{ raw('$labels.mountpoint') }}
+
+      - alert: OutOfInodes
+        expr: node_filesystem_files_free / node_filesystem_files * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ raw('$value | printf "%.1f"') }}% d'inodes
+            restants pour {{ raw('$labels.mountpoint') }}
+
+      - alert: CpuUsage
+        expr: >-
+          (
+            100 - avg by (instance) (
+              irate(node_cpu_seconds_total{mode="idle"}[5m])
+            ) * 100
+          ) > 75
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            CPU à {{ raw('$value | printf "%.1f"') }}%
+
+      - alert: SystemdServiceFailed
+        expr: node_systemd_unit_state{state="failed"} == 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            {{ raw('$labels.name') }} a échoué
+
+      - alert: LoadUsage
+        expr: node_load1 > 5
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Charge à {{ raw('$value') }}
+
+      - alert: UpsOutputSourceChanged
+        expr: upsOutputSource != 3
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Source d'alimentation changée
+
+      - alert: UpsBatteryStatus
+        expr: upsBatteryStatus == 3
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            État de la batterie faible
+
+      - alert: UpsBatteryStatus
+        expr: upsBatteryStatus == 4
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            État de la batterie critique
+
+      - alert: UpsHighLoad
+        expr: upsOutputPercentLoad > 70
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Charge de {{ raw('$value | printf "%.1f"') }}%
+
+      - alert: UpsWrongInputVoltage
+        expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Tension d'entrée de {{ raw('$value') }}V
+
+      # Fires when the output voltage strays more than three standard
+      # deviations from its 1-day average; the alert value is that deviation.
+      - alert: UpsWrongOutputVoltage
+        expr: >-
+          abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d]))
+          > 3 * stddev_over_time(upsOutputVoltage[1d])
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Tension de sortie de {{ raw('$value') }}V
+
+      - alert: UpsTimeRemaining
+        expr: upsEstimatedMinutesRemaining < 8
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Autonomie restante de {{ raw('$value') }} min
+
+      - alert: UpsTimeRemaining
+        expr: upsEstimatedMinutesRemaining < 5
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Autonomie restante de {{ raw('$value') }} min
+
+...
diff --git a/roles/prometheus/templates/prometheus/django.rules.yml.j2 b/roles/prometheus/templates/django.rules.yml.j2
similarity index 100%
rename from roles/prometheus/templates/prometheus/django.rules.yml.j2
rename to roles/prometheus/templates/django.rules.yml.j2
diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2
similarity index 98%
rename from roles/prometheus/templates/prometheus/prometheus.yml.j2
rename to roles/prometheus/templates/prometheus.yml.j2
index 7399f48..e97e986 100644
--- a/roles/prometheus/templates/prometheus/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@@ -1,4 +1,5 @@
-# {{ ansible_managed }}
+---
+{{ ansible_managed | comment }}
 
 global:
   # scrape_interval is set to the global default (60s)
@@ -100,3 +101,4 @@ scrape_configs:
     file_sd_configs:
       - files:
         - '/etc/prometheus/targets_docker.json'
+...
diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
deleted file mode 100644
index bf4127b..0000000
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ /dev/null
@@ -1,138 +0,0 @@
-# {{ ansible_managed }}
-{# As this is also Jinja2 it will conflict without a raw block #}
-{# Depending of Prometheus Node exporter version, rules can change depending of version #}
-{% raw %}
-groups:
-- name: alert.rules
-  rules:
-
-  # Alert for any instance that is unreachable for >3 minutes.
-  - alert: InstanceDown
-    expr: up == 0
-    for: 3m
-    labels:
-      severity: critical
-    annotations:
-      summary: "{{ $labels.instance }} est invisible depuis plus de 3 minutes !"
-
-  # Alert for out of memory
-  - alert: OutOfMemory
-    expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Mémoire libre de {{ $labels.instance }} à {{ humanize $value }}%."
-
-  # Alert for out of disk space
-  - alert: OutOfDiskSpace
-    expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ humanize $value }}%."
-
-  # Alert for out of inode space on disk
-  - alert: OutOfInodes
-    expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."
-
-  # Alert for high CPU usage
-  - alert: CpuUsage
-    expr: (100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 75
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "CPU sur {{ $labels.instance }} à {{ humanize $value }}%."
-
-  # Check systemd unit (> buster)
-  - alert: SystemdServiceFailed
-    expr: node_systemd_unit_state{state="failed"} == 1
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
-
-  # Check load of instance
-  - alert: LoadUsage
-    expr: node_load1 > 5
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: "La charge de {{ $labels.instance }} est à {{ $value }} !"
-
-  # Check UPS
-  - alert: UpsOutputSourceChanged
-    expr: upsOutputSource != 3
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
-
-  - alert: UpsBatteryStatusWarning
-    expr: upsBatteryStatus == 3
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: "L'état de la batterie de {{ $labels.instance }} est faible !"
-
-  - alert: UpsBatteryStatusCritical
-    expr: upsBatteryStatus == 4
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !"
-
-  - alert: UpsHighLoad
-    expr: upsOutputPercentLoad > 70
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
-
-  - alert: UpsWrongInputVoltage
-    expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
-
-  - alert: UpsWrongOutputVoltage
-    expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
-
-  - alert: UpsTimeRemainingWarning
-    expr: upsEstimatedMinutesRemaining < 8
-    for: 1m
-    labels:
-      severity: warning
-    annotations:
-      summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
-
-  - alert: UpsTimeRemainingCritical
-    expr: upsEstimatedMinutesRemaining < 5
-    for: 1m
-    labels:
-      severity: critical
-    annotations:
-      summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
-
-
-{% endraw %}
diff --git a/roles/prometheus/templates/prometheus/snmp.yml.j2 b/roles/prometheus/templates/snmp.yml.j2
similarity index 100%
rename from roles/prometheus/templates/prometheus/snmp.yml.j2
rename to roles/prometheus/templates/snmp.yml.j2
diff --git a/roles/prometheus_federate/tasks/main.yml b/roles/prometheus_federate/tasks/main.yml
index 73ae803..02ae85e 100644
--- a/roles/prometheus_federate/tasks/main.yml
+++ b/roles/prometheus_federate/tasks/main.yml
@@ -10,19 +10,15 @@
 
 - name: Configure Prometheus
   template:
-    src: prometheus/prometheus.yml.j2
-    dest: /etc/prometheus/prometheus.yml
-    mode: 0644
-  notify: Restart Prometheus
-
-- name: Configure Prometheus alert rules
-  template:
-    src: "prometheus/{{ item }}.j2"
+    src: "{{ item }}.j2"
     dest: "/etc/prometheus/{{ item }}"
-    mode: 0644
-  notify: Restart Prometheus
+    owner: prometheus
+    group: prometheus
+    mode: u=r,g=r,o=
   loop:
+    - prometheus.yml
     - alert.rules.yml
+  notify: Restart Prometheus
 
 # We don't need to restart Prometheus when updating nodes
 - name: Configure Prometheus Federate devices
diff --git a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus_federate/templates/alert.rules.yml.j2
similarity index 67%
rename from roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2
rename to roles/prometheus_federate/templates/alert.rules.yml.j2
index d30511f..95f457e 100644
--- a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus_federate/templates/alert.rules.yml.j2
@@ -1,6 +1,10 @@
 ---
 {{ ansible_managed | comment }}
 
+{% macro raw(string) -%}
+{{ "{{" }} {{ string }} {{ "}}" }}
+{%- endmacro %}
+
 groups:
   - name: alert.rules
     rules:
@@ -11,6 +15,5 @@ groups:
           severity: critical
         annotations:
           summary: >-
-            Federate : {{ "{{" }} $labels.instance {{ "}}" }} est invisible
-            depuis plus de 3 minutes !
+            Invisible depuis plus de 3 minutes
 ...
diff --git a/roles/prometheus_federate/templates/prometheus/prometheus.yml.j2 b/roles/prometheus_federate/templates/prometheus.yml.j2
similarity index 100%
rename from roles/prometheus_federate/templates/prometheus/prometheus.yml.j2
rename to roles/prometheus_federate/templates/prometheus.yml.j2
diff --git a/roles/re2o_service/tasks/main.yml b/roles/re2o_service/tasks/main.yml
index 68e963c..882fdf9 100644
--- a/roles/re2o_service/tasks/main.yml
+++ b/roles/re2o_service/tasks/main.yml
@@ -40,8 +40,12 @@
     group: nogroup
     state: link
 
-- name: Indicate in motd service location
-  template:
-    src: update-motd.d/05-service.j2
-    dest: "/etc/update-motd.d/05-re2o-{{ service_name }}"
-    mode: 0755
+- include_role:
+    name: update_motd
+  vars:
+    motd_messages:
+      - key: "15-re2o-service-{{ service_name }}"
+        message: >-
+          Le service re2o {{ service_name }} est dans
+          {{ service_homedir }}/{{ service_name }}.
+...
diff --git a/roles/re2o_service/templates/update-motd.d/05-service.j2 b/roles/re2o_service/templates/update-motd.d/05-service.j2
deleted file mode 100755
index 4ed8a74..0000000
--- a/roles/re2o_service/templates/update-motd.d/05-service.j2
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-# {{ ansible_managed }}
-echo "✨ Le service re2o {{ service_name }} est dans {{ service_homedir }}/{{ service_name }}"
diff --git a/roles/rsyslog_collector/templates/20-collector.conf.j2 b/roles/rsyslog_collector/templates/20-collector.conf.j2
index 897945f..e8b277f 100644
--- a/roles/rsyslog_collector/templates/20-collector.conf.j2
+++ b/roles/rsyslog_collector/templates/20-collector.conf.j2
@@ -52,3 +52,6 @@ input(
     ruleset="handleIncomingLogs"
 )
 {% endfor %}
+
+# Store local logs too
+call handleIncomingLogs
diff --git a/roles/unifi_controller/tasks/main.yml b/roles/unifi_controller/tasks/main.yml
index 7f886f2..b43c74b 100644
--- a/roles/unifi_controller/tasks/main.yml
+++ b/roles/unifi_controller/tasks/main.yml
@@ -40,8 +40,11 @@
   retries: 3
   until: apt_result is succeeded
 
-- name: Indicate role in motd
-  template:
-    src: update-motd.d/05-service.j2
-    dest: /etc/update-motd.d/05-unifi-controller
-    mode: 0755
+- include_role:
+    name: update_motd
+  vars:
+    motd_messages:
+      - key: 10-unifi-controller
+        message: >-
+          Le contrôleur Unifi a été installé sur ce serveur.
+...
diff --git a/roles/unifi_controller/templates/update-motd.d/05-service.j2 b/roles/unifi_controller/templates/update-motd.d/05-service.j2
deleted file mode 100755
index b768773..0000000
--- a/roles/unifi_controller/templates/update-motd.d/05-service.j2
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-# {{ ansible_managed }}
-echo "> Le contrôleur Unifi a été déployé sur cette machine."