From 45041be2ab9e7888b688f27b7250a28d97c35f06 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sat, 10 Apr 2021 17:29:50 +0200 Subject: [PATCH 01/27] Install postgres exporter --- monitoring.yml | 5 +++ roles/prometheus_postgres/handlers/main.yml | 5 +++ roles/prometheus_postgres/tasks/main.yml | 41 +++++++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 roles/prometheus_postgres/handlers/main.yml create mode 100644 roles/prometheus_postgres/tasks/main.yml diff --git a/monitoring.yml b/monitoring.yml index ac0d59f..23e7844 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -151,6 +151,11 @@ roles: - prometheus_federate +#Postgres Exporters +- hosts: bdd.adm.auro.re,bdd-ovh.adm.auro.re + roles: + - prometheus_postgres + # Monitor all hosts - hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container diff --git a/roles/prometheus_postgres/handlers/main.yml b/roles/prometheus_postgres/handlers/main.yml new file mode 100644 index 0000000..05837d1 --- /dev/null +++ b/roles/prometheus_postgres/handlers/main.yml @@ -0,0 +1,5 @@ +--- +- name: Restart prometheus-postgres-exporter + service: + name: prometheus-postgres-exporter + state: restarted diff --git a/roles/prometheus_postgres/tasks/main.yml b/roles/prometheus_postgres/tasks/main.yml new file mode 100644 index 0000000..10d612c --- /dev/null +++ b/roles/prometheus_postgres/tasks/main.yml @@ -0,0 +1,41 @@ +--- +- name: Install Prometheus postgres-exporter + apt: + update_cache: true + name: prometheus-postgres-exporter + register: apt_result + retries: 3 + until: apt_result is succeeded + when: + - ansible_lsb.codename == 'bullseye' + +- name: Make Prometheus postgres-exporter connect to databases using peercred + lineinfile: + path: /etc/default/prometheus-postgres-exporter + regexp: '^DATA_SOURCE_NAME=' + line: | + DATA_SOURCE_NAME="user=postgres host=/var/run/postgresql/ sslmode=disable" + notify: Restart prometheus-postgres-exporter + +- name: Make Prometheus postgres-exporter launched by postgres user + lineinfile: + path: /lib/systemd/system/prometheus-postgres-exporter.service + regexp: '^User=' + line: | + User=postgres + notify: Restart prometheus-postgres-exporter + +- name: Make Prometheus postgres-exporter listen on adm only + lineinfile: + path: /etc/default/prometheus-postgres-exporter + regexp: '^ARGS=' + line: | + ARGS="--web.listen-address={{ ansible_hostname }}.adm.auro.re:9187" + notify: Restart prometheus-postgres-exporter + +- name: Activate prometheus-postgres-exporter service + systemd: + name: prometheus-postgres-exporter + enabled: true + daemon_reload: yes + state: started From dd48302585c49e83b7941eef7301a142e2179b53 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sat, 10 Apr 2021 18:01:55 +0200 Subject: [PATCH 02/27] Configure Prometheus and Prometheus federate to scrape Postgres Exporter --- monitoring.yml | 6 ++++++ roles/prometheus/tasks/main.yml | 8 +++++++- roles/prometheus/templates/prometheus.yml.j2 | 14 ++++++++++++++ .../templates/prometheus.yml.j2 | 1 + 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/monitoring.yml b/monitoring.yml index 23e7844..13f439d 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -99,6 +99,9 @@ prometheus_targets: - targets: | {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }} + prometheus_postgres_targets: + - targets: + - bdd.adm.auro.re prometheus_switch_snmp_targets: - targets: - yggdrasil.switch.auro.re @@ -128,6 +131,9 @@ prometheus_targets: - targets: | {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} + prometheus_postgres_targets: + - targets: + - bdd-ovh.adm.auro.re prometheus_docker_targets: - docker-ovh.adm.auro.re:8087 roles: diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 4dc518b..4db338b 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -75,6 +75,13 @@ mode: 0644 when: prometheus_docker_targets is defined +- name: Configure Prometheus postgres monitoring + copy: + content: "{{ prometheus_postgres_targets | to_nice_json }}\n" + dest: /etc/prometheus/targets_postgres.json + mode: 0644 + when: prometheus_postgres_targets is defined + - name: Activate prometheus service systemd: name: prometheus @@ -88,4 +95,3 @@ - key: 05-prometheus message: >- Prometheus est déployé sur cette machine (voir /etc/prometheus) -... diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index e97e986..8fe3424 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -101,4 +101,18 @@ scrape_configs: file_sd_configs: - files: - '/etc/prometheus/targets_docker.json' + + - job_name: postgresql + file_sd_configs: + - files: + - '/etc/prometheus/targets_postgres.json' + relabel_configs: + # Do not put :9187 in instance name, rather here + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - source_labels: [__param_target] + target_label: __address__ + replacement: '$1:9187' ... diff --git a/roles/prometheus_federate/templates/prometheus.yml.j2 b/roles/prometheus_federate/templates/prometheus.yml.j2 index 23e649b..71e6874 100644 --- a/roles/prometheus_federate/templates/prometheus.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus.yml.j2 @@ -31,6 +31,7 @@ scrape_configs: params: match[]: - '{job="servers"}' + - '{job="postgresql"}' - '{job="prometheus"}' - '{job="unifi_snmp"}' - '{job="django"}' From 9ebdf15bb905247328dfeb424d8c14bbe26f282e Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sun, 11 Apr 2021 15:58:35 +0200 Subject: [PATCH 03/27] Splite alerts on some files --- roles/prometheus/tasks/main.yml | 27 ++- .../templates/postgres.rules.yml.j2 | 219 ++++++++++++++++++ roles/prometheus/templates/prometheus.yml.j2 | 3 +- ...alert.rules.yml.j2 => server.rules.yml.j2} | 75 +----- roles/prometheus/templates/ups.rules.yml.j2 | 87 +++++++ roles/prometheus_postgres/tasks/main.yml.save | 21 ++ 6 files changed, 355 insertions(+), 77 deletions(-) create mode 100644 roles/prometheus/templates/postgres.rules.yml.j2 rename roles/prometheus/templates/{alert.rules.yml.j2 => server.rules.yml.j2} (67%) create mode 100644 roles/prometheus/templates/ups.rules.yml.j2 create mode 100644 roles/prometheus_postgres/tasks/main.yml.save diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 4db338b..3fe3db8 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -18,8 +18,33 @@ mode: u=r,g=r,o= loop: - prometheus.yml - - alert.rules.yml + notify: Restart Prometheus + +- name: Creates directory for alerts + file: + path: /etc/prometheus/alerts + state: directory + +- name: Remove old files + file: + path: "/etc/prometheus/{{ item }}" + state: absent + with_items: + - alerts.rules.yml - django.rules.yml + +- name: Configure Prometheus alerts + template: + src: "{{ item }}.j2" + dest: "/etc/prometheus/alerts/{{ item }}" + owner: prometheus + group: prometheus + mode: u=r,g=r,o= + loop: + - server.rules.yml + - django.rules.yml + - ups.rules.yml + - postgres.rules.yml notify: Restart Prometheus - name: Make Prometheus snmp-exporter listen on localhost only diff --git a/roles/prometheus/templates/postgres.rules.yml.j2 b/roles/prometheus/templates/postgres.rules.yml.j2 new file mode 100644 index 0000000..281e554 --- /dev/null +++ b/roles/prometheus/templates/postgres.rules.yml.j2 @@ -0,0 +1,219 @@ +--- +{{ ansible_managed | comment }} + +{% macro raw(string) -%} +{{ "{{" }} {{ string }} {{ "}}" }} +{%- endmacro %} + +groups: + + - name: postgres.rules + rules: + - alert: PostgresqlDown + expr: pg_up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Serveur Postgresql down (instance {{ raw('$labels.instance') }}) + + - alert: PostgresqlRestarted + expr: time() - pg_postmaster_start_time_seconds < 60 + for: 0m + labels: + severity: critical + annotations: + summary: Serveur Postgresql redémarré (instance {{ raw('$labels.instance') }}) + + - alert: PostgresqlExporterError + expr: pg_exporter_last_scrape_error > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Erreur dans l'exporter Postgresql (instance {{ raw('$labels.instance') }}) + + - alert: PostgresqlReplicationLag + expr: + pg_replication_lag > 30 + and + ON(instance) pg_replication_is_replica == 1 + for: 0m + labels: + severity: critical + annotations: + summary: >- + Réplication Postgresql lag ({{ raw('$value') }} > 30s) + (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname')}} ) + + - alert: PostgresqlTableNotVaccumed + expr: + time() - pg_stat_user_tables_last_autovacuum + > 60 * 60 * 24 + for: 0m + labels: + severity: warning + annotations: + summary: >- + La table n'a pas été aspirée depuis 24h + (Instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} ) + + - alert: PostgresqlTableNotAnalyzed + expr: + time() - pg_stat_user_tables_last_autoanalyze + > 60 * 60 * 24 + for: 0m + labels: + severity: warning + annotations: + summary: >- + Table non-analysée depuis 24h + (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}) + + - alert: PostgresqlTooManyConnections + expr: + ( + sum by (datname) + (pg_stat_activity_count{datname!~"template.*|postgres"}) + ) * 100 + > pg_settings_max_connections * 80 + for: 2m + labels: + severity: warning + annotations: + summary: >- + PostgreSQL a trop de connexions + ({{ raw('$value | printf "%.1f"') }} > 80%) + (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}) + + - alert: PostgresqlDeadLocks + expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5 + for: 0m + labels: + severity: warning + annotations: + summary: >- + PostgreSQL a des cadenas morts + (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} ) + + - alert: PostgresqlSlowQueries + expr: pg_slow_queries > 0 + for: 2m + labels: + severity: warning + annotations: + summary: >- + Présence de requêtes lentes (slow-queries) + (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} ) + + - alert: PostgresqlHighRollbackRate + expr: + ( + rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / + rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) + ) * 100 + > 2 + for: 0m + labels: + severity: warning + annotations: + summary: >- + Postgresql a un taux de retour en arrière (rollback) élevé + (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} %) + + - alert: PostgresqlWaleReplicationStopped + expr: rate(pg_xlog_position_bytes[1m]) == 0 + for: 0m + labels: + severity: critical + annotations: + summary: >- + Réplication de Postgresql WALE stopée + (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} ) + + - alert: PostgresqlHighRateStatementTimeout + expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3 + for: 0m + labels: + severity: critical + annotations: + summary: >- + Beaucoup de requêtes Postgresql sont timeout + (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} ) + + - alert: PostgresqlHighRateDeadlock + expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1 + for: 0m + labels: + severity: critical + annotations: + summary: >- + Postgresql a un fort taux de deadlock + (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} ) + +# - alert: PostgresqlReplicationLagBytes +# expr: +# (pg_xlog_position_bytes and pg_replication_is_replica == 0) +# - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1) +# > 1e+09 +# for: 0m +# labels: +# severity: critical +# annotations: +# summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} ) + + - alert: PostgresqlTooManyDeadTuples + expr: + ( + (pg_stat_user_tables_n_dead_tup > 10000) + / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) + ) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1) + for: 2m + labels: + severity: warning + annotations: + summary: >- + Les tuples morts PostgreSQL sont trop volumineux + (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} ) + + - alert: PostgresqlSplitBrain + expr: count(pg_replication_is_replica == 0) != 1 + for: 0m + labels: + severity: critical + annotations: + summary: >- + Split Brain, trop de bases de données Postgresql primaires en mode lecture-écriture + (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value') }} ) + + - alert: PostgresqlPromotedNode + expr: + pg_replication_is_replica + and + changes(pg_replication_is_replica[1m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: >- + Le serveur de secours PostgreSQL a été promu comme nœud principal + (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value') }}) + + - alert: PostgresqlTooManyLocksAcquired + expr: + ( + (sum (pg_locks_count)) + / (pg_settings_max_locks_per_transaction * pg_settings_max_connections) + ) * 100 > 20 + for: 2m + labels: + severity: critical + annotations: + summary: >- + Trop de verrous acquis sur la base de données. + Si cette alerte se produit fréquemment, nous devrons peut-être augmenter + le paramètre postgres max_locks_per_transaction + (instance {{ raw('$labels.instance') }}, value = {{ raw('$value | printf "%.1f"') }} ) + +... + diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index 8fe3424..26020dc 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -20,8 +20,7 @@ alerting: # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. rule_files: - - "alert.rules.yml" # Monitoring alerts, this is the file you may be searching! - - "django.rules.yml" # Custom rules specific for Django project monitoring + - "alerts/*.yml" # Monitoring alerts, this is the file you may be searching! # A scrape configuration containing exactly one endpoint to scrape: # Here it's Prometheus itself. diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/server.rules.yml.j2 similarity index 67% rename from roles/prometheus/templates/alert.rules.yml.j2 rename to roles/prometheus/templates/server.rules.yml.j2 index 84d8aa2..ac09881 100644 --- a/roles/prometheus/templates/alert.rules.yml.j2 +++ b/roles/prometheus/templates/server.rules.yml.j2 @@ -7,7 +7,7 @@ groups: - - name: alert.rules + - name: server.rules rules: - alert: InstanceDown @@ -149,78 +149,5 @@ groups: summary: > Charge à {{ raw('$value') }} - - alert: UpsOutputSourceChanged - expr: upsOutputSource != 3 - for: 0m - labels: - severity: critical - annotations: - summary: >- - Source d'alimentation changée - - - alert: UpsBatteryStatus - expr: upsBatteryStatus == 3 - for: 0m - labels: - severity: warning - annotations: - summary: >- - État de la batterie faible - - - alert: UpsBatteryStatus - expr: upsBatteryStatus == 4 - for: 0m - labels: - severity: critical - annotations: - summary: >- - État de la batterie critique - - - alert: UpsHighLoad - expr: upsOutputPercentLoad > 70 - for: 3m - labels: - severity: critical - annotations: - summary: >- - Charge de {{ raw('$value | printf "%.1f"') }}% - - - alert: UpsWrongInputVoltage - expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) - for: 5m - labels: - severity: warning - annotations: - summary: >- - Tension d'entrée de {{ raw('$value') }}V - - - alert: UpsWrongOutputVoltage - expr: >- - abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d])) - < 3 * stddev_over_time(upsOutputVoltage[1d]) - for: 5m - labels: - severity: warning - annotations: - summary: >- - Tension de sortie de {{ raw('$value') }}V - - - alert: UpsTimeRemaining - expr: upsEstimatedMinutesRemaining < 8 - for: 0m - labels: - severity: warning - annotations: - summary: >- - Autonomie restante de {{ raw('$value') }} min - - - alert: UpsTimeRemaining - expr: upsEstimatedMinutesRemaining < 5 - for: 0m - labels: - severity: critical - annotations: - summary: >- - Autonomie restante de {{ raw('$value') }} min ... diff --git a/roles/prometheus/templates/ups.rules.yml.j2 b/roles/prometheus/templates/ups.rules.yml.j2 new file mode 100644 index 0000000..eafdee3 --- /dev/null +++ b/roles/prometheus/templates/ups.rules.yml.j2 @@ -0,0 +1,87 @@ +--- +{{ ansible_managed | comment }} + +{% macro raw(string) -%} +{{ "{{" }} {{ string }} {{ "}}" }} +{%- endmacro %} + +groups: + + - name: ups.rules + rules: + + - alert: UpsOutputSourceChanged + expr: upsOutputSource != 3 + for: 0m + labels: + severity: critical + annotations: + summary: >- + Source d'alimentation changée + + - alert: UpsBatteryStatus + expr: upsBatteryStatus == 3 + for: 0m + labels: + severity: warning + annotations: + summary: >- + État de la batterie faible + + - alert: UpsBatteryStatus + expr: upsBatteryStatus == 4 + for: 0m + labels: + severity: critical + annotations: + summary: >- + État de la batterie critique + + - alert: UpsHighLoad + expr: upsOutputPercentLoad > 70 + for: 3m + labels: + severity: critical + annotations: + summary: >- + Charge de {{ raw('$value | printf "%.1f"') }}% + + - alert: UpsWrongInputVoltage + expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) + for: 5m + labels: + severity: warning + annotations: + summary: >- + Tension d'entrée de {{ raw('$value') }}V + + - alert: UpsWrongOutputVoltage + expr: >- + abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d])) + < 3 * stddev_over_time(upsOutputVoltage[1d]) + for: 5m + labels: + severity: warning + annotations: + summary: >- + Tension de sortie de {{ raw('$value') }}V + + - alert: UpsTimeRemaining + expr: upsEstimatedMinutesRemaining < 8 + for: 0m + labels: + severity: warning + annotations: + summary: >- + Autonomie restante de {{ raw('$value') }} min + + - alert: UpsTimeRemaining + expr: upsEstimatedMinutesRemaining < 5 + for: 0m + labels: + severity: critical + annotations: + summary: >- + Autonomie restante de {{ raw('$value') }} min + +... diff --git a/roles/prometheus_postgres/tasks/main.yml.save b/roles/prometheus_postgres/tasks/main.yml.save new file mode 100644 index 0000000..2ef8d87 --- /dev/null +++ b/roles/prometheus_postgres/tasks/main.yml.save @@ -0,0 +1,21 @@ +--- +- name: Install Prometheus postgres-exporter + apt: + update_cache: true + name: prometheus-postgres-exporter + register: apt_result + retries: 3 + until: apt_result is succeeded + when: + - ansible_lsb.codename != 'bullseye' + +# Doesn't work on Debian Stretch with the old prometheus package +- name: Make Prometheus node-exporter listen on adm only + lineinfile: + path: /etc/default/prometheus-node-exporter + regexp: '^ARGS=' + line: | + ARGS="--web.listen-address={{ ansible_hostname }}.adm.auro.re:9100" + notify: Restart prometheus-node-exporter + +git push --set-upstream origin add_ups_231 From 6775d9ecde8745225f2294fa30573fe48cf8dfd9 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sun, 11 Apr 2021 16:43:34 +0200 Subject: [PATCH 04/27] Add docker rules --- roles/prometheus/tasks/main.yml | 1 + .../prometheus/templates/docker.rules.yml.j2 | 50 +++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 roles/prometheus/templates/docker.rules.yml.j2 diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 3fe3db8..075da46 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -42,6 +42,7 @@ mode: u=r,g=r,o= loop: - server.rules.yml + - docker.rules.yml - django.rules.yml - ups.rules.yml - postgres.rules.yml diff --git a/roles/prometheus/templates/docker.rules.yml.j2 b/roles/prometheus/templates/docker.rules.yml.j2 new file mode 100644 index 0000000..8ccc565 --- /dev/null +++ b/roles/prometheus/templates/docker.rules.yml.j2 @@ -0,0 +1,50 @@ +--- +{{ ansible_managed | comment }} + +{% macro raw(string) -%} +{{ "{{" }} {{ string }} {{ "}}" }} +{%- endmacro %} + +groups: + + - name: docker.rules + rules: + + - alert: ContainerDown + expr: docker_container_running_state + for: 0m + labels: + severity: critical + annotations: + summary: >- + Container Docker mort + (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}) + + - alert: ContainerFailed + expr: sum(increase(docker_container_restart_count[5m])) > 2 + for: 0m + labels: + severity: critical + annotations: + summary: >- + Container Docker redémarre souvent + (instance raw('{{ $labels.instance') }}, container {{ raw('$labels.name') }}) + + - alert: ContainerFailed + expr: + ( + docker_container_cpu_used_total + / + docker_container_cpu_capacity_total + ) * 100 + > 30 + for: 0m + labels: + severity: critical + annotations: + summary: >- + Container Docker utilise beaucoup de CPU + (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}, + value {{ raw('$value | printf "%.1f"'') }}) + +... From 9d18ebb7f14a5b4f543ed9a9066711c46877620b Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sun, 11 Apr 2021 17:18:32 +0200 Subject: [PATCH 05/27] Fix docker rules --- roles/prometheus/tasks/main.yml | 1 + .../prometheus/templates/docker.rules.yml.j2 | 74 +++++++++---------- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 075da46..0c76907 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -33,6 +33,7 @@ - alerts.rules.yml - django.rules.yml + - name: Configure Prometheus alerts template: src: "{{ item }}.j2" diff --git a/roles/prometheus/templates/docker.rules.yml.j2 b/roles/prometheus/templates/docker.rules.yml.j2 index 8ccc565..ce825ad 100644 --- a/roles/prometheus/templates/docker.rules.yml.j2 +++ b/roles/prometheus/templates/docker.rules.yml.j2 @@ -10,41 +10,41 @@ groups: - name: docker.rules rules: - - alert: ContainerDown - expr: docker_container_running_state - for: 0m - labels: - severity: critical - annotations: - summary: >- - Container Docker mort - (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}) - - - alert: ContainerFailed - expr: sum(increase(docker_container_restart_count[5m])) > 2 - for: 0m - labels: - severity: critical - annotations: - summary: >- - Container Docker redémarre souvent - (instance raw('{{ $labels.instance') }}, container {{ raw('$labels.name') }}) - - - alert: ContainerFailed - expr: - ( - docker_container_cpu_used_total - / - docker_container_cpu_capacity_total - ) * 100 - > 30 - for: 0m - labels: - severity: critical - annotations: - summary: >- - Container Docker utilise beaucoup de CPU - (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}, - value {{ raw('$value | printf "%.1f"'') }}) - + - alert: ContainerDown + expr: docker_container_running_state != 1 + for: 0m + labels: + severity: critical + annotations: + summary: >- + Container Docker mort + (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}) + + - alert: ContainerFailed + expr: sum(increase(docker_container_restart_count[5m])) > 2 + for: 0m + labels: + severity: critical + annotations: + summary: >- + Container Docker redémarre souvent + (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}) + + - alert: ContainerFailed + expr: + ( + docker_container_cpu_used_total + / + docker_container_cpu_capacity_total + ) * 100 + > 30 + for: 0m + labels: + severity: critical + annotations: + summary: >- + Container Docker utilise beaucoup de CPU + (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}, + value {{ raw('$value | printf "%.1f"') }}) + ... From 304437da978d89c6e59afe4716cfed59c39473cf Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sun, 11 Apr 2021 20:56:40 +0200 Subject: [PATCH 06/27] Remove .save file --- roles/prometheus_postgres/tasks/main.yml.save | 21 ------------------- 1 file changed, 21 deletions(-) delete mode 100644 roles/prometheus_postgres/tasks/main.yml.save diff --git a/roles/prometheus_postgres/tasks/main.yml.save b/roles/prometheus_postgres/tasks/main.yml.save deleted file mode 100644 index 2ef8d87..0000000 --- a/roles/prometheus_postgres/tasks/main.yml.save +++ /dev/null @@ -1,21 +0,0 @@ ---- -- name: Install Prometheus postgres-exporter - apt: - update_cache: true - name: prometheus-postgres-exporter - register: apt_result - retries: 3 - until: apt_result is succeeded - when: - - ansible_lsb.codename != 'bullseye' - -# Doesn't work on Debian Stretch with the old prometheus package -- name: Make Prometheus node-exporter listen on adm only - lineinfile: - path: /etc/default/prometheus-node-exporter - regexp: '^ARGS=' - line: | - ARGS="--web.listen-address={{ ansible_hostname }}.adm.auro.re:9100" - notify: Restart prometheus-node-exporter - -git push --set-upstream origin add_ups_231 From c48fe1ae1772b21282d8349e88633ed11ab12798 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sun, 11 Apr 2021 20:57:53 +0200 Subject: [PATCH 07/27] 7% rollback for the warning --- roles/prometheus/templates/postgres.rules.yml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/prometheus/templates/postgres.rules.yml.j2 b/roles/prometheus/templates/postgres.rules.yml.j2 index 281e554..61af5dc 100644 --- a/roles/prometheus/templates/postgres.rules.yml.j2 +++ b/roles/prometheus/templates/postgres.rules.yml.j2 @@ -112,7 +112,7 @@ groups: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) ) * 100 - > 2 + > 7 for: 0m labels: severity: warning From 749188e297a2c59bf410dd85863dc4bb58d850c5 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sun, 11 Apr 2021 21:27:43 +0200 Subject: [PATCH 08/27] Add a group with all radius --- hosts | 13 +++++++++++++ monitoring.yml | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/hosts b/hosts index 0f1c61a..5403ae1 100644 --- a/hosts +++ b/hosts @@ -549,3 +549,16 @@ proxy.adm.auro.re bdd.adm.auro.re bdd-ovh.adm.auro.re re2o-db.adm.auro.re + +[radius] +radius-aurore.adm.auro.re +radius-fleming.adm.auro.re +radius-fleming-backup.adm.auro.re +radius-edc.adm.auro.re +radius-edc-backup.adm.auro.re +radius-gs.adm.auro.re +radius-gs-backup.adm.auro.re +radius-pacaterie.adm.auro.re +radius-pacaterie-backup.adm.auro.re +radius-rives.adm.auro.re +radis-rives-backup.adm.auro.re diff --git a/monitoring.yml b/monitoring.yml index 13f439d..9d7495a 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -101,7 +101,7 @@ {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }} prometheus_postgres_targets: - targets: - - bdd.adm.auro.re + - {{ groups['bdd'] | list | sort }} prometheus_switch_snmp_targets: - targets: - yggdrasil.switch.auro.re @@ -158,7 +158,7 @@ - prometheus_federate #Postgres Exporters -- hosts: bdd.adm.auro.re,bdd-ovh.adm.auro.re +- hosts: bdd,radius-*.adm.auro.re roles: - prometheus_postgres From ca3d89e671acf49e3581a7717342f8ed5925d224 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sun, 11 Apr 2021 21:29:02 +0200 Subject: [PATCH 09/27] Install postgresql on radius. Monitore it on prometheus-aurore --- monitoring.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monitoring.yml b/monitoring.yml index 9d7495a..6628165 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -101,7 +101,7 @@ {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }} prometheus_postgres_targets: - targets: - - {{ groups['bdd'] | list | sort }} + - {{ groups['bdd'] + groups['radius'] | list | sort }} prometheus_switch_snmp_targets: - targets: - yggdrasil.switch.auro.re @@ -158,7 +158,7 @@ - prometheus_federate #Postgres Exporters -- hosts: bdd,radius-*.adm.auro.re +- hosts: bdd,radius roles: - prometheus_postgres From 7d99cef57c681830f9c3c58030edf7caa3d3916d Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sun, 11 Apr 2021 21:32:20 +0200 Subject: [PATCH 10/27] Fix typo --- monitoring.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monitoring.yml b/monitoring.yml index 6628165..6b88b07 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -100,8 +100,8 @@ - targets: | {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }} prometheus_postgres_targets: - - targets: - - {{ groups['bdd'] + groups['radius'] | list | sort }} + - targets: | + {{ groups['bdd'] + groups['radius'] | list | sort }} prometheus_switch_snmp_targets: - targets: - yggdrasil.switch.auro.re From bdcdb8ceaeb6d5f93b90eebf99a04a8cc03c8554 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sun, 11 Apr 2021 21:32:53 +0200 Subject: [PATCH 11/27] Radius, not a radis ! Fix typo... --- hosts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hosts b/hosts index 5403ae1..75243c8 100644 --- a/hosts +++ b/hosts @@ -561,4 +561,4 @@ radius-gs-backup.adm.auro.re radius-pacaterie.adm.auro.re radius-pacaterie-backup.adm.auro.re radius-rives.adm.auro.re -radis-rives-backup.adm.auro.re +radius-rives-backup.adm.auro.re From 764f0f106d7aff4a63db00d3d4f15a2c929cbaa5 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sun, 11 Apr 2021 21:38:29 +0200 Subject: [PATCH 12/27] Install postgres exporter when it is bullseye or buster --- roles/prometheus_postgres/tasks/main.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/roles/prometheus_postgres/tasks/main.yml b/roles/prometheus_postgres/tasks/main.yml index 10d612c..80136bd 100644 --- a/roles/prometheus_postgres/tasks/main.yml +++ b/roles/prometheus_postgres/tasks/main.yml @@ -6,8 +6,6 @@ register: apt_result retries: 3 until: apt_result is succeeded - when: - - ansible_lsb.codename == 'bullseye' - name: Make Prometheus postgres-exporter connect to databases using peercred lineinfile: From 6c64bb214c0232d1721942a1f77cb94925b45ecf Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sun, 11 Apr 2021 22:01:21 +0200 Subject: [PATCH 13/27] fix CI --- monitoring.yml | 4 ++-- roles/prometheus/tasks/main.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/monitoring.yml b/monitoring.yml index 6b88b07..16cd489 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -130,7 +130,7 @@ # Prometheus targets.json prometheus_targets: - targets: | - {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} + {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} prometheus_postgres_targets: - targets: - bdd-ovh.adm.auro.re @@ -145,7 +145,7 @@ prometheus_alertmanager: docker-ovh.adm.auro.re:9093 snmp_unifi_password: "{{ vault_snmp_unifi_password }}" - # Prometheus targets.json + #Prometheus targets.json prometheus_targets: - prometheus-edc.adm.auro.re - prometheus-gs.adm.auro.re diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 0c76907..b1fa112 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -20,12 +20,12 @@ - prometheus.yml notify: Restart Prometheus -- name: Creates directory for alerts +- name: Creates directory for alerts file: path: /etc/prometheus/alerts state: directory -- name: Remove old files +- name: Remove old files file: path: "/etc/prometheus/{{ item }}" state: absent From 6e376a72e31936c20c02de972a582ef876c44f33 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Sun, 11 Apr 2021 22:04:05 +0200 Subject: [PATCH 14/27] fix CI --- monitoring.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monitoring.yml b/monitoring.yml index 16cd489..c2f607b 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -129,8 +129,8 @@ # Prometheus targets.json prometheus_targets: - - targets: | - {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} + - targets: | + {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} prometheus_postgres_targets: - targets: - bdd-ovh.adm.auro.re From 1908deee9c5ace7ec2a9eaac7a9e77da06a5667e Mon Sep 17 00:00:00 2001 From: pz2891 Date: Mon, 12 Apr 2021 10:01:39 +0200 Subject: [PATCH 15/27] fix CI --- monitoring.yml | 8 ++++---- roles/prometheus/tasks/main.yml | 1 + roles/prometheus_postgres/tasks/main.yml | 2 +- roles/radius/tasks/main.yml | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/monitoring.yml b/monitoring.yml index c2f607b..978197f 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -132,8 +132,8 @@ - targets: | {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} prometheus_postgres_targets: - - targets: - - bdd-ovh.adm.auro.re + - targets: + - bdd-ovh.adm.auro.re prometheus_docker_targets: - docker-ovh.adm.auro.re:8087 roles: @@ -145,7 +145,7 @@ prometheus_alertmanager: docker-ovh.adm.auro.re:9093 snmp_unifi_password: "{{ vault_snmp_unifi_password }}" - #Prometheus targets.json + # Prometheus targets.json prometheus_targets: - prometheus-edc.adm.auro.re - prometheus-gs.adm.auro.re @@ -157,7 +157,7 @@ roles: - prometheus_federate -#Postgres Exporters +# Postgres Exporters - hosts: bdd,radius roles: - prometheus_postgres diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index b1fa112..1d55290 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -24,6 +24,7 @@ file: path: /etc/prometheus/alerts state: directory + mode: 0644 - name: Remove old files file: diff --git a/roles/prometheus_postgres/tasks/main.yml b/roles/prometheus_postgres/tasks/main.yml index 80136bd..734fa3c 100644 --- a/roles/prometheus_postgres/tasks/main.yml +++ b/roles/prometheus_postgres/tasks/main.yml @@ -35,5 +35,5 @@ systemd: name: prometheus-postgres-exporter enabled: true - daemon_reload: yes + daemon_reload: true state: started diff --git a/roles/radius/tasks/main.yml b/roles/radius/tasks/main.yml index e79d742..bafb166 100644 --- a/roles/radius/tasks/main.yml +++ b/roles/radius/tasks/main.yml @@ -129,7 +129,7 @@ name: - postgresql - postgresql-client-11=11.7-0+deb10u1 - force: yes + force: true - name: Install postgresql ansible module requirement(s) pip: From d891559e286201d7a46ff3b2a88b16862755bafe Mon Sep 17 00:00:00 2001 From: pz2891 Date: Mon, 12 Apr 2021 10:03:53 +0200 Subject: [PATCH 16/27] Fix CI --- monitoring.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monitoring.yml b/monitoring.yml index 978197f..c01fb0b 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -129,7 +129,7 @@ # Prometheus targets.json prometheus_targets: - - targets: | + - targets: | {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} prometheus_postgres_targets: - targets: From 954e3e08923f1f8032f72b513a483f998220e879 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Mon, 12 Apr 2021 10:58:59 +0200 Subject: [PATCH 17/27] End of yaml file (bad copy/paste) --- roles/prometheus/tasks/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 1d55290..e78248e 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -123,3 +123,4 @@ - key: 05-prometheus message: >- Prometheus est déployé sur cette machine (voir /etc/prometheus) +... From 676cc716cf05de238232f7b8bf253669e728ea6e Mon Sep 17 00:00:00 2001 From: pz2891 Date: Mon, 12 Apr 2021 11:00:31 +0200 Subject: [PATCH 18/27] Modify label for the alert --- roles/prometheus/templates/docker.rules.yml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/prometheus/templates/docker.rules.yml.j2 b/roles/prometheus/templates/docker.rules.yml.j2 index ce825ad..23eec40 100644 --- a/roles/prometheus/templates/docker.rules.yml.j2 +++ b/roles/prometheus/templates/docker.rules.yml.j2 @@ -17,7 +17,7 @@ groups: severity: critical annotations: summary: >- - Container Docker mort + Container Docker éteint / tombé (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}) - alert: ContainerFailed From 3320e3e0c65a5d580c8250c7528531625685746d Mon Sep 17 00:00:00 2001 From: pz2891 Date: Mon, 12 Apr 2021 11:01:43 +0200 Subject: [PATCH 19/27] Update the labels for the alert (make complete tenses) --- roles/prometheus/templates/docker.rules.yml.j2 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/roles/prometheus/templates/docker.rules.yml.j2 b/roles/prometheus/templates/docker.rules.yml.j2 index 23eec40..e891b1c 100644 --- a/roles/prometheus/templates/docker.rules.yml.j2 +++ b/roles/prometheus/templates/docker.rules.yml.j2 @@ -17,7 +17,7 @@ groups: severity: critical annotations: summary: >- - Container Docker éteint / tombé + Le container Docker est éteint / tombé (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}) - alert: ContainerFailed @@ -27,7 +27,7 @@ groups: severity: critical annotations: summary: >- - Container Docker redémarre souvent + Le container Docker redémarre souvent (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}) - alert: ContainerFailed @@ -43,7 +43,7 @@ groups: severity: critical annotations: summary: >- - Container Docker utilise beaucoup de CPU + Le container Docker utilise beaucoup de CPU (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}, value {{ raw('$value | printf "%.1f"') }}) From 5d9a6599e834beee9451d3c41a44fa5d269fa9c3 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Mon, 12 Apr 2021 11:10:15 +0200 Subject: [PATCH 20/27] Fix some typos, in accordance to Solal's comments --- .../templates/postgres.rules.yml.j2 | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/roles/prometheus/templates/postgres.rules.yml.j2 b/roles/prometheus/templates/postgres.rules.yml.j2 index 61af5dc..16695cf 100644 --- a/roles/prometheus/templates/postgres.rules.yml.j2 +++ b/roles/prometheus/templates/postgres.rules.yml.j2 @@ -15,7 +15,7 @@ groups: labels: severity: critical annotations: - summary: Serveur Postgresql down (instance {{ raw('$labels.instance') }}) + summary: Serveur PostgreSQL down (instance {{ raw('$labels.instance') }}) - alert: PostgresqlRestarted expr: time() - pg_postmaster_start_time_seconds < 60 @@ -23,7 +23,7 @@ groups: labels: severity: critical annotations: - summary: Serveur Postgresql redémarré (instance {{ raw('$labels.instance') }}) + summary: Serveur PostgreSQL redémarré (instance {{ raw('$labels.instance') }}) - alert: PostgresqlExporterError expr: pg_exporter_last_scrape_error > 0 @@ -43,7 +43,7 @@ groups: severity: critical annotations: summary: >- - Réplication Postgresql lag ({{ raw('$value') }} > 30s) + La réplication PostgreSQL lag ({{ raw('$value') }} > 30s) (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname')}} ) - alert: PostgresqlTableNotVaccumed @@ -55,7 +55,7 @@ groups: severity: warning annotations: summary: >- - La table n'a pas été aspirée depuis 24h + Le démon autovacuum n'a pas été lancé depuis 24h (Instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} ) - alert: PostgresqlTableNotAnalyzed @@ -93,7 +93,7 @@ groups: severity: warning annotations: summary: >- - PostgreSQL a des cadenas morts + PostgreSQL a plus de 5 deadlocks. (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} ) - alert: PostgresqlSlowQueries @@ -118,7 +118,7 @@ groups: severity: warning annotations: summary: >- - Postgresql a un taux de retour en arrière (rollback) élevé + PostgreSQL a un taux de retour en arrière (rollback) élevé (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} %) - alert: PostgresqlWaleReplicationStopped @@ -128,7 +128,7 @@ groups: severity: critical annotations: summary: >- - Réplication de Postgresql WALE stopée + Réplication de PostgreSQL WALE stopée (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} ) - alert: PostgresqlHighRateStatementTimeout @@ -138,7 +138,7 @@ groups: severity: critical annotations: summary: >- - Beaucoup de requêtes Postgresql sont timeout + Beaucoup de requêtes PostgreSQL sont timeout (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} ) - alert: PostgresqlHighRateDeadlock @@ -148,7 +148,7 @@ groups: severity: critical annotations: summary: >- - Postgresql a un fort taux de deadlock + PostgreSQL a un fort taux de deadlock (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} ) # - alert: PostgresqlReplicationLagBytes @@ -183,7 +183,7 @@ groups: severity: critical annotations: summary: >- - Split Brain, trop de bases de données Postgresql primaires en mode lecture-écriture + Split Brain : trop de bases de données PostgreSQL primaires en mode lecture-écriture (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value') }} ) - alert: PostgresqlPromotedNode @@ -210,7 +210,7 @@ groups: severity: critical annotations: summary: >- - Trop de verrous acquis sur la base de données. + Trop de deadlocks acquis sur la base de données. Si cette alerte se produit fréquemment, nous devrons peut-être augmenter le paramètre postgres max_locks_per_transaction (instance {{ raw('$labels.instance') }}, value = {{ raw('$value | printf "%.1f"') }} ) From 226b55b0d123f43d5aba81edae207f7c196d59c0 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 14 Apr 2021 19:10:42 +0200 Subject: [PATCH 21/27] Update alerts (remove instance, translations) --- .../templates/postgres.rules.yml.j2 | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/roles/prometheus/templates/postgres.rules.yml.j2 b/roles/prometheus/templates/postgres.rules.yml.j2 index 16695cf..bf10d2d 100644 --- a/roles/prometheus/templates/postgres.rules.yml.j2 +++ b/roles/prometheus/templates/postgres.rules.yml.j2 @@ -15,7 +15,7 @@ groups: labels: severity: critical annotations: - summary: Serveur PostgreSQL down (instance {{ raw('$labels.instance') }}) + summary: Serveur PostgreSQL down - alert: PostgresqlRestarted expr: time() - pg_postmaster_start_time_seconds < 60 @@ -23,7 +23,7 @@ groups: labels: severity: critical annotations: - summary: Serveur PostgreSQL redémarré (instance {{ raw('$labels.instance') }}) + summary: Serveur PostgreSQL redémarré - alert: PostgresqlExporterError expr: pg_exporter_last_scrape_error > 0 @@ -31,7 +31,7 @@ groups: labels: severity: critical annotations: - summary: Erreur dans l'exporter Postgresql (instance {{ raw('$labels.instance') }}) + summary: Erreur dans l'exporter PostgreSQL - alert: PostgresqlReplicationLag expr: @@ -44,7 +44,7 @@ groups: annotations: summary: >- La réplication PostgreSQL lag ({{ raw('$value') }} > 30s) - (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname')}} ) + (base de données {{ raw('$labels.datname')}} ) - alert: PostgresqlTableNotVaccumed expr: @@ -56,7 +56,7 @@ groups: annotations: summary: >- Le démon autovacuum n'a pas été lancé depuis 24h - (Instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} ) + (base de données {{ raw('$labels.datname') }} ) - alert: PostgresqlTableNotAnalyzed expr: @@ -68,7 +68,7 @@ groups: annotations: summary: >- Table non-analysée depuis 24h - (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}) + (base de données {{ raw('$labels.datname') }}) - alert: PostgresqlTooManyConnections expr: @@ -84,7 +84,7 @@ groups: summary: >- PostgreSQL a trop de connexions ({{ raw('$value | printf "%.1f"') }} > 80%) - (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}) + (base de données {{ raw('$labels.datname') }}) - alert: PostgresqlDeadLocks expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5 @@ -94,7 +94,7 @@ groups: annotations: summary: >- PostgreSQL a plus de 5 deadlocks. - (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} ) + (base de données {{ raw('$labels.datname') }} ) - alert: PostgresqlSlowQueries expr: pg_slow_queries > 0 @@ -104,7 +104,7 @@ groups: annotations: summary: >- Présence de requêtes lentes (slow-queries) - (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} ) + (base de données {{ raw('$labels.datname') }} ) - alert: PostgresqlHighRollbackRate expr: @@ -119,7 +119,7 @@ groups: annotations: summary: >- PostgreSQL a un taux de retour en arrière (rollback) élevé - (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} %) + (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} %) - alert: PostgresqlWaleReplicationStopped expr: rate(pg_xlog_position_bytes[1m]) == 0 @@ -129,7 +129,7 @@ groups: annotations: summary: >- Réplication de PostgreSQL WALE stopée - (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} ) + (base de données {{ raw('$labels.datname') }} ) - alert: PostgresqlHighRateStatementTimeout expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3 @@ -139,7 +139,7 @@ groups: annotations: summary: >- Beaucoup de requêtes PostgreSQL sont timeout - (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} ) + (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} ) - alert: PostgresqlHighRateDeadlock expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1 @@ -149,7 +149,7 @@ groups: annotations: summary: >- PostgreSQL a un fort taux de deadlock - (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} ) + (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} ) # - alert: PostgresqlReplicationLagBytes # expr: @@ -174,7 +174,7 @@ groups: annotations: summary: >- Les tuples morts PostgreSQL sont trop volumineux - (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} ) + (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} ) - alert: PostgresqlSplitBrain expr: count(pg_replication_is_replica == 0) != 1 @@ -184,7 +184,7 @@ groups: annotations: summary: >- Split Brain : trop de bases de données PostgreSQL primaires en mode lecture-écriture - (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value') }} ) + (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }} ) - alert: PostgresqlPromotedNode expr: @@ -197,7 +197,7 @@ groups: annotations: summary: >- Le serveur de secours PostgreSQL a été promu comme nœud principal - (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value') }}) + (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }}) - alert: PostgresqlTooManyLocksAcquired expr: @@ -213,7 +213,7 @@ groups: Trop de deadlocks acquis sur la base de données. Si cette alerte se produit fréquemment, nous devrons peut-être augmenter le paramètre postgres max_locks_per_transaction - (instance {{ raw('$labels.instance') }}, value = {{ raw('$value | printf "%.1f"') }} ) + (Valeur = {{ raw('$value | printf "%.1f"') }} ) ... From e4d2416722f0ec5c7cc641c26db091c1b5c8f9a3 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 14 Apr 2021 19:27:13 +0200 Subject: [PATCH 22/27] fix typo --- roles/prometheus/templates/postgres.rules.yml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/prometheus/templates/postgres.rules.yml.j2 b/roles/prometheus/templates/postgres.rules.yml.j2 index bf10d2d..99103c5 100644 --- a/roles/prometheus/templates/postgres.rules.yml.j2 +++ b/roles/prometheus/templates/postgres.rules.yml.j2 @@ -128,7 +128,7 @@ groups: severity: critical annotations: summary: >- - Réplication de PostgreSQL WALE stopée + Réplication de PostgreSQL WALE stoppée (base de données {{ raw('$labels.datname') }} ) - alert: PostgresqlHighRateStatementTimeout From fde52f2e42dc754c7d3f4d3225091282e5c84ae3 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 14 Apr 2021 19:29:12 +0200 Subject: [PATCH 23/27] Alerts repository owned by prometheus --- roles/prometheus/tasks/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index e78248e..17e377f 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -24,6 +24,8 @@ file: path: /etc/prometheus/alerts state: directory + owner: prometheus + group: prometheus mode: 0644 - name: Remove old files From 1b0bff4c51019628f897e35161226a9e35aeacae Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 14 Apr 2021 19:51:47 +0200 Subject: [PATCH 24/27] Fix deployment and add prometheus groups for hosts --- hosts | 10 ++++++++++ roles/prometheus/tasks/main.yml | 4 ++-- roles/prometheus/templates/postgres.rules.yml.j2 | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/hosts b/hosts index 75243c8..e6ca0b7 100644 --- a/hosts +++ b/hosts @@ -562,3 +562,13 @@ radius-pacaterie.adm.auro.re radius-pacaterie-backup.adm.auro.re radius-rives.adm.auro.re radius-rives-backup.adm.auro.re + +[prometheus] +prometheus-ovh.adm.auro.re +prometheus-aurore.adm.auro.re +prometheus-rives.adm.auro.re +prometheus-gs.adm.auro.re +prometheus-edc.adm.auro.re +prometheus-pacaterie.adm.auro.re +prometheus-fleming.adm.auro.re +prometheus-federate.adm.auro.re diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 17e377f..0946165 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -26,14 +26,14 @@ state: directory owner: prometheus group: prometheus - mode: 0644 + mode: 0755 - name: Remove old files file: path: "/etc/prometheus/{{ item }}" state: absent with_items: - - alerts.rules.yml + - alert.rules.yml - django.rules.yml diff --git a/roles/prometheus/templates/postgres.rules.yml.j2 b/roles/prometheus/templates/postgres.rules.yml.j2 index 99103c5..0ec4952 100644 --- a/roles/prometheus/templates/postgres.rules.yml.j2 +++ b/roles/prometheus/templates/postgres.rules.yml.j2 @@ -44,7 +44,7 @@ groups: annotations: summary: >- La réplication PostgreSQL lag ({{ raw('$value') }} > 30s) - (base de données {{ raw('$labels.datname')}} ) + (base de données {{ raw('$labels.datname') }} ) - alert: PostgresqlTableNotVaccumed expr: From 013743f910367704131695d1d811b983c50584a1 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 14 Apr 2021 19:54:37 +0200 Subject: [PATCH 25/27] typo in docker rules --- roles/prometheus/templates/docker.rules.yml.j2 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/roles/prometheus/templates/docker.rules.yml.j2 b/roles/prometheus/templates/docker.rules.yml.j2 index e891b1c..d911698 100644 --- a/roles/prometheus/templates/docker.rules.yml.j2 +++ b/roles/prometheus/templates/docker.rules.yml.j2 @@ -18,7 +18,7 @@ groups: annotations: summary: >- Le container Docker est éteint / tombé - (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}) + (container {{ raw('$labels.name') }}) - alert: ContainerFailed expr: sum(increase(docker_container_restart_count[5m])) > 2 @@ -28,7 +28,7 @@ groups: annotations: summary: >- Le container Docker redémarre souvent - (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}) + (container {{ raw('$labels.name') }}) - alert: ContainerFailed expr: @@ -44,7 +44,7 @@ groups: annotations: summary: >- Le container Docker utilise beaucoup de CPU - (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }}, - value {{ raw('$value | printf "%.1f"') }}) + (container {{ raw('$labels.name') }}, + valeur {{ raw('$value | printf "%.1f"') }}) ... From 11d0b46ef0a267ecb339b4fa4507a99c72a3e20a Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 14 Apr 2021 20:00:16 +0200 Subject: [PATCH 26/27] Remove port for docker instances. Remove 'remove old files' tasks --- roles/prometheus/tasks/main.yml | 9 --------- roles/prometheus/templates/prometheus.yml.j2 | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 0946165..d141ecc 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -28,15 +28,6 @@ group: prometheus mode: 0755 -- name: Remove old files - file: - path: "/etc/prometheus/{{ item }}" - state: absent - with_items: - - alert.rules.yml - - django.rules.yml - - - name: Configure Prometheus alerts template: src: "{{ item }}.j2" diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index 26020dc..bae1d2b 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -100,6 +100,15 @@ scrape_configs: file_sd_configs: - files: - '/etc/prometheus/targets_docker.json' + relabel_configs: + # Do not put :8087 in instance name, rather here + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - source_labels: [__param_target] + target_label: __address__ + replacement: '$1:8087' - job_name: postgresql file_sd_configs: From f409fb53cbc295a34010747e6456bfd5c71fa6e4 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 14 Apr 2021 20:11:23 +0200 Subject: [PATCH 27/27] remove port for docker --- monitoring.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monitoring.yml b/monitoring.yml index c01fb0b..114945d 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -135,7 +135,7 @@ - targets: - bdd-ovh.adm.auro.re prometheus_docker_targets: - - docker-ovh.adm.auro.re:8087 + - docker-ovh.adm.auro.re roles: - prometheus