diff --git a/hosts b/hosts
index 0f1c61a..e6ca0b7 100644
--- a/hosts
+++ b/hosts
@@ -549,3 +549,26 @@ proxy.adm.auro.re
 bdd.adm.auro.re
 bdd-ovh.adm.auro.re
 re2o-db.adm.auro.re
+
+[radius]
+radius-aurore.adm.auro.re
+radius-fleming.adm.auro.re
+radius-fleming-backup.adm.auro.re
+radius-edc.adm.auro.re
+radius-edc-backup.adm.auro.re
+radius-gs.adm.auro.re
+radius-gs-backup.adm.auro.re
+radius-pacaterie.adm.auro.re
+radius-pacaterie-backup.adm.auro.re
+radius-rives.adm.auro.re
+radius-rives-backup.adm.auro.re
+
+[prometheus]
+prometheus-ovh.adm.auro.re
+prometheus-aurore.adm.auro.re
+prometheus-rives.adm.auro.re
+prometheus-gs.adm.auro.re
+prometheus-edc.adm.auro.re
+prometheus-pacaterie.adm.auro.re
+prometheus-fleming.adm.auro.re
+prometheus-federate.adm.auro.re
diff --git a/monitoring.yml b/monitoring.yml
index ac0d59f..114945d 100755
--- a/monitoring.yml
+++ b/monitoring.yml
@@ -99,6 +99,9 @@
     prometheus_targets:
       - targets: |
           {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
+    prometheus_postgres_targets:  # parentheses: Jinja filters bind tighter than "+", so sort the whole list
+      - targets: |
+          {{ (groups['bdd'] + groups['radius']) | list | sort }}
     prometheus_switch_snmp_targets:
       - targets:
           - yggdrasil.switch.auro.re
@@ -128,8 +131,11 @@
     prometheus_targets:
       - targets: |
           {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
+    prometheus_postgres_targets:
+      - targets:
+          - bdd-ovh.adm.auro.re
     prometheus_docker_targets:
-      - docker-ovh.adm.auro.re:8087
+      - docker-ovh.adm.auro.re  # port 8087 is appended by relabel_configs in prometheus.yml.j2

   roles:
     - prometheus
@@ -151,6 +157,11 @@
   roles:
     - prometheus_federate

+# Postgres Exporters
+- hosts: bdd,radius
+  roles:
+    - prometheus_postgres
+
 # Monitor all hosts
 - hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container

diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index 4dc518b..d141ecc 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -18,8 +18,29 @@
     mode: u=r,g=r,o=
   loop:
     - prometheus.yml
-    - alert.rules.yml
+  notify: Restart Prometheus
+
+- name: Creates directory for alerts
+  file:
+    path: /etc/prometheus/alerts
+    state: directory
+    owner: prometheus
+    group: prometheus
+    mode: "0755"  # quoted so the mode stays a string instead of a YAML integer
+
+- name: Configure Prometheus alerts
+  template:
+    src: "{{ item }}.j2"
+    dest: "/etc/prometheus/alerts/{{ item }}"
+    owner: prometheus
+    group: prometheus
+    mode: u=r,g=r,o=
+  loop:
+    - server.rules.yml
+    - docker.rules.yml
     - django.rules.yml
+    - ups.rules.yml
+    - postgres.rules.yml
   notify: Restart Prometheus

 - name: Make Prometheus snmp-exporter listen on localhost only
@@ -75,6 +96,13 @@
     mode: 0644
   when: prometheus_docker_targets is defined

+- name: Configure Prometheus postgres monitoring
+  copy:
+    content: "{{ prometheus_postgres_targets | to_nice_json }}\n"
+    dest: /etc/prometheus/targets_postgres.json
+    mode: "0644"  # quoted so the mode stays a string instead of a YAML integer
+  when: prometheus_postgres_targets is defined
+
 - name: Activate prometheus service
   systemd:
     name: prometheus
diff --git a/roles/prometheus/templates/docker.rules.yml.j2 b/roles/prometheus/templates/docker.rules.yml.j2
new file mode 100644
index 0000000..d911698
--- /dev/null
+++ b/roles/prometheus/templates/docker.rules.yml.j2
@@ -0,0 +1,50 @@
+---
+{{ ansible_managed | comment }}
+
+{% macro raw(string) -%}
+{{ "{{" }} {{ string }} {{ "}}" }}
+{%- endmacro %}
+
+groups:
+
+  - name: docker.rules
+    rules:
+
+      - alert: ContainerDown
+        expr: docker_container_running_state != 1
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Le container Docker est éteint / tombé
+            (container {{ raw('$labels.name') }})
+
+      - alert: ContainerFailed  # no sum(): aggregation would drop the name label used in the summary
+        expr: increase(docker_container_restart_count[5m]) > 2
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Le container Docker redémarre souvent
+            (container {{ raw('$labels.name') }})
+
+      - alert: ContainerHighCpuUsage  # own name so CPU load is not confused with the restart-loop alert
+        expr:
+          (
+            docker_container_cpu_used_total
+            /
+            docker_container_cpu_capacity_total
+          ) * 100
+          > 30
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Le container Docker utilise beaucoup de CPU
+            (container {{ raw('$labels.name') }},
+            valeur {{ raw('$value | printf "%.1f"') }})
+
+...
diff --git a/roles/prometheus/templates/postgres.rules.yml.j2 b/roles/prometheus/templates/postgres.rules.yml.j2
new file mode 100644
index 0000000..0ec4952
--- /dev/null
+++ b/roles/prometheus/templates/postgres.rules.yml.j2
@@ -0,0 +1,219 @@
+---
+{{ ansible_managed | comment }}
+
+{% macro raw(string) -%}
+{{ "{{" }} {{ string }} {{ "}}" }}
+{%- endmacro %}
+
+groups:
+
+  - name: postgres.rules
+    rules:
+      - alert: PostgresqlDown
+        expr: pg_up == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Serveur PostgreSQL down
+
+      - alert: PostgresqlRestarted
+        expr: time() - pg_postmaster_start_time_seconds < 60
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Serveur PostgreSQL redémarré
+
+      - alert: PostgresqlExporterError
+        expr: pg_exporter_last_scrape_error > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Erreur dans l'exporter PostgreSQL
+
+      - alert: PostgresqlReplicationLag
+        expr:
+          pg_replication_lag > 30
+          and
+          ON(instance) pg_replication_is_replica == 1
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            La réplication PostgreSQL lag ({{ raw('$value') }} > 30s)
+            (base de données {{ raw('$labels.datname') }} )
+
+      - alert: PostgresqlTableNotVacuumed
+        expr:
+          time() - pg_stat_user_tables_last_autovacuum
+          > 60 * 60 * 24
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Le démon autovacuum n'a pas été lancé depuis 24h
+            (base de données {{ raw('$labels.datname') }} )
+
+      - alert: PostgresqlTableNotAnalyzed
+        expr:
+          time() - pg_stat_user_tables_last_autoanalyze
+          > 60 * 60 * 24
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Table non-analysée depuis 24h
+            (base de données {{ raw('$labels.datname') }})
+
+      - alert: PostgresqlTooManyConnections  # matched on instance: the setting carries no datname label
+        expr:
+          (
+            sum by (instance, datname)
+            (pg_stat_activity_count{datname!~"template.*|postgres"})
+            / on (instance) group_left pg_settings_max_connections
+          ) * 100 > 80
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            PostgreSQL a trop de connexions
+            ({{ raw('$value | printf "%.1f"') }} > 80%)
+            (base de données {{ raw('$labels.datname') }})
+
+      - alert: PostgresqlDeadLocks
+        expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            PostgreSQL a plus de 5 deadlocks.
+            (base de données {{ raw('$labels.datname') }} )
+
+      - alert: PostgresqlSlowQueries
+        expr: pg_slow_queries > 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Présence de requêtes lentes (slow-queries)
+            (base de données {{ raw('$labels.datname') }} )
+
+      - alert: PostgresqlHighRollbackRate
+        expr:
+          (
+            rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) /
+            rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
+          ) * 100
+          > 7
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            PostgreSQL a un taux de retour en arrière (rollback) élevé
+            (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} %)
+
+      - alert: PostgresqlWaleReplicationStopped
+        expr: rate(pg_xlog_position_bytes[1m]) == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Réplication de PostgreSQL WALE stoppée
+            (base de données {{ raw('$labels.datname') }} )
+
+      - alert: PostgresqlHighRateStatementTimeout
+        expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Beaucoup de requêtes PostgreSQL sont timeout
+            (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
+
+      - alert: PostgresqlHighRateDeadlock
+        expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            PostgreSQL a un fort taux de deadlock
+            (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
+
+#      - alert: PostgresqlReplicationLagBytes
+#        expr:
+#          (pg_xlog_position_bytes and pg_replication_is_replica == 0)
+#          - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1)
+#          > 1e+09
+#        for: 0m
+#        labels:
+#          severity: critical
+#        annotations:
+#          summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} )
+
+      - alert: PostgresqlTooManyDeadTuples
+        expr:
+          (
+            (pg_stat_user_tables_n_dead_tup > 10000)
+            / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
+          ) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Les tuples morts PostgreSQL sont trop volumineux
+            (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
+
+      - alert: PostgresqlSplitBrain  # NOTE(review): counts primaries over every scraped instance; confirm this suits independent servers
+        expr: count(pg_replication_is_replica == 0) != 1
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Split Brain : trop de bases de données PostgreSQL primaires en mode lecture-écriture
+            (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }} )
+
+      - alert: PostgresqlPromotedNode
+        expr:
+          pg_replication_is_replica
+          and
+          changes(pg_replication_is_replica[1m]) > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Le serveur de secours PostgreSQL a été promu comme nœud principal
+            (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }})
+
+      - alert: PostgresqlTooManyLocksAcquired  # both sides matched per instance, otherwise the ratio is empty
+        expr:
+          (
+            sum by (instance) (pg_locks_count)
+            / on (instance) (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
+          ) * 100 > 20
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Trop de verrous acquis sur la base de données.
+            Si cette alerte se produit fréquemment, nous devrons peut-être augmenter
+            le paramètre postgres max_locks_per_transaction
+            (Valeur = {{ raw('$value | printf "%.1f"') }} )
+
+...
+
diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2
index e97e986..bae1d2b 100644
--- a/roles/prometheus/templates/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@@ -20,8 +20,7 @@ alerting:

 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
-  - "alert.rules.yml" # Monitoring alerts, this is the file you may be searching!
-  - "django.rules.yml" # Custom rules specific for Django project monitoring
+  - "alerts/*.yml"  # Monitoring alerts, one rule file per topic in the alerts/ directory

 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.
@@ -101,4 +100,27 @@ scrape_configs:
     file_sd_configs:
       - files:
         - '/etc/prometheus/targets_docker.json'
+    relabel_configs:
+      # Do not put :8087 in instance name, rather here
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - source_labels: [__param_target]
+        target_label: __address__
+        replacement: '$1:8087'
+
+  - job_name: postgresql
+    file_sd_configs:
+      - files:
+        - '/etc/prometheus/targets_postgres.json'
+    relabel_configs:
+      # Do not put :9187 in instance name, rather here
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - source_labels: [__param_target]
+        target_label: __address__
+        replacement: '$1:9187'
 ...
diff --git a/roles/prometheus/templates/alert.rules.yml.j2 b/roles/prometheus/templates/server.rules.yml.j2
similarity index 67%
rename from roles/prometheus/templates/alert.rules.yml.j2
rename to roles/prometheus/templates/server.rules.yml.j2
index 84d8aa2..ac09881 100644
--- a/roles/prometheus/templates/alert.rules.yml.j2
+++ b/roles/prometheus/templates/server.rules.yml.j2
@@ -7,7 +7,7 @@

 groups:

-  - name: alert.rules
+  - name: server.rules
     rules:

       - alert: InstanceDown
@@ -149,78 +149,5 @@ groups:
           summary: >
             Charge à {{ raw('$value') }}

-      - alert: UpsOutputSourceChanged
-        expr: upsOutputSource != 3
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Source d'alimentation changée
-
-      - alert: UpsBatteryStatus
-        expr: upsBatteryStatus == 3
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            État de la batterie faible
-
-      - alert: UpsBatteryStatus
-        expr: upsBatteryStatus == 4
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            État de la batterie critique
-
-      - alert: UpsHighLoad
-        expr: upsOutputPercentLoad > 70
-        for: 3m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Charge de {{ raw('$value | printf "%.1f"') }}%
-
-      - alert: UpsWrongInputVoltage
-        expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Tension d'entrée de {{ raw('$value') }}V
-
-      - alert: UpsWrongOutputVoltage
-        expr: >-
-          abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
-          < 3 * stddev_over_time(upsOutputVoltage[1d])
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Tension de sortie de {{ raw('$value') }}V
-
-      - alert: UpsTimeRemaining
-        expr: upsEstimatedMinutesRemaining < 8
-        for: 0m
-        labels:
-          severity: warning
-        annotations:
-          summary: >-
-            Autonomie restante de {{ raw('$value') }} min
-
-      - alert: UpsTimeRemaining
-        expr: upsEstimatedMinutesRemaining < 5
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: >-
-            Autonomie restante de {{ raw('$value') }} min

 ...
diff --git a/roles/prometheus/templates/ups.rules.yml.j2 b/roles/prometheus/templates/ups.rules.yml.j2
new file mode 100644
index 0000000..eafdee3
--- /dev/null
+++ b/roles/prometheus/templates/ups.rules.yml.j2
@@ -0,0 +1,87 @@
+---
+{{ ansible_managed | comment }}
+
+{% macro raw(string) -%}
+{{ "{{" }} {{ string }} {{ "}}" }}
+{%- endmacro %}
+
+groups:
+
+  - name: ups.rules
+    rules:
+
+      - alert: UpsOutputSourceChanged
+        expr: upsOutputSource != 3
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Source d'alimentation changée
+
+      - alert: UpsBatteryStatus
+        expr: upsBatteryStatus == 3
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            État de la batterie faible
+
+      - alert: UpsBatteryStatus
+        expr: upsBatteryStatus == 4
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            État de la batterie critique
+
+      - alert: UpsHighLoad
+        expr: upsOutputPercentLoad > 70
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Charge de {{ raw('$value | printf "%.1f"') }}%
+
+      - alert: UpsWrongInputVoltage
+        expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Tension d'entrée de {{ raw('$value') }}V
+
+      - alert: UpsWrongOutputVoltage  # output voltage more than 3 sigma away from its 1-day mean; value is the voltage
+        expr: >-
+          upsOutputVoltage and abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d]))
+          > 3 * stddev_over_time(upsOutputVoltage[1d])
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Tension de sortie de {{ raw('$value') }}V
+
+      - alert: UpsTimeRemaining
+        expr: upsEstimatedMinutesRemaining < 8
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: >-
+            Autonomie restante de {{ raw('$value') }} min
+
+      - alert: UpsTimeRemaining
+        expr: upsEstimatedMinutesRemaining < 5
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Autonomie restante de {{ raw('$value') }} min
+
+...
diff --git a/roles/prometheus_federate/templates/prometheus.yml.j2 b/roles/prometheus_federate/templates/prometheus.yml.j2
index 23e649b..71e6874 100644
--- a/roles/prometheus_federate/templates/prometheus.yml.j2
+++ b/roles/prometheus_federate/templates/prometheus.yml.j2
@@ -31,6 +31,7 @@ scrape_configs:
     params:
       match[]:
         - '{job="servers"}'
+        - '{job="postgresql"}'
         - '{job="prometheus"}'
         - '{job="unifi_snmp"}'
         - '{job="django"}'
diff --git a/roles/prometheus_postgres/handlers/main.yml b/roles/prometheus_postgres/handlers/main.yml
new file mode 100644
index 0000000..05837d1
--- /dev/null
+++ b/roles/prometheus_postgres/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart prometheus-postgres-exporter
+  service:
+    name: prometheus-postgres-exporter
+    state: restarted
diff --git a/roles/prometheus_postgres/tasks/main.yml b/roles/prometheus_postgres/tasks/main.yml
new file mode 100644
index 0000000..734fa3c
--- /dev/null
+++ b/roles/prometheus_postgres/tasks/main.yml
@@ -0,0 +1,40 @@
+---
+- name: Install Prometheus postgres-exporter
+  apt:
+    update_cache: true
+    name: prometheus-postgres-exporter
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Make Prometheus postgres-exporter connect to databases using peercred
+  lineinfile:
+    path: /etc/default/prometheus-postgres-exporter
+    regexp: '^DATA_SOURCE_NAME='
+    line: |-  # "|-" strips the trailing newline so no blank line is written with the value
+      DATA_SOURCE_NAME="user=postgres host=/var/run/postgresql/ sslmode=disable"
+  notify: Restart prometheus-postgres-exporter
+
+- name: Make Prometheus postgres-exporter launched by postgres user
+  lineinfile:
+    path: /lib/systemd/system/prometheus-postgres-exporter.service
+    regexp: '^User='
+    insertafter: '^\[Service\]'  # only used when no User= line exists: keeps the key in the right section
+    line: |-
+      User=postgres
+  notify: Restart prometheus-postgres-exporter
+
+- name: Make Prometheus postgres-exporter listen on adm only
+  lineinfile:
+    path: /etc/default/prometheus-postgres-exporter
+    regexp: '^ARGS='
+    line: |-
+      ARGS="--web.listen-address={{ ansible_hostname }}.adm.auro.re:9187"
+  notify: Restart prometheus-postgres-exporter
+
+- name: Activate prometheus-postgres-exporter service
+  systemd:
+    name: prometheus-postgres-exporter
+    enabled: true
+    daemon_reload: true  # the unit file is edited above, so systemd must re-read it
+    state: started
diff --git a/roles/radius/tasks/main.yml b/roles/radius/tasks/main.yml
index e79d742..bafb166 100644
--- a/roles/radius/tasks/main.yml
+++ b/roles/radius/tasks/main.yml
@@ -129,7 +129,7 @@
     name:
       - postgresql
       - postgresql-client-11=11.7-0+deb10u1
-    force: yes
+    force: true

 - name: Install postgresql ansible module requirement(s)
   pip: