Installation Prometheus-postgres-exporter #53

Merged
pz2891 merged 29 commits from prometheus_postgres_exporter into master 2021-04-14 20:19:59 +02:00
12 changed files with 491 additions and 79 deletions

23
hosts
View file

@ -549,3 +549,26 @@ proxy.adm.auro.re
bdd.adm.auro.re
bdd-ovh.adm.auro.re
re2o-db.adm.auro.re
# RADIUS servers. This group is also used to build the Prometheus
# PostgreSQL scrape targets and to select hosts for the postgres-exporter role.
[radius]
radius-aurore.adm.auro.re
radius-fleming.adm.auro.re
radius-fleming-backup.adm.auro.re
radius-edc.adm.auro.re
radius-edc-backup.adm.auro.re
radius-gs.adm.auro.re
radius-gs-backup.adm.auro.re
radius-pacaterie.adm.auro.re
radius-pacaterie-backup.adm.auro.re
radius-rives.adm.auro.re
radius-rives-backup.adm.auro.re
[prometheus]
prometheus-ovh.adm.auro.re
prometheus-aurore.adm.auro.re
prometheus-rives.adm.auro.re
prometheus-gs.adm.auro.re
prometheus-edc.adm.auro.re
prometheus-pacaterie.adm.auro.re
prometheus-fleming.adm.auro.re
prometheus-federate.adm.auro.re

View file

@ -99,6 +99,9 @@
prometheus_targets:
- targets: |
{{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
# PostgreSQL exporter targets: every host of the bdd and radius groups.
# The concatenation is parenthesised because Jinja filters bind tighter than
# `+`; without the parentheses only groups['radius'] was sorted and the
# resulting list was not globally ordered.
prometheus_postgres_targets:
  - targets: |
      {{ (groups['bdd'] + groups['radius']) | sort }}
prometheus_switch_snmp_targets:
- targets:
- yggdrasil.switch.auro.re
@ -128,8 +131,11 @@
prometheus_targets:
- targets: |
{{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
# PostgreSQL exporter target scraped by this Prometheus instance.
prometheus_postgres_targets:
  - targets: ["bdd-ovh.adm.auro.re"]
prometheus_docker_targets:
- docker-ovh.adm.auro.re:8087
- docker-ovh.adm.auro.re
roles:
- prometheus
@ -151,6 +157,11 @@
roles:
- prometheus_federate
# Postgres exporters: apply the prometheus_postgres role to every host of the
# bdd and radius inventory groups.
- hosts: "bdd,radius"
  roles: [prometheus_postgres]
# Monitor all hosts
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container

View file

@ -18,8 +18,29 @@
mode: u=r,g=r,o=
loop:
- prometheus.yml
- alert.rules.yml
notify: Restart Prometheus
- name: Creates directory for alerts
file:
pz2891 marked this conversation as resolved
Review

Ça pourrait être bien de préciser le propriétaire et le groupe du dossier.

Ça pourrait être bien de préciser le propriétaire et le groupe du dossier.
# Arguments of the `file` module: directory receiving the alert rule files.
# The mode is quoted so it can never be read as a plain integer by the YAML
# parser.
path: /etc/prometheus/alerts
state: directory
owner: prometheus
group: prometheus
mode: "0755"
# Render one rule file per topic from its .j2 template into
# /etc/prometheus/alerts; any change triggers the "Restart Prometheus" handler.
- name: Configure Prometheus alerts
  notify: Restart Prometheus
  loop:
    - server.rules.yml
    - docker.rules.yml
    - django.rules.yml
    - ups.rules.yml
    - postgres.rules.yml
  template:
    dest: "/etc/prometheus/alerts/{{ item }}"
    src: "{{ item }}.j2"
    group: prometheus
    owner: prometheus
    mode: u=r,g=r,o=
- name: Make Prometheus snmp-exporter listen on localhost only
@ -75,6 +96,13 @@
mode: 0644
when: prometheus_docker_targets is defined
# Write the PostgreSQL target list as JSON to the file consumed by the
# `postgresql` scrape job. Skipped when no target list is defined for the host.
# The mode is quoted so it can never be read as a plain integer by the YAML
# parser.
- name: Configure Prometheus postgres monitoring
  copy:
    content: "{{ prometheus_postgres_targets | to_nice_json }}\n"
    dest: /etc/prometheus/targets_postgres.json
    mode: "0644"
  when: prometheus_postgres_targets is defined
- name: Activate prometheus service
systemd:
name: prometheus

View file

@ -0,0 +1,50 @@
---
{{ ansible_managed | comment }}
# Jinja macro `raw`: emits its argument wrapped in literal double curly braces,
# so the expression reaches the rendered rule file instead of being evaluated
# when Ansible renders this template.
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: docker.rules
rules:
# Fires immediately when docker_container_running_state is anything but 1.
- alert: ContainerDown
  expr: docker_container_running_state != 1
  for: 0m
  annotations:
    summary: >-
      Le container Docker est éteint / tombé
      (container {{ raw('$labels.name') }})
  labels:
    severity: critical
# Fires when a container restarted more than twice within the last 5 minutes.
# The sum is grouped by instance and name: an ungrouped sum() drops every
# label, which left $labels.name empty in the summary and added up the
# restarts of all containers into one series.
# NOTE(review): assumes docker_container_restart_count carries a `name` label,
# as the summary already relies on it; confirm against the exporter.
- alert: ContainerFailed
  expr: >-
    sum by (instance, name)
    (increase(docker_container_restart_count[5m])) > 2
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      Le container Docker redémarre souvent
      (container {{ raw('$labels.name') }})
# Fires when a container uses more than 30% of its CPU capacity
# (used / capacity, expressed as a percentage).
# Renamed from ContainerFailed: that name was copy-pasted from the restart
# alert, which made the two alerts indistinguishable by name.
- alert: ContainerHighCpuUsage
  expr: >-
    (
      docker_container_cpu_used_total
      /
      docker_container_cpu_capacity_total
    ) * 100
    > 30
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      Le container Docker utilise beaucoup de CPU
      (container {{ raw('$labels.name') }},
      valeur {{ raw('$value | printf "%.1f"') }})
...

View file

@ -0,0 +1,219 @@
---
{{ ansible_managed | comment }}
# Jinja macro `raw`: emits its argument wrapped in literal double curly braces,
# so the expression reaches the rendered rule file instead of being evaluated
# when Ansible renders this template.
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: postgres.rules
rules:
# Fires immediately when pg_up is 0.
- alert: PostgresqlDown
  expr: pg_up == 0
  for: 0m
  annotations:
    summary: Serveur PostgreSQL down
  labels:
    severity: critical
# Fires when pg_postmaster_start_time_seconds is less than 60 seconds old.
- alert: PostgresqlRestarted
  expr: time() - pg_postmaster_start_time_seconds < 60
  for: 0m
  annotations:
    summary: Serveur PostgreSQL redémarré
  labels:
    severity: critical
# Fires immediately when pg_exporter_last_scrape_error is above 0.
- alert: PostgresqlExporterError
  expr: pg_exporter_last_scrape_error > 0
  for: 0m
  annotations:
    summary: Erreur dans l'exporter PostgreSQL
  labels:
    severity: critical
# Fires when pg_replication_lag exceeds 30 on an instance that reports
# pg_replication_is_replica == 1.
- alert: PostgresqlReplicationLag
  expr: pg_replication_lag > 30 and ON(instance) pg_replication_is_replica == 1
  for: 0m
  annotations:
    summary: >-
      La réplication PostgreSQL lag ({{ raw('$value') }} > 30s)
      (base de données {{ raw('$labels.datname') }} )
  labels:
    severity: critical
# Fires when pg_stat_user_tables_last_autovacuum is more than 24 hours
# (60 * 60 * 24 seconds) in the past.
- alert: PostgresqlTableNotVaccumed
  expr: time() - pg_stat_user_tables_last_autovacuum > 60 * 60 * 24
  for: 0m
  annotations:
    summary: >-
      Le démon autovacuum n'a pas été lancé depuis 24h
      (base de données {{ raw('$labels.datname') }} )
  labels:
    severity: warning
# Fires when pg_stat_user_tables_last_autoanalyze is more than 24 hours
# (60 * 60 * 24 seconds) in the past.
- alert: PostgresqlTableNotAnalyzed
  expr: time() - pg_stat_user_tables_last_autoanalyze > 60 * 60 * 24
  for: 0m
  annotations:
    summary: >-
      Table non-analysée depuis 24h
      (base de données {{ raw('$labels.datname') }})
  labels:
    severity: warning
# Fires when, for 2 minutes, the connections of one database exceed 80% of the
# server's max_connections.
# The previous expression compared a vector carrying only `datname` with
# pg_settings_max_connections, which does not carry that label: the two sides
# never matched, so the alert could not fire. The sum now keeps `instance`,
# and the division is matched on `instance` with group_left (several
# databases per server), so $value is a real percentage as the summary states.
# NOTE(review): assumes one pg_settings_max_connections series per instance
# (one exporter per PostgreSQL server) - confirm.
- alert: PostgresqlTooManyConnections
  expr: >-
    (
      sum by (instance, datname)
        (pg_stat_activity_count{datname!~"template.*|postgres"})
      / on (instance) group_left
      pg_settings_max_connections
    ) * 100
    > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: >-
      PostgreSQL a trop de connexions
      ({{ raw('$value | printf "%.1f"') }} > 80%)
      (base de données {{ raw('$labels.datname') }})
# Fires when pg_stat_database_deadlocks grew by more than 5 within one minute;
# databases named template* or postgres are excluded.
- alert: PostgresqlDeadLocks
  expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
  for: 0m
  annotations:
    summary: >-
      PostgreSQL a plus de 5 deadlocks.
      (base de données {{ raw('$labels.datname') }} )
  labels:
    severity: warning
# Fires when pg_slow_queries stays above 0 for 2 minutes.
- alert: PostgresqlSlowQueries
  expr: pg_slow_queries > 0
  for: 2m
  annotations:
    summary: >-
      Présence de requêtes lentes (slow-queries)
      (base de données {{ raw('$labels.datname') }} )
  labels:
    severity: warning
# Fires when the 3-minute rollback rate exceeds 7% of the 3-minute commit
# rate; databases named template* are excluded.
- alert: PostgresqlHighRollbackRate
  expr:
    (
    rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) /
    rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
    ) * 100
    > 7
  for: 0m
  annotations:
    summary: >-
      PostgreSQL a un taux de retour en arrière (rollback) élevé
      (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} %)
  labels:
    severity: warning
# Fires when pg_xlog_position_bytes did not change over the last minute
# (its 1-minute rate is exactly 0).
- alert: PostgresqlWaleReplicationStopped
  expr: rate(pg_xlog_position_bytes[1m]) == 0
  for: 0m
  annotations:
    summary: >-
      Réplication de PostgreSQL WALE stoppée
      (base de données {{ raw('$labels.datname') }} )
  labels:
    severity: critical
# Fires when the 1-minute rate of postgresql_errors_total with
# type="statement_timeout" exceeds 3.
- alert: PostgresqlHighRateStatementTimeout
  expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
  for: 0m
  annotations:
    summary: >-
      Beaucoup de requêtes PostgreSQL sont timeout
      (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
  labels:
    severity: critical
# Fires when postgresql_errors_total with type="deadlock_detected" grew by
# more than 1 within one minute.
- alert: PostgresqlHighRateDeadlock
  expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
  for: 0m
  annotations:
    summary: >-
      PostgreSQL a un fort taux de deadlock
      (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
  labels:
    severity: critical
# - alert: PostgresqlReplicationLagBytes
# expr:
# (pg_xlog_position_bytes and pg_replication_is_replica == 0)
# - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1)
# > 1e+09
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} )
pz2891 marked this conversation as resolved
Review

Idem (instance superflue).
Mettre "database" et "value" en français ?

Idem (instance superflue). Mettre "database" et "value" en français ?
# Fires when, for 2 minutes, a table has more than 10000 dead tuples and they
# make up at least 10% of its live + dead tuples. Instances reporting
# pg_replication_is_replica == 1 are excluded.
- alert: PostgresqlTooManyDeadTuples
  expr:
    (
    (pg_stat_user_tables_n_dead_tup > 10000)
    / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
    ) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
  for: 2m
  annotations:
    summary: >-
      Les tuples morts PostgreSQL sont trop volumineux
      (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
  labels:
    severity: warning
# Fires when the number of series with pg_replication_is_replica == 0 is not
# exactly 1.
# NOTE(review): the count is taken over every scraped server, so several
# independent primaries would keep this alert firing - confirm this matches
# the deployment.
# NOTE(review): count() keeps no labels, so $labels.datname presumably renders
# empty in the summary - confirm.
- alert: PostgresqlSplitBrain
  expr: count(pg_replication_is_replica == 0) != 1
  for: 0m
  annotations:
    summary: >-
      Split Brain : trop de bases de données PostgreSQL primaires en mode lecture-écriture
      (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }} )
  labels:
    severity: critical
# Fires when pg_replication_is_replica changed value at least once during the
# last minute.
- alert: PostgresqlPromotedNode
  expr: pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0
  for: 0m
  annotations:
    summary: >-
      Le serveur de secours PostgreSQL a été promu comme nœud principal
      (base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }})
  labels:
    severity: warning
# Fires when, for 2 minutes, the locks held on a server exceed 20% of
# max_locks_per_transaction * max_connections.
# The previous expression divided an ungrouped sum() (no labels left) by a
# labelled vector: the operands never matched, so the alert could not fire.
# Locks are now summed per instance and the division is matched on `instance`.
# The summary also said "deadlocks" although the rule measures acquired locks.
# NOTE(review): assumes one pg_settings_* series per instance - confirm.
- alert: PostgresqlTooManyLocksAcquired
  expr: >-
    (
      sum by (instance) (pg_locks_count)
      / on (instance)
      (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
    ) * 100 > 20
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: >-
      Trop de verrous acquis sur la base de données.
      Si cette alerte se produit fréquemment, nous devrons peut-être augmenter
      le paramètre postgres max_locks_per_transaction
      (Valeur = {{ raw('$value | printf "%.1f"') }} )
...

View file

@ -20,8 +20,7 @@ alerting:
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "alert.rules.yml" # Monitoring alerts, this is the file you may be searching!
- "django.rules.yml" # Custom rules specific for Django project monitoring
- "alerts/*.yml" # Monitoring alerts, this is the file you may be searching!
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
@ -101,4 +100,27 @@ scrape_configs:
file_sd_configs:
- files:
- '/etc/prometheus/targets_docker.json'
relabel_configs:
# Do not put :8087 in instance name, rather here
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:8087'
# Scrape job for the PostgreSQL exporters. Targets come from the file_sd JSON
# file below; the relabel rules run in order.
- job_name: postgresql
file_sd_configs:
- files:
- '/etc/prometheus/targets_postgres.json'
relabel_configs:
# Do not put :9187 in instance name, rather here
# 1. Copy the discovered address into __param_target.
# NOTE(review): targets are presumably bare host names without a port - confirm.
- source_labels: [__address__]
target_label: __param_target
# 2. Reuse that value as the `instance` label.
- source_labels: [__param_target]
target_label: instance
# 3. Rewrite the scrape address to that value followed by port 9187.
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:9187'
...

View file

@ -7,7 +7,7 @@
groups:
- name: alert.rules
- name: server.rules
rules:
- alert: InstanceDown
@ -149,78 +149,5 @@ groups:
summary: >
Charge à {{ raw('$value') }}
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 0m
labels:
severity: critical
annotations:
summary: >-
Source d'alimentation changée
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 3
for: 0m
labels:
severity: warning
annotations:
summary: >-
État de la batterie faible
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 4
for: 0m
labels:
severity: critical
annotations:
summary: >-
État de la batterie critique
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 3m
labels:
severity: critical
annotations:
summary: >-
Charge de {{ raw('$value | printf "%.1f"') }}%
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension d'entrée de {{ raw('$value') }}V
- alert: UpsWrongOutputVoltage
expr: >-
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
< 3 * stddev_over_time(upsOutputVoltage[1d])
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension de sortie de {{ raw('$value') }}V
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 8
for: 0m
labels:
severity: warning
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 5
for: 0m
labels:
severity: critical
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
...

View file

@ -0,0 +1,87 @@
---
{{ ansible_managed | comment }}
# Jinja macro `raw`: emits its argument wrapped in literal double curly braces,
# so the expression reaches the rendered rule file instead of being evaluated
# when Ansible renders this template.
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: ups.rules
rules:
# Fires immediately when upsOutputSource is anything but 3.
- alert: UpsOutputSourceChanged
  expr: upsOutputSource != 3
  for: 0m
  annotations:
    summary: >-
      Source d'alimentation changée
  labels:
    severity: critical
# Warning level: fires immediately when upsBatteryStatus equals 3.
- alert: UpsBatteryStatus
  expr: upsBatteryStatus == 3
  for: 0m
  annotations:
    summary: >-
      État de la batterie faible
  labels:
    severity: warning
# Critical level: fires immediately when upsBatteryStatus equals 4.
- alert: UpsBatteryStatus
  expr: upsBatteryStatus == 4
  for: 0m
  annotations:
    summary: >-
      État de la batterie critique
  labels:
    severity: critical
# Fires when upsOutputPercentLoad stays above 70 for 3 minutes.
- alert: UpsHighLoad
  expr: upsOutputPercentLoad > 70
  for: 3m
  annotations:
    summary: >-
      Charge de {{ raw('$value | printf "%.1f"') }}%
  labels:
    severity: critical
# Fires when upsInputVoltage stays below 210 or above 250 for 5 minutes.
- alert: UpsWrongInputVoltage
  expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
  for: 5m
  annotations:
    summary: >-
      Tension d'entrée de {{ raw('$value') }}V
  labels:
    severity: warning
# Fires when, for 5 minutes, the output voltage deviates from its 1-day
# average by more than 3 standard deviations.
# Fixes relative to the previous expression:
#   - it compared upsInputVoltage with the output statistics; the output
#     voltage itself is now used on both sides;
#   - it used `<`, which is true while the voltage is within its normal band,
#     i.e. the opposite of an anomaly;
#   - the `upsOutputVoltage and (...)` form keeps the measured voltage as the
#     alert value, which is what the summary prints.
- alert: UpsWrongOutputVoltage
  expr: >-
    upsOutputVoltage
    and
    (
      abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d]))
      > 3 * stddev_over_time(upsOutputVoltage[1d])
    )
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      Tension de sortie de {{ raw('$value') }}V
# Warning level: fires immediately when upsEstimatedMinutesRemaining drops
# below 8.
- alert: UpsTimeRemaining
  expr: upsEstimatedMinutesRemaining < 8
  for: 0m
  annotations:
    summary: >-
      Autonomie restante de {{ raw('$value') }} min
  labels:
    severity: warning
# Critical level: fires immediately when upsEstimatedMinutesRemaining drops
# below 5.
- alert: UpsTimeRemaining
  expr: upsEstimatedMinutesRemaining < 5
  for: 0m
  annotations:
    summary: >-
      Autonomie restante de {{ raw('$value') }} min
  labels:
    severity: critical
...

View file

@ -31,6 +31,7 @@ scrape_configs:
params:
match[]:
- '{job="servers"}'
- '{job="postgresql"}'
- '{job="prometheus"}'
- '{job="unifi_snmp"}'
- '{job="django"}'

View file

@ -0,0 +1,5 @@
---
# Handler: restarts the exporter service when a task notifies it.
- name: Restart prometheus-postgres-exporter
  service:
    state: restarted
    name: prometheus-postgres-exporter

View file

@ -0,0 +1,39 @@
---
# Install the exporter package after refreshing the apt cache; the task is
# retried up to 3 times until it succeeds.
- name: Install Prometheus postgres-exporter
  apt:
    name: prometheus-postgres-exporter
    update_cache: true
  register: apt_result
  until: apt_result is succeeded
  retries: 3
# Set DATA_SOURCE_NAME in the exporter defaults file so it connects as user
# postgres through the socket directory /var/run/postgresql/.
- name: Make Prometheus postgres-exporter connect to databases using peercred
  notify: Restart prometheus-postgres-exporter
  lineinfile:
    path: /etc/default/prometheus-postgres-exporter
    regexp: "^DATA_SOURCE_NAME="
    line: |
      DATA_SOURCE_NAME="user=postgres host=/var/run/postgresql/ sslmode=disable"
# Run the exporter as the postgres user.
# The override lives in a systemd drop-in under /etc instead of editing the
# unit shipped in /lib/systemd/system: that file belongs to the package and
# is replaced on upgrade, which would silently revert the user.
# systemd must reload its configuration before the change takes effect.
- name: Create systemd drop-in directory for Prometheus postgres-exporter
  file:
    path: /etc/systemd/system/prometheus-postgres-exporter.service.d
    state: directory
    owner: root
    group: root
    mode: "0755"

- name: Make Prometheus postgres-exporter launched by postgres user
  copy:
    content: |
      [Service]
      User=postgres
    dest: /etc/systemd/system/prometheus-postgres-exporter.service.d/user.conf
    owner: root
    group: root
    mode: "0644"
  notify: Restart prometheus-postgres-exporter
# Set ARGS in the exporter defaults file so it listens on
# <ansible_hostname>.adm.auro.re, port 9187.
- name: Make Prometheus postgres-exporter listen on adm only
  notify: Restart prometheus-postgres-exporter
  lineinfile:
    path: /etc/default/prometheus-postgres-exporter
    regexp: "^ARGS="
    line: |
      ARGS="--web.listen-address={{ ansible_hostname }}.adm.auro.re:9187"
# Reload systemd unit definitions, then enable the exporter at boot and make
# sure it is running.
- name: Activate prometheus-postgres-exporter service
  systemd:
    name: prometheus-postgres-exporter
    daemon_reload: true
    enabled: true
    state: started

View file

@ -129,7 +129,7 @@
name:
- postgresql
- postgresql-client-11=11.7-0+deb10u1
force: yes
force: true
- name: Install postgresql ansible module requirement(s)
pip: