---
{{ ansible_managed | comment }}
# PostgreSQL alerting rules for Prometheus, rendered through Jinja2 by Ansible.
#
# The raw() macro below wraps its argument in literal double curly braces, so
# that Prometheus template expressions ($value, $labels.xxx) reach the rendered
# rule file untouched instead of being evaluated by Jinja2.
#
# Annotation texts are user-facing and intentionally kept in French.
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}

groups:
  - name: postgres.rules
    rules:
      # Fires as soon as pg_up is 0.
      - alert: PostgresqlDown
        expr: pg_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Serveur PostgreSQL down

      # Fires while the postmaster start time is less than 60 seconds old.
      - alert: PostgresqlRestarted
        expr: time() - pg_postmaster_start_time_seconds < 60
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Serveur PostgreSQL redémarré

      # Fires when the exporter flags an error on its last scrape.
      - alert: PostgresqlExporterError
        expr: pg_exporter_last_scrape_error > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Erreur dans l'exporter PostgreSQL

      # Fires when the replication lag exceeds 30 on an instance that reports
      # itself as a replica (series are joined on the instance label).
      - alert: PostgresqlReplicationLag
        expr: >-
          pg_replication_lag > 30
          and ON(instance) pg_replication_is_replica == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            La réplication PostgreSQL lag ({{ raw('$value') }} > 30s)
            (base de données {{ raw('$labels.datname')}} )

      # Fires when the last autovacuum timestamp of a table is older than 24h.
      # The alert name keeps its historical spelling because Alertmanager
      # routes or silences may match on it.
      - alert: PostgresqlTableNotVaccumed
        expr: time() - pg_stat_user_tables_last_autovacuum > 60 * 60 * 24
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            Le démon autovacuum n'a pas été lancé depuis 24h
            (base de données {{ raw('$labels.datname') }} )

      # Fires when the last autoanalyze timestamp of a table is older than 24h.
      - alert: PostgresqlTableNotAnalyzed
        expr: time() - pg_stat_user_tables_last_autoanalyze > 60 * 60 * 24
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            Table non-analysée depuis 24h
            (base de données {{ raw('$labels.datname') }})

      # Fires when one database holds more than 80% of its server's
      # max_connections for 2 minutes (template databases and "postgres" are
      # ignored).
      #
      # The connection count keeps the instance label and is joined to the
      # setting with on (instance) group_left (): without an explicit matching
      # clause the two sides share no common label set and the comparison
      # would always be empty. max by (instance) guarantees a single
      # right-hand series per instance. The resulting value is a true
      # percentage, which is what the summary prints.
      - alert: PostgresqlTooManyConnections
        expr: >-
          (
          sum by (instance, datname) (pg_stat_activity_count{datname!~"template.*|postgres"})
          / on (instance) group_left ()
          max by (instance) (pg_settings_max_connections)
          ) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: >-
            PostgreSQL a trop de connexions
            ({{ raw('$value | printf "%.1f"') }} > 80%)
            (base de données {{ raw('$labels.datname') }})

      # Fires when the deadlock counter of a database grew by more than 5
      # within one minute (template databases and "postgres" are ignored).
      - alert: PostgresqlDeadLocks
        expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            PostgreSQL a plus de 5 deadlocks.
            (base de données {{ raw('$labels.datname') }} )

      # Fires when pg_slow_queries stays above 0 for 2 minutes.
      - alert: PostgresqlSlowQueries
        expr: pg_slow_queries > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: >-
            Présence de requêtes lentes (slow-queries)
            (base de données {{ raw('$labels.datname') }} )

      # Fires when rollbacks exceed 7% of commits, both measured as 3-minute
      # rates (template databases are ignored).
      - alert: PostgresqlHighRollbackRate
        expr: >-
          (
          rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m])
          / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
          ) * 100 > 7
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            PostgreSQL a un taux de retour en arrière (rollback) élevé
            (base de données {{ raw('$labels.datname') }},
            valeur {{ raw('$value | printf "%.1f"') }} %)

      # Fires when the xlog position did not advance over the last minute.
      - alert: PostgresqlWaleReplicationStopped
        expr: rate(pg_xlog_position_bytes[1m]) == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            Réplication de PostgreSQL WALE stoppée
            (base de données {{ raw('$labels.datname') }} )

      # Fires when statement_timeout errors exceed 3 per second (1-minute rate).
      - alert: PostgresqlHighRateStatementTimeout
        expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            Beaucoup de requêtes PostgreSQL sont timeout
            (base de données {{ raw('$labels.datname') }},
            valeur {{ raw('$value | printf "%.1f"') }} )

      # Fires when more than one deadlock_detected error was counted within
      # one minute.
      - alert: PostgresqlHighRateDeadlock
        expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            PostgreSQL a un fort taux de deadlock
            (base de données {{ raw('$labels.datname') }},
            valeur {{ raw('$value | printf "%.1f"') }} )

      # Disabled rule, kept for reference.
      # - alert: PostgresqlReplicationLagBytes
      #   expr:
      #     (pg_xlog_position_bytes and pg_replication_is_replica == 0)
      #     - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1)
      #     > 1e+09
      #   for: 0m
      #   labels:
      #     severity: critical
      #   annotations:
      #     summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} )

      # Fires when a table with more than 10000 dead tuples has at least 10%
      # of dead tuples for 2 minutes; instances reporting as replicas are
      # excluded.
      - alert: PostgresqlTooManyDeadTuples
        expr: >-
          (
          (pg_stat_user_tables_n_dead_tup > 10000)
          / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
          ) >= 0.1
          unless ON(instance) (pg_replication_is_replica == 1)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: >-
            Les tuples morts PostgreSQL sont trop volumineux
            (base de données {{ raw('$labels.datname') }},
            valeur {{ raw('$value | printf "%.1f"') }} )

      # Fires when the number of series reporting "not a replica" differs
      # from 1.
      # NOTE(review): count() drops every label, so the datname placeholder in
      # the summary renders empty; count() over an empty selection also yields
      # no sample, so "zero primaries" does not fire. Confirm this is intended.
      - alert: PostgresqlSplitBrain
        expr: count(pg_replication_is_replica == 0) != 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            Split Brain : trop de bases de données PostgreSQL primaires
            en mode lecture-écriture
            (base de données {{ raw('$labels.datname') }},
            valeur {{ raw('$value') }} )

      # Fires when the replica flag of an instance changed during the last
      # minute.
      - alert: PostgresqlPromotedNode
        expr: pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            Le serveur de secours PostgreSQL a été promu comme nœud principal
            (base de données {{ raw('$labels.datname') }},
            valeur {{ raw('$value') }})

      # Fires when the locks held on an instance exceed 20% of its lock
      # capacity (max_locks_per_transaction * max_connections) for 2 minutes.
      #
      # Every operand is aggregated by instance so that all sides carry the
      # same label set; a bare sum() has no labels and would never match the
      # labelled settings series, leaving the rule permanently silent.
      - alert: PostgresqlTooManyLocksAcquired
        expr: >-
          (
          sum by (instance) (pg_locks_count)
          / (
          max by (instance) (pg_settings_max_locks_per_transaction)
          * max by (instance) (pg_settings_max_connections)
          )
          ) * 100 > 20
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: >-
            Trop de verrous acquis sur la base de données.
            Si cette alerte se produit fréquemment, nous devrons peut-être
            augmenter le paramètre postgres max_locks_per_transaction
            (Valeur = {{ raw('$value | printf "%.1f"') }} )
...