---
{{ ansible_managed | comment }}
# Jinja helper: wraps its argument in literal double curly braces so that the
# rendered file contains a Prometheus (Go template) expression instead of
# having Jinja try to evaluate it at deploy time.
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}

groups:
  - name: postgres.rules
    rules:
      # Exporter reports that it cannot reach the server (pg_up is 0).
      - alert: PostgresqlDown
        expr: pg_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Serveur Postgresql down (instance {{ raw('$labels.instance') }})

      # Postmaster start time is less than 60 seconds in the past.
      - alert: PostgresqlRestarted
        expr: time() - pg_postmaster_start_time_seconds < 60
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Serveur Postgresql redémarré (instance {{ raw('$labels.instance') }})

      # The exporter flagged an error on its last scrape.
      - alert: PostgresqlExporterError
        expr: pg_exporter_last_scrape_error > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Erreur dans l'exporter Postgresql (instance {{ raw('$labels.instance') }})

      # Replication lag above 30, evaluated only on instances that report
      # themselves as replicas.
      - alert: PostgresqlReplicationLag
        expr: pg_replication_lag > 30 and ON(instance) pg_replication_is_replica == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            Réplication Postgresql lag ({{ raw('$value') }} > 30s)
            (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname')}} )

      # Last autovacuum timestamp is more than 24 hours old.
      # The alert name keeps its historical spelling: it is a routing key.
      - alert: PostgresqlTableNotVaccumed
        expr: time() - pg_stat_user_tables_last_autovacuum > 60 * 60 * 24
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            La table n'a pas été aspirée depuis 24h
            (Instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )

      # Last autoanalyze timestamp is more than 24 hours old.
      - alert: PostgresqlTableNotAnalyzed
        expr: time() - pg_stat_user_tables_last_autoanalyze > 60 * 60 * 24
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            Table non-analysée depuis 24h
            (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }})

      # Connections per database exceed 80% of the instance's max_connections.
      # The sum keeps the instance label and the division is matched
      # on (instance) with group_left: the previous form aggregated by datname
      # only, which left no label in common with pg_settings_max_connections,
      # so the comparison never produced a result. The value is now a real
      # percentage, as the summary states.
      - alert: PostgresqlTooManyConnections
        expr: >-
          (
          sum by (instance, datname) (pg_stat_activity_count{datname!~"template.*|postgres"})
          / on (instance) group_left
          pg_settings_max_connections
          ) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: >-
            PostgreSQL a trop de connexions ({{ raw('$value | printf "%.1f"') }} > 80%)
            (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }})

      # More than 5 deadlocks counted over the last minute on a database
      # (template databases and "postgres" excluded).
      - alert: PostgresqlDeadLocks
        expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            PostgreSQL a des cadenas morts
            (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )

      # pg_slow_queries stays above zero for two minutes.
      - alert: PostgresqlSlowQueries
        expr: pg_slow_queries > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: >-
            Présence de requêtes lentes (slow-queries)
            (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )

      # Rollback rate is above 2% of the commit rate (3-minute rates,
      # template databases excluded).
      - alert: PostgresqlHighRollbackRate
        expr: >-
          (
          rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m])
          /
          rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
          ) * 100 > 2
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            Postgresql a un taux de retour en arrière (rollback) élevé
            (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }},
            value {{ raw('$value | printf "%.1f"') }} %)

      # The xlog position did not advance during the last minute.
      - alert: PostgresqlWaleReplicationStopped
        expr: rate(pg_xlog_position_bytes[1m]) == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            Réplication de Postgresql WALE stopée
            (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )

      # statement_timeout errors occur at a rate above 3 per second.
      - alert: PostgresqlHighRateStatementTimeout
        expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            Beaucoup de requêtes Postgresql sont timeout
            (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }},
            value {{ raw('$value | printf "%.1f"') }} )

      # More than one deadlock_detected error over the last minute.
      - alert: PostgresqlHighRateDeadlock
        expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            Postgresql a un fort taux de deadlock
            (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }},
            value {{ raw('$value | printf "%.1f"') }} )

      # Disabled rule, kept as it was for reference.
      # - alert: PostgresqlReplicationLagBytes
      #   expr:
      #     (pg_xlog_position_bytes and pg_replication_is_replica == 0)
      #     - GROUP_RIGHT(instance)
      #     (pg_xlog_position_bytes and pg_replication_is_replica == 1)
      #     > 1e+09
      #   for: 0m
      #   labels:
      #     severity: critical
      #   annotations:
      #     summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} )

      # Tables with more than 10000 dead tuples where dead tuples are at least
      # 10% of all tuples; instances reporting as replicas are excluded.
      - alert: PostgresqlTooManyDeadTuples
        expr: >-
          (
          (pg_stat_user_tables_n_dead_tup > 10000)
          /
          (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
          ) >= 0.1
          unless ON(instance) (pg_replication_is_replica == 1)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: >-
            Les tuples morts PostgreSQL sont trop volumineux
            (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }},
            value {{ raw('$value | printf "%.1f"') }} )

      # The number of series reporting "not a replica" differs from one.
      # NOTE(review): count() without a by-clause drops every label, so the
      # instance and database placeholders in the summary render empty.
      - alert: PostgresqlSplitBrain
        expr: count(pg_replication_is_replica == 0) != 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            Split Brain, trop de bases de données Postgresql primaires en mode lecture-écriture
            (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }},
            value {{ raw('$value') }} )

      # pg_replication_is_replica changed value during the last minute.
      - alert: PostgresqlPromotedNode
        expr: pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: >-
            Le serveur de secours PostgreSQL a été promu comme nœud principal
            (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }},
            value {{ raw('$value') }})

      # Locks held exceed 20% of max_locks_per_transaction * max_connections.
      # The sum is taken per instance and the division matched on (instance):
      # a bare sum() has an empty label set, which can never match the labelled
      # settings series and also left the instance placeholder empty.
      - alert: PostgresqlTooManyLocksAcquired
        expr: >-
          (
          (sum by (instance) (pg_locks_count))
          / on (instance)
          (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
          ) * 100 > 20
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: >-
            Trop de verrous acquis sur la base de données.
            Si cette alerte se produit fréquemment, nous devrons peut-être
            augmenter le paramètre postgres max_locks_per_transaction
            (instance {{ raw('$labels.instance') }}, value = {{ raw('$value | printf "%.1f"') }} )
...