2021-04-11 15:58:35 +02:00
|
|
|
---
|
|
|
|
{{ ansible_managed | comment }}
|
|
|
|
|
|
|
|
{# raw(string): emit `string` wrapped in literal double curly braces, separated
   by single spaces, so the rendered rule file keeps a Prometheus template
   expression instead of having Jinja evaluate it at render time. #}
{% macro raw(string) -%}
{{ '{{ ' ~ string ~ ' }}' }}
{%- endmacro %}
|
|
|
|
|
|
|
|
groups:
|
|
|
|
|
|
|
|
- name: postgres.rules
|
|
|
|
rules:
|
|
|
|
- alert: PostgresqlDown
  # Fires immediately (for: 0m) for every series where pg_up equals 0.
  expr: "pg_up == 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      Serveur Postgresql down
      (instance {{ raw('$labels.instance') }})
|
|
|
|
|
|
|
|
- alert: PostgresqlRestarted
  # Fires while the current time minus pg_postmaster_start_time_seconds is
  # below 60, i.e. the reported start time is less than a minute old.
  expr: "time() - pg_postmaster_start_time_seconds < 60"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      Serveur Postgresql redémarré
      (instance {{ raw('$labels.instance') }})
|
|
|
|
|
|
|
|
- alert: PostgresqlExporterError
  # Fires immediately when pg_exporter_last_scrape_error is greater than 0.
  expr: "pg_exporter_last_scrape_error > 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      Erreur dans l'exporter Postgresql
      (instance {{ raw('$labels.instance') }})
|
|
|
|
|
|
|
|
- alert: PostgresqlReplicationLag
  # Lag above 30, restricted to instances that also report
  # pg_replication_is_replica == 1 (series are paired on the instance label).
  expr: "pg_replication_lag > 30 and ON(instance) pg_replication_is_replica == 1"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      Réplication Postgresql lag
      ({{ raw('$value') }} > 30s)
      (instance {{ raw('$labels.instance') }},
      database {{ raw('$labels.datname') }} )
|
|
|
|
|
|
|
|
- alert: PostgresqlTableNotVaccumed
  # NOTE: the alert name keeps its historical spelling so that existing
  # Alertmanager routes and silences matching on it continue to work.
  #
  # Fires when the last autovacuum timestamp of a table is more than 24h old.
  # The second operand drops series whose timestamp is 0: without that guard
  # a table that has never been autovacuumed yields time() - 0, which is
  # always above the threshold, so the alert would fire permanently.
  # The age comparison is the left operand of "and", so the alert value is
  # the age in seconds.
  expr: >-
    (time() - pg_stat_user_tables_last_autovacuum > 60 * 60 * 24)
    and
    (pg_stat_user_tables_last_autovacuum > 0)
  for: 0m
  labels:
    severity: warning
  annotations:
    # The rule is evaluated per table, so the message names it.
    # NOTE(review): assumes the metric carries a relname label - confirm
    # against the exporter in use.
    summary: >-
      La table {{ raw('$labels.relname') }} n'a pas été aspirée depuis 24h
      (Instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )
|
|
|
|
|
|
|
|
- alert: PostgresqlTableNotAnalyzed
  # Fires when the last autoanalyze timestamp of a table is more than 24h old.
  # The second operand drops series whose timestamp is 0: without that guard
  # a table that has never been auto-analyzed yields time() - 0, which is
  # always above the threshold, so the alert would fire permanently.
  # The age comparison is the left operand of "and", so the alert value is
  # the age in seconds.
  expr: >-
    (time() - pg_stat_user_tables_last_autoanalyze > 60 * 60 * 24)
    and
    (pg_stat_user_tables_last_autoanalyze > 0)
  for: 0m
  labels:
    severity: warning
  annotations:
    # The rule is evaluated per table, so the message names it.
    # NOTE(review): assumes the metric carries a relname label - confirm
    # against the exporter in use.
    summary: >-
      Table {{ raw('$labels.relname') }} non-analysée depuis 24h
      (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }})
|
|
|
|
|
|
|
|
- alert: PostgresqlTooManyConnections
  # Connections per database, as a percentage of the instance's
  # max_connections setting, sustained above 80 for 2 minutes.
  #
  # The aggregation keeps the instance label and the division is matched
  # explicitly on it. Aggregating by datname alone leaves no label in common
  # with pg_settings_max_connections, so the comparison would never match any
  # series and the alert could not fire; the instance label would also be
  # missing from the summary.
  # Dividing before comparing makes the alert value an actual percentage,
  # which is what the summary prints.
  # NOTE(review): group_left() requires exactly one
  # pg_settings_max_connections series per instance - confirm for your setup.
  expr: >-
    (
    sum by (instance, datname)
    (pg_stat_activity_count{datname!~"template.*|postgres"})
    / on(instance) group_left()
    pg_settings_max_connections
    ) * 100
    > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: >-
      PostgreSQL a trop de connexions
      ({{ raw('$value | printf "%.1f"') }} > 80%)
      (instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }})
|
|
|
|
|
|
|
|
- alert: PostgresqlDeadLocks
  # Fires when the deadlock counter grew by more than 5 over the last minute,
  # ignoring databases named "postgres" or starting with "template".
  expr: >-
    increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m])
    > 5
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: >-
      PostgreSQL a des cadenas morts
      (instance {{ raw('$labels.instance') }},
      database {{ raw('$labels.datname') }} )
|
|
|
|
|
|
|
|
- alert: PostgresqlSlowQueries
  # Fires when pg_slow_queries stays above 0 for 2 minutes.
  expr: "pg_slow_queries > 0"
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: >-
      Présence de requêtes lentes (slow-queries)
      (instance {{ raw('$labels.instance') }},
      database {{ raw('$labels.datname') }} )
|
|
|
|
|
|
|
|
- alert: PostgresqlHighRollbackRate
  # Rollback rate divided by commit rate (both over 3 minutes), multiplied by
  # 100; fires when the result exceeds 7. Databases whose name starts with
  # "template" are excluded.
  expr: >-
    (
    rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m])
    / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
    ) * 100 > 7
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: >-
      Postgresql a un taux de retour en arrière (rollback) élevé
      (instance {{ raw('$labels.instance') }},
      database {{ raw('$labels.datname') }},
      value {{ raw('$value | printf "%.1f"') }} %)
|
|
|
|
|
|
|
|
- alert: PostgresqlWaleReplicationStopped
  # Fires when pg_xlog_position_bytes shows a zero per-second rate over the
  # last minute, i.e. the reported position did not advance.
  expr: "rate(pg_xlog_position_bytes[1m]) == 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      Réplication de Postgresql WALE stopée
      (instance {{ raw('$labels.instance') }},
      database {{ raw('$labels.datname') }} )
|
|
|
|
|
|
|
|
- alert: PostgresqlHighRateStatementTimeout
  # Fires when errors of type "statement_timeout" occur at more than 3 per
  # second, averaged over the last minute.
  expr: >-
    rate(postgresql_errors_total{type="statement_timeout"}[1m])
    > 3
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      Beaucoup de requêtes Postgresql sont timeout
      (instance {{ raw('$labels.instance') }},
      database {{ raw('$labels.datname') }},
      value {{ raw('$value | printf "%.1f"') }} )
|
|
|
|
|
|
|
|
- alert: PostgresqlHighRateDeadlock
  # Fires when the count of "deadlock_detected" errors grew by more than 1
  # during the last minute.
  expr: >-
    increase(postgresql_errors_total{type="deadlock_detected"}[1m])
    > 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      Postgresql a un fort taux de deadlock
      (instance {{ raw('$labels.instance') }},
      database {{ raw('$labels.datname') }},
      value {{ raw('$value | printf "%.1f"') }} )
|
|
|
|
|
|
|
|
# - alert: PostgresqlReplicationLagBytes
|
|
|
|
# expr:
|
|
|
|
# (pg_xlog_position_bytes and pg_replication_is_replica == 0)
|
|
|
|
# - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1)
|
|
|
|
# > 1e+09
|
|
|
|
# for: 0m
|
|
|
|
# labels:
|
|
|
|
# severity: critical
|
|
|
|
# annotations:
|
|
|
|
# summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} )
|
|
|
|
|
|
|
|
- alert: PostgresqlTooManyDeadTuples
  # For tables with more than 10000 dead tuples, computes dead / (live + dead)
  # and fires when that ratio is at least 0.1 for 2 minutes. Instances that
  # report pg_replication_is_replica == 1 are excluded.
  expr: >-
    (
    (pg_stat_user_tables_n_dead_tup > 10000)
    /
    (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
    ) >= 0.1
    unless ON(instance) (pg_replication_is_replica == 1)
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: >-
      Les tuples morts PostgreSQL sont trop volumineux
      (instance {{ raw('$labels.instance') }},
      database {{ raw('$labels.datname') }},
      value {{ raw('$value | printf "%.1f"') }} )
|
|
|
|
|
|
|
|
- alert: PostgresqlSplitBrain
  # Counts the series reporting pg_replication_is_replica == 0 and fires when
  # that count differs from 1.
  # count() without a "by" clause returns a single series with no labels, so
  # the alert carries neither an instance nor a datname label; the summary
  # therefore reports only the count instead of printing empty placeholders.
  expr: "count(pg_replication_is_replica == 0) != 1"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: >-
      Split Brain, trop de bases de données Postgresql primaires en mode lecture-écriture
      (value {{ raw('$value') }} )
|
|
|
|
|
|
|
|
- alert: PostgresqlPromotedNode
  # Selects pg_replication_is_replica series whose value changed at least once
  # during the last minute (the comparison binds tighter than "and").
  expr: "pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0"
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: >-
      Le serveur de secours PostgreSQL a été promu comme nœud principal
      (instance {{ raw('$labels.instance') }},
      database {{ raw('$labels.datname') }},
      value {{ raw('$value') }})
|
|
|
|
|
|
|
|
- alert: PostgresqlTooManyLocksAcquired
  # Locks held per instance as a percentage of
  # max_locks_per_transaction * max_connections, sustained above 20 for
  # 2 minutes.
  #
  # The sum keeps the instance label and both binary operations are matched
  # explicitly on it. A bare sum() yields a single series without labels,
  # which has nothing in common with the per-instance settings metrics: the
  # division would match no series, the alert could never fire, and the
  # instance label used by the summary would be absent.
  # NOTE(review): on(instance) requires exactly one series of each settings
  # metric per instance - confirm for your setup.
  expr: >-
    (
    sum by (instance) (pg_locks_count)
    / on(instance)
    (pg_settings_max_locks_per_transaction * on(instance) pg_settings_max_connections)
    ) * 100 > 20
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: >-
      Trop de verrous acquis sur la base de données.
      Si cette alerte se produit fréquemment, nous devrons peut-être augmenter
      le paramètre postgres max_locks_per_transaction
      (instance {{ raw('$labels.instance') }}, value = {{ raw('$value | printf "%.1f"') }} )
|
|
|
|
|
|
|
|
...
|
|
|
|
|