Installation Prometheus-postgres-exporter #53

Merged
pz2891 merged 29 commits from prometheus_postgres_exporter into master 2021-04-14 20:19:59 +02:00
6 changed files with 355 additions and 77 deletions
Showing only changes of commit 9ebdf15bb9 - Show all commits

View file

@ -18,8 +18,33 @@
mode: u=r,g=r,o=
loop:
- prometheus.yml
- alert.rules.yml
notify: Restart Prometheus
- name: Creates directory for alerts
file:
pz2891 marked this conversation as resolved
Review

Ça pourrait être bien de préciser le propriétaire et le groupe du dossier.

Ça pourrait être bien de préciser le propriétaire et le groupe du dossier.
path: /etc/prometheus/alerts
state: directory
- name: Remove old files
file:
path: "/etc/prometheus/{{ item }}"
state: absent
with_items:
- alerts.rules.yml
- django.rules.yml
- name: Configure Prometheus alerts
template:
src: "{{ item }}.j2"
dest: "/etc/prometheus/alerts/{{ item }}"
owner: prometheus
group: prometheus
mode: u=r,g=r,o=
loop:
- server.rules.yml
- django.rules.yml
- ups.rules.yml
- postgres.rules.yml
notify: Restart Prometheus
- name: Make Prometheus snmp-exporter listen on localhost only

View file

@ -0,0 +1,219 @@
---
{{ ansible_managed | comment }}
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: postgres.rules
rules:
- alert: PostgresqlDown
expr: pg_up == 0
for: 0m
labels:
severity: critical
annotations:
summary: Serveur Postgresql down (instance {{ raw('$labels.instance') }})
- alert: PostgresqlRestarted
expr: time() - pg_postmaster_start_time_seconds < 60
for: 0m
labels:
severity: critical
annotations:
summary: Serveur Postgresql redémarré (instance {{ raw('$labels.instance') }})
- alert: PostgresqlExporterError
expr: pg_exporter_last_scrape_error > 0
for: 0m
labels:
severity: critical
annotations:
summary: Erreur dans l'exporter Postgresql (instance {{ raw('$labels.instance') }})
- alert: PostgresqlReplicationLag
expr:
pg_replication_lag > 30
and
ON(instance) pg_replication_is_replica == 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
Réplication Postgresql lag ({{ raw('$value') }} > 30s)
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname')}} )
- alert: PostgresqlTableNotVaccumed
expr:
time() - pg_stat_user_tables_last_autovacuum
> 60 * 60 * 24
for: 0m
labels:
severity: warning
annotations:
summary: >-
La table n'a pas été aspirée depuis 24h
(Instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )
- alert: PostgresqlTableNotAnalyzed
expr:
time() - pg_stat_user_tables_last_autoanalyze
> 60 * 60 * 24
for: 0m
labels:
severity: warning
annotations:
summary: >-
Table non-analysée depuis 24h
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }})
- alert: PostgresqlTooManyConnections
expr:
(
sum by (datname)
(pg_stat_activity_count{datname!~"template.*|postgres"})
) * 100
> pg_settings_max_connections * 80
for: 2m
labels:
severity: warning
annotations:
summary: >-
PostgreSQL a trop de connexions
({{ raw('$value | printf "%.1f"') }} > 80%)
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }})
- alert: PostgresqlDeadLocks
expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
for: 0m
labels:
severity: warning
annotations:
summary: >-
PostgreSQL a des cadenas morts
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )
- alert: PostgresqlSlowQueries
expr: pg_slow_queries > 0
for: 2m
labels:
severity: warning
annotations:
summary: >-
Présence de requêtes lentes (slow-queries)
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )
- alert: PostgresqlHighRollbackRate
expr:
(
rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) /
rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
) * 100
> 2
for: 0m
labels:
severity: warning
annotations:
summary: >-
Postgresql a un taux de retour en arrière (rollback) élevé
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} %)
- alert: PostgresqlWaleReplicationStopped
expr: rate(pg_xlog_position_bytes[1m]) == 0
for: 0m
labels:
severity: critical
annotations:
summary: >-
Réplication de Postgresql WALE stopée
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )
- alert: PostgresqlHighRateStatementTimeout
expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
for: 0m
labels:
severity: critical
annotations:
summary: >-
Beaucoup de requêtes Postgresql sont timeout
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} )
- alert: PostgresqlHighRateDeadlock
expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
Postgresql a un fort taux de deadlock
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} )
# - alert: PostgresqlReplicationLagBytes
# expr:
# (pg_xlog_position_bytes and pg_replication_is_replica == 0)
# - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1)
# > 1e+09
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} )
pz2891 marked this conversation as resolved
Review

Idem (instance superflue).
Mettre "database" et "value" en français ?

Idem (instance superflue). Mettre "database" et "value" en français ?
- alert: PostgresqlTooManyDeadTuples
expr:
(
(pg_stat_user_tables_n_dead_tup > 10000)
/ (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
for: 2m
labels:
severity: warning
annotations:
summary: >-
Les tuples morts PostgreSQL sont trop volumineux
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} )
- alert: PostgresqlSplitBrain
expr: count(pg_replication_is_replica == 0) != 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
Split Brain, trop de bases de données Postgresql primaires en mode lecture-écriture
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value') }} )
- alert: PostgresqlPromotedNode
expr:
pg_replication_is_replica
and
changes(pg_replication_is_replica[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: >-
Le serveur de secours PostgreSQL a été promu comme nœud principal
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value') }})
- alert: PostgresqlTooManyLocksAcquired
expr:
(
(sum (pg_locks_count))
/ (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
) * 100 > 20
for: 2m
labels:
severity: critical
annotations:
summary: >-
Trop de verrous acquis sur la base de données.
Si cette alerte se produit fréquemment, nous devrons peut-être augmenter
le paramètre postgres max_locks_per_transaction
(instance {{ raw('$labels.instance') }}, value = {{ raw('$value | printf "%.1f"') }} )
...

View file

@ -20,8 +20,7 @@ alerting:
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "alert.rules.yml" # Monitoring alerts, this is the file you may be searching!
- "django.rules.yml" # Custom rules specific for Django project monitoring
- "alerts/*.yml" # Monitoring alerts, this is the file you may be searching!
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.

View file

@ -7,7 +7,7 @@
groups:
- name: alert.rules
- name: server.rules
rules:
- alert: InstanceDown
@ -149,78 +149,5 @@ groups:
summary: >
Charge à {{ raw('$value') }}
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 0m
labels:
severity: critical
annotations:
summary: >-
Source d'alimentation changée
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 3
for: 0m
labels:
severity: warning
annotations:
summary: >-
État de la batterie faible
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 4
for: 0m
labels:
severity: critical
annotations:
summary: >-
État de la batterie critique
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 3m
labels:
severity: critical
annotations:
summary: >-
Charge de {{ raw('$value | printf "%.1f"') }}%
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension d'entrée de {{ raw('$value') }}V
- alert: UpsWrongOutputVoltage
expr: >-
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
< 3 * stddev_over_time(upsOutputVoltage[1d])
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension de sortie de {{ raw('$value') }}V
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 8
for: 0m
labels:
severity: warning
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 5
for: 0m
labels:
severity: critical
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
...

View file

@ -0,0 +1,87 @@
---
{{ ansible_managed | comment }}
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: ups.rules
rules:
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 0m
labels:
severity: critical
annotations:
summary: >-
Source d'alimentation changée
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 3
for: 0m
labels:
severity: warning
annotations:
summary: >-
État de la batterie faible
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 4
for: 0m
labels:
severity: critical
annotations:
summary: >-
État de la batterie critique
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 3m
labels:
severity: critical
annotations:
summary: >-
Charge de {{ raw('$value | printf "%.1f"') }}%
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension d'entrée de {{ raw('$value') }}V
- alert: UpsWrongOutputVoltage
expr: >-
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
< 3 * stddev_over_time(upsOutputVoltage[1d])
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension de sortie de {{ raw('$value') }}V
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 8
for: 0m
labels:
severity: warning
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 5
for: 0m
labels:
severity: critical
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
...

View file

@ -0,0 +1,21 @@
---
- name: Install Prometheus postgres-exporter
apt:
update_cache: true
name: prometheus-postgres-exporter
register: apt_result
retries: 3
until: apt_result is succeeded
when:
- ansible_lsb.codename != 'bullseye'
# Doesn't work on Debian Stretch with the old prometheus package
- name: Make Prometheus node-exporter listen on adm only
lineinfile:
path: /etc/default/prometheus-node-exporter
regexp: '^ARGS='
line: |
ARGS="--web.listen-address={{ ansible_hostname }}.adm.auro.re:9100"
notify: Restart prometheus-node-exporter
git push --set-upstream origin add_ups_231