Installation Prometheus-postgres-exporter #53
23
hosts
|
@ -549,3 +549,26 @@ proxy.adm.auro.re
|
||||||
bdd.adm.auro.re
|
bdd.adm.auro.re
|
||||||
bdd-ovh.adm.auro.re
|
bdd-ovh.adm.auro.re
|
||||||
re2o-db.adm.auro.re
|
re2o-db.adm.auro.re
|
||||||
|
|
||||||
|
[radius]
|
||||||
|
radius-aurore.adm.auro.re
|
||||||
|
radius-fleming.adm.auro.re
|
||||||
|
radius-fleming-backup.adm.auro.re
|
||||||
|
radius-edc.adm.auro.re
|
||||||
|
radius-edc-backup.adm.auro.re
|
||||||
|
radius-gs.adm.auro.re
|
||||||
|
radius-gs-backup.adm.auro.re
|
||||||
|
radius-pacaterie.adm.auro.re
|
||||||
|
radius-pacaterie-backup.adm.auro.re
|
||||||
|
radius-rives.adm.auro.re
|
||||||
|
radius-rives-backup.adm.auro.re
|
||||||
|
|
||||||
|
[prometheus]
|
||||||
|
prometheus-ovh.adm.auro.re
|
||||||
|
prometheus-aurore.adm.auro.re
|
||||||
|
prometheus-rives.adm.auro.re
|
||||||
|
prometheus-gs.adm.auro.re
|
||||||
|
prometheus-edc.adm.auro.re
|
||||||
|
prometheus-pacaterie.adm.auro.re
|
||||||
|
prometheus-fleming.adm.auro.re
|
||||||
|
prometheus-federate.adm.auro.re
|
||||||
|
|
|
@ -99,6 +99,9 @@
|
||||||
prometheus_targets:
|
prometheus_targets:
|
||||||
- targets: |
|
- targets: |
|
||||||
{{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
|
{{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
|
||||||
|
prometheus_postgres_targets:
|
||||||
|
- targets: |
|
||||||
|
{{ groups['bdd'] + groups['radius'] | list | sort }}
|
||||||
prometheus_switch_snmp_targets:
|
prometheus_switch_snmp_targets:
|
||||||
- targets:
|
- targets:
|
||||||
- yggdrasil.switch.auro.re
|
- yggdrasil.switch.auro.re
|
||||||
|
@ -128,8 +131,11 @@
|
||||||
prometheus_targets:
|
prometheus_targets:
|
||||||
- targets: |
|
- targets: |
|
||||||
{{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
|
{{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
|
||||||
|
prometheus_postgres_targets:
|
||||||
|
- targets:
|
||||||
|
- bdd-ovh.adm.auro.re
|
||||||
prometheus_docker_targets:
|
prometheus_docker_targets:
|
||||||
- docker-ovh.adm.auro.re:8087
|
- docker-ovh.adm.auro.re
|
||||||
roles:
|
roles:
|
||||||
- prometheus
|
- prometheus
|
||||||
|
|
||||||
|
@ -151,6 +157,11 @@
|
||||||
roles:
|
roles:
|
||||||
- prometheus_federate
|
- prometheus_federate
|
||||||
|
|
||||||
|
# Postgres Exporters
|
||||||
|
- hosts: bdd,radius
|
||||||
|
roles:
|
||||||
|
- prometheus_postgres
|
||||||
|
|
||||||
|
|
||||||
# Monitor all hosts
|
# Monitor all hosts
|
||||||
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
|
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
|
||||||
|
|
|
@ -18,8 +18,29 @@
|
||||||
mode: u=r,g=r,o=
|
mode: u=r,g=r,o=
|
||||||
loop:
|
loop:
|
||||||
- prometheus.yml
|
- prometheus.yml
|
||||||
- alert.rules.yml
|
notify: Restart Prometheus
|
||||||
|
|
||||||
|
- name: Creates directory for alerts
|
||||||
|
file:
|
||||||
pz2891 marked this conversation as resolved
|
|||||||
|
path: /etc/prometheus/alerts
|
||||||
|
state: directory
|
||||||
|
owner: prometheus
|
||||||
|
group: prometheus
|
||||||
|
mode: 0755
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Est-ce que c'est la peine de garder ces règles une fois que ça a été exécuté une fois sur le parc ?
Peut-être que le fichier `alert.rules.yml` est créé par défaut dans l'installation Debian ?
|
|||||||
|
|
||||||
|
- name: Configure Prometheus alerts
|
||||||
|
template:
|
||||||
|
src: "{{ item }}.j2"
|
||||||
|
dest: "/etc/prometheus/alerts/{{ item }}"
|
||||||
|
owner: prometheus
|
||||||
|
group: prometheus
|
||||||
|
mode: u=r,g=r,o=
|
||||||
|
loop:
|
||||||
|
- server.rules.yml
|
||||||
|
- docker.rules.yml
|
||||||
- django.rules.yml
|
- django.rules.yml
|
||||||
|
- ups.rules.yml
|
||||||
|
- postgres.rules.yml
|
||||||
notify: Restart Prometheus
|
notify: Restart Prometheus
|
||||||
|
|
||||||
- name: Make Prometheus snmp-exporter listen on localhost only
|
- name: Make Prometheus snmp-exporter listen on localhost only
|
||||||
|
@ -75,6 +96,13 @@
|
||||||
mode: 0644
|
mode: 0644
|
||||||
when: prometheus_docker_targets is defined
|
when: prometheus_docker_targets is defined
|
||||||
|
|
||||||
|
- name: Configure Prometheus postgres monitoring
|
||||||
|
copy:
|
||||||
|
content: "{{ prometheus_postgres_targets | to_nice_json }}\n"
|
||||||
|
dest: /etc/prometheus/targets_postgres.json
|
||||||
|
mode: 0644
|
||||||
|
when: prometheus_postgres_targets is defined
|
||||||
|
|
||||||
- name: Activate prometheus service
|
- name: Activate prometheus service
|
||||||
systemd:
|
systemd:
|
||||||
jeltz
commented
Ajout propriétaire et groupe ?
|
|||||||
name: prometheus
|
name: prometheus
|
||||||
|
|
50
roles/prometheus/templates/docker.rules.yml.j2
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
---
|
||||||
|
{{ ansible_managed | comment }}
|
||||||
|
|
||||||
|
{% macro raw(string) -%}
|
||||||
|
{{ "{{" }} {{ string }} {{ "}}" }}
|
||||||
|
{%- endmacro %}
|
||||||
|
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: docker.rules
|
||||||
|
rules:
|
||||||
|
|
||||||
|
- alert: ContainerDown
|
||||||
|
expr: docker_container_running_state != 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le container Docker est éteint / tombé
|
||||||
pz2891 marked this conversation as resolved
Outdated
otthorn
commented
éteind ?
otthorn
commented
Je sais pas trop, je trouve que mort en français ça fait bizarre. Mais en même temps "dead" semble être le terme correct en anglais.
otthorn
commented
Pour postgres en dessous on utilise "down", ce qui est un anglicisme mais c'est pas mal.
Sinon on peut dire "tombé" qui est une traduction un peu abusive, mais ça je trouve pas ça choquant.
|
|||||||
|
(container {{ raw('$labels.name') }})
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Pas la peine de mettre l'instance, on l'affiche déjà avec Alertbot automatiquement.
|
|||||||
|
|
||||||
|
- alert: ContainerFailed
|
||||||
|
expr: sum(increase(docker_container_restart_count[5m])) > 2
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le container Docker redémarre souvent
|
||||||
|
(container {{ raw('$labels.name') }})
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
|
|||||||
|
|
||||||
|
- alert: ContainerFailed
|
||||||
|
expr:
|
||||||
|
(
|
||||||
|
docker_container_cpu_used_total
|
||||||
|
/
|
||||||
|
docker_container_cpu_capacity_total
|
||||||
|
) * 100
|
||||||
|
> 30
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le container Docker utilise beaucoup de CPU
|
||||||
pz2891 marked this conversation as resolved
Outdated
otthorn
commented
Faudrait peut-être faire des phrases.
C'est comme si on disait "Server utilise beaucoup de CPU", c'est pas syntaxiquement correct.
"Le container Docker utilise beaucoup de CPU" peut-être ?
|
|||||||
|
(container {{ raw('$labels.name') }},
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
|
|||||||
|
valeur {{ raw('$value | printf "%.1f"') }})
|
||||||
|
|
||||||
|
...
|
219
roles/prometheus/templates/postgres.rules.yml.j2
Normal file
|
@ -0,0 +1,219 @@
|
||||||
|
---
|
||||||
|
{{ ansible_managed | comment }}
|
||||||
|
|
||||||
|
{% macro raw(string) -%}
|
||||||
|
{{ "{{" }} {{ string }} {{ "}}" }}
|
||||||
|
{%- endmacro %}
|
||||||
|
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: postgres.rules
|
||||||
|
rules:
|
||||||
|
- alert: PostgresqlDown
|
||||||
|
expr: pg_up == 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Serveur PostgreSQL down
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
|
|||||||
|
|
||||||
|
- alert: PostgresqlRestarted
|
||||||
|
expr: time() - pg_postmaster_start_time_seconds < 60
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Serveur PostgreSQL redémarré
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
|
|||||||
|
|
||||||
|
- alert: PostgresqlExporterError
|
||||||
|
expr: pg_exporter_last_scrape_error > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Erreur dans l'exporter PostgreSQL
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
|
|||||||
|
|
||||||
|
- alert: PostgresqlReplicationLag
|
||||||
|
expr:
|
||||||
|
pg_replication_lag > 30
|
||||||
|
and
|
||||||
|
ON(instance) pg_replication_is_replica == 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
La réplication PostgreSQL lag ({{ raw('$value') }} > 30s)
|
||||||
|
(base de données {{ raw('$labels.datname') }} )
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" en français ?
|
|||||||
|
|
||||||
|
- alert: PostgresqlTableNotVaccumed
|
||||||
|
expr:
|
||||||
|
time() - pg_stat_user_tables_last_autovacuum
|
||||||
|
> 60 * 60 * 24
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le démon autovacuum n'a pas été lancé depuis 24h
|
||||||
pz2891 marked this conversation as resolved
Outdated
otthorn
commented
Je suis pas sûr que des gens utilisent vraiment "aspirée" pour traduire "autovacuum"
https://docs.postgresql.fr/8.1/runtime-config-autovacuum.html (seule source que j'ai trouvée en français)
|
|||||||
|
(base de données {{ raw('$labels.datname') }} )
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" en français ?
|
|||||||
|
|
||||||
|
- alert: PostgresqlTableNotAnalyzed
|
||||||
|
expr:
|
||||||
|
time() - pg_stat_user_tables_last_autoanalyze
|
||||||
|
> 60 * 60 * 24
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Table non-analysée depuis 24h
|
||||||
|
(base de données {{ raw('$labels.datname') }})
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" en français ?
|
|||||||
|
|
||||||
|
- alert: PostgresqlTooManyConnections
|
||||||
|
expr:
|
||||||
|
(
|
||||||
|
sum by (datname)
|
||||||
|
(pg_stat_activity_count{datname!~"template.*|postgres"})
|
||||||
|
) * 100
|
||||||
|
> pg_settings_max_connections * 80
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
PostgreSQL a trop de connexions
|
||||||
|
({{ raw('$value | printf "%.1f"') }} > 80%)
|
||||||
|
(base de données {{ raw('$labels.datname') }})
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" en français ?
|
|||||||
|
|
||||||
|
- alert: PostgresqlDeadLocks
|
||||||
|
expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
PostgreSQL a plus de 5 deadlocks.
|
||||||
pz2891 marked this conversation as resolved
Outdated
otthorn
commented
Pareil je pense pas que ça se traduise
Source random https://forum.postgresql.fr/viewtopic.php?id=5643
|
|||||||
|
(base de données {{ raw('$labels.datname') }} )
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" en français ?
|
|||||||
|
|
||||||
|
- alert: PostgresqlSlowQueries
|
||||||
|
expr: pg_slow_queries > 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Présence de requêtes lentes (slow-queries)
|
||||||
|
(base de données {{ raw('$labels.datname') }} )
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" en français ?
|
|||||||
|
|
||||||
|
- alert: PostgresqlHighRollbackRate
|
||||||
|
expr:
|
||||||
|
(
|
||||||
|
rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) /
|
||||||
|
rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
|
||||||
|
) * 100
|
||||||
|
> 7
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
PostgreSQL a un taux de retour en arrière (rollback) élevé
|
||||||
|
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} %)
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" et "value" en français ?
|
|||||||
|
|
||||||
|
- alert: PostgresqlWaleReplicationStopped
|
||||||
|
expr: rate(pg_xlog_position_bytes[1m]) == 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Réplication de PostgreSQL WALE stoppée
|
||||||
pz2891 marked this conversation as resolved
Outdated
otthorn
commented
WALE ?
https://www.postgresql.org/search/?u=%2Fdocs%2F13%2F&q=WALE
WAL peut-être ?
Sinon je ne sais pas ce que c'est.
pz2891
commented
J'ai repris la source : https://awesome-prometheus-alerts.grep.to/rules.html#rule-postgresql-15
|
|||||||
|
(base de données {{ raw('$labels.datname') }} )
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" en français ?
"stoppée".
|
|||||||
|
|
||||||
|
- alert: PostgresqlHighRateStatementTimeout
|
||||||
|
expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Beaucoup de requêtes PostgreSQL sont timeout
|
||||||
|
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" et "value" en français ?
|
|||||||
|
|
||||||
|
- alert: PostgresqlHighRateDeadlock
|
||||||
|
expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
PostgreSQL a un fort taux de deadlock
|
||||||
|
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" et "value" en français ?
|
|||||||
|
|
||||||
|
# - alert: PostgresqlReplicationLagBytes
|
||||||
|
# expr:
|
||||||
|
# (pg_xlog_position_bytes and pg_replication_is_replica == 0)
|
||||||
|
# - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1)
|
||||||
|
# > 1e+09
|
||||||
|
# for: 0m
|
||||||
|
# labels:
|
||||||
|
# severity: critical
|
||||||
|
# annotations:
|
||||||
|
# summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} )
|
||||||
pz2891 marked this conversation as resolved
jeltz
commented
Idem (instance superflue).
Mettre "database" et "value" en français ?
|
|||||||
|
|
||||||
|
- alert: PostgresqlTooManyDeadTuples
|
||||||
|
expr:
|
||||||
|
(
|
||||||
|
(pg_stat_user_tables_n_dead_tup > 10000)
|
||||||
|
/ (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
|
||||||
|
) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Les tuples morts PostgreSQL sont trop volumineux
|
||||||
|
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" et "value" en français ?
|
|||||||
|
|
||||||
|
- alert: PostgresqlSplitBrain
|
||||||
|
expr: count(pg_replication_is_replica == 0) != 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Split Brain : trop de bases de données PostgreSQL primaires en mode lecture-écriture
|
||||||
pz2891 marked this conversation as resolved
Outdated
otthorn
commented
proposition: "Split Brain : trop de bases [...]"
|
|||||||
|
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }} )
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" et "value" en français ?
|
|||||||
|
|
||||||
|
- alert: PostgresqlPromotedNode
|
||||||
|
expr:
|
||||||
|
pg_replication_is_replica
|
||||||
|
and
|
||||||
|
changes(pg_replication_is_replica[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le serveur de secours PostgreSQL a été promu comme nœud principal
|
||||||
|
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }})
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" et "value" en français ?
|
|||||||
|
|
||||||
|
- alert: PostgresqlTooManyLocksAcquired
|
||||||
|
expr:
|
||||||
|
(
|
||||||
|
(sum (pg_locks_count))
|
||||||
|
/ (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
|
||||||
|
) * 100 > 20
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Trop de deadlocks acquis sur la base de données.
|
||||||
pz2891 marked this conversation as resolved
Outdated
otthorn
commented
Pareil, je ne pense pas que ça se traduise. Ou alors peut-être plutôt par "Verrouillages" non ?
J'ai pas de source pour ça.
|
|||||||
|
Si cette alerte se produit fréquemment, nous devrons peut-être augmenter
|
||||||
|
le paramètre postgres max_locks_per_transaction
|
||||||
|
(Valeur = {{ raw('$value | printf "%.1f"') }} )
|
||||||
pz2891 marked this conversation as resolved
Outdated
jeltz
commented
Idem (instance superflue).
Mettre "database" et "value" en français ?
|
|||||||
|
|
||||||
|
...
|
||||||
|
|
|
@ -20,8 +20,7 @@ alerting:
|
||||||
|
|
||||||
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
|
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
|
||||||
rule_files:
|
rule_files:
|
||||||
- "alert.rules.yml" # Monitoring alerts, this is the file you may be searching!
|
- "alerts/*.yml" # Monitoring alerts, this is the file you may be searching!
|
||||||
- "django.rules.yml" # Custom rules specific for Django project monitoring
|
|
||||||
|
|
||||||
# A scrape configuration containing exactly one endpoint to scrape:
|
# A scrape configuration containing exactly one endpoint to scrape:
|
||||||
# Here it's Prometheus itself.
|
# Here it's Prometheus itself.
|
||||||
|
@ -101,4 +100,27 @@ scrape_configs:
|
||||||
file_sd_configs:
|
file_sd_configs:
|
||||||
- files:
|
- files:
|
||||||
- '/etc/prometheus/targets_docker.json'
|
- '/etc/prometheus/targets_docker.json'
|
||||||
|
relabel_configs:
|
||||||
|
# Do not put :8087 in instance name, rather here
|
||||||
|
- source_labels: [__address__]
|
||||||
|
target_label: __param_target
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: instance
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: __address__
|
||||||
|
replacement: '$1:8087'
|
||||||
|
|
||||||
|
- job_name: postgresql
|
||||||
|
file_sd_configs:
|
||||||
|
- files:
|
||||||
|
- '/etc/prometheus/targets_postgres.json'
|
||||||
|
relabel_configs:
|
||||||
|
# Do not put :9187 in instance name, rather here
|
||||||
|
- source_labels: [__address__]
|
||||||
|
target_label: __param_target
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: instance
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: __address__
|
||||||
|
replacement: '$1:9187'
|
||||||
...
|
...
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
|
|
||||||
groups:
|
groups:
|
||||||
|
|
||||||
- name: alert.rules
|
- name: server.rules
|
||||||
rules:
|
rules:
|
||||||
|
|
||||||
- alert: InstanceDown
|
- alert: InstanceDown
|
||||||
|
@ -149,78 +149,5 @@ groups:
|
||||||
summary: >
|
summary: >
|
||||||
Charge à {{ raw('$value') }}
|
Charge à {{ raw('$value') }}
|
||||||
|
|
||||||
- alert: UpsOutputSourceChanged
|
|
||||||
expr: upsOutputSource != 3
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
Source d'alimentation changée
|
|
||||||
|
|
||||||
- alert: UpsBatteryStatus
|
|
||||||
expr: upsBatteryStatus == 3
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
État de la batterie faible
|
|
||||||
|
|
||||||
- alert: UpsBatteryStatus
|
|
||||||
expr: upsBatteryStatus == 4
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
État de la batterie critique
|
|
||||||
|
|
||||||
- alert: UpsHighLoad
|
|
||||||
expr: upsOutputPercentLoad > 70
|
|
||||||
for: 3m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
Charge de {{ raw('$value | printf "%.1f"') }}%
|
|
||||||
|
|
||||||
- alert: UpsWrongInputVoltage
|
|
||||||
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
Tension d'entrée de {{ raw('$value') }}V
|
|
||||||
|
|
||||||
- alert: UpsWrongOutputVoltage
|
|
||||||
expr: >-
|
|
||||||
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
|
|
||||||
< 3 * stddev_over_time(upsOutputVoltage[1d])
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
Tension de sortie de {{ raw('$value') }}V
|
|
||||||
|
|
||||||
- alert: UpsTimeRemaining
|
|
||||||
expr: upsEstimatedMinutesRemaining < 8
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
Autonomie restante de {{ raw('$value') }} min
|
|
||||||
|
|
||||||
- alert: UpsTimeRemaining
|
|
||||||
expr: upsEstimatedMinutesRemaining < 5
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
Autonomie restante de {{ raw('$value') }} min
|
|
||||||
|
|
||||||
...
|
...
|
87
roles/prometheus/templates/ups.rules.yml.j2
Normal file
|
@ -0,0 +1,87 @@
|
||||||
|
---
|
||||||
|
{{ ansible_managed | comment }}
|
||||||
|
|
||||||
|
{% macro raw(string) -%}
|
||||||
|
{{ "{{" }} {{ string }} {{ "}}" }}
|
||||||
|
{%- endmacro %}
|
||||||
|
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: ups.rules
|
||||||
|
rules:
|
||||||
|
|
||||||
|
- alert: UpsOutputSourceChanged
|
||||||
|
expr: upsOutputSource != 3
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Source d'alimentation changée
|
||||||
|
|
||||||
|
- alert: UpsBatteryStatus
|
||||||
|
expr: upsBatteryStatus == 3
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
État de la batterie faible
|
||||||
|
|
||||||
|
- alert: UpsBatteryStatus
|
||||||
|
expr: upsBatteryStatus == 4
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
État de la batterie critique
|
||||||
|
|
||||||
|
- alert: UpsHighLoad
|
||||||
|
expr: upsOutputPercentLoad > 70
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Charge de {{ raw('$value | printf "%.1f"') }}%
|
||||||
|
|
||||||
|
- alert: UpsWrongInputVoltage
|
||||||
|
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Tension d'entrée de {{ raw('$value') }}V
|
||||||
|
|
||||||
|
- alert: UpsWrongOutputVoltage
|
||||||
|
expr: >-
|
||||||
|
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
|
||||||
|
< 3 * stddev_over_time(upsOutputVoltage[1d])
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Tension de sortie de {{ raw('$value') }}V
|
||||||
|
|
||||||
|
- alert: UpsTimeRemaining
|
||||||
|
expr: upsEstimatedMinutesRemaining < 8
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Autonomie restante de {{ raw('$value') }} min
|
||||||
|
|
||||||
|
- alert: UpsTimeRemaining
|
||||||
|
expr: upsEstimatedMinutesRemaining < 5
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Autonomie restante de {{ raw('$value') }} min
|
||||||
|
|
||||||
|
...
|
|
@ -31,6 +31,7 @@ scrape_configs:
|
||||||
params:
|
params:
|
||||||
match[]:
|
match[]:
|
||||||
- '{job="servers"}'
|
- '{job="servers"}'
|
||||||
|
- '{job="postgresql"}'
|
||||||
- '{job="prometheus"}'
|
- '{job="prometheus"}'
|
||||||
- '{job="unifi_snmp"}'
|
- '{job="unifi_snmp"}'
|
||||||
- '{job="django"}'
|
- '{job="django"}'
|
||||||
|
|
5
roles/prometheus_postgres/handlers/main.yml
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
---
|
||||||
|
- name: Restart prometheus-postgres-exporter
|
||||||
|
service:
|
||||||
|
name: prometheus-postgres-exporter
|
||||||
|
state: restarted
|
39
roles/prometheus_postgres/tasks/main.yml
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
---
|
||||||
|
- name: Install Prometheus postgres-exporter
|
||||||
|
apt:
|
||||||
|
update_cache: true
|
||||||
|
name: prometheus-postgres-exporter
|
||||||
|
register: apt_result
|
||||||
|
retries: 3
|
||||||
|
until: apt_result is succeeded
|
||||||
|
|
||||||
|
- name: Make Prometheus postgres-exporter connect to databases using peercred
|
||||||
|
lineinfile:
|
||||||
|
path: /etc/default/prometheus-postgres-exporter
|
||||||
|
regexp: '^DATA_SOURCE_NAME='
|
||||||
|
line: |
|
||||||
|
DATA_SOURCE_NAME="user=postgres host=/var/run/postgresql/ sslmode=disable"
|
||||||
|
notify: Restart prometheus-postgres-exporter
|
||||||
|
|
||||||
|
- name: Make Prometheus postgres-exporter launched by postgres user
|
||||||
|
lineinfile:
|
||||||
|
path: /lib/systemd/system/prometheus-postgres-exporter.service
|
||||||
|
regexp: '^User='
|
||||||
|
line: |
|
||||||
|
User=postgres
|
||||||
|
notify: Restart prometheus-postgres-exporter
|
||||||
|
|
||||||
|
- name: Make Prometheus postgres-exporter listen on adm only
|
||||||
|
lineinfile:
|
||||||
|
path: /etc/default/prometheus-postgres-exporter
|
||||||
|
regexp: '^ARGS='
|
||||||
|
line: |
|
||||||
|
ARGS="--web.listen-address={{ ansible_hostname }}.adm.auro.re:9187"
|
||||||
|
notify: Restart prometheus-postgres-exporter
|
||||||
|
|
||||||
|
- name: Activate prometheus-postgres-exporter service
|
||||||
|
systemd:
|
||||||
|
name: prometheus-postgres-exporter
|
||||||
|
enabled: true
|
||||||
|
daemon_reload: true
|
||||||
|
state: started
|
|
@ -129,7 +129,7 @@
|
||||||
name:
|
name:
|
||||||
- postgresql
|
- postgresql
|
||||||
- postgresql-client-11=11.7-0+deb10u1
|
- postgresql-client-11=11.7-0+deb10u1
|
||||||
force: yes
|
force: true
|
||||||
|
|
||||||
- name: Install postgresql ansible module requirement(s)
|
- name: Install postgresql ansible module requirement(s)
|
||||||
pip:
|
pip:
|
||||||
|
|
Ça pourrait être bien de préciser le propriétaire et le groupe du dossier.