Merge pull request 'Installation Prometheus-postgres-exporter' (#53) from prometheus_postgres_exporter into master
Some checks failed
continuous-integration/drone/push Build is failing
Some checks failed
continuous-integration/drone/push Build is failing
Reviewed-on: Aurore/ansible#53 Reviewed-by: otthorn <otthorn@noreply@auro.re> Reviewed-by: jeltz <jeltz@noreply@auro.re>
This commit is contained in:
commit
1520ad92c8
12 changed files with 491 additions and 79 deletions
23
hosts
23
hosts
|
@ -549,3 +549,26 @@ proxy.adm.auro.re
|
||||||
bdd.adm.auro.re
|
bdd.adm.auro.re
|
||||||
bdd-ovh.adm.auro.re
|
bdd-ovh.adm.auro.re
|
||||||
re2o-db.adm.auro.re
|
re2o-db.adm.auro.re
|
||||||
|
|
||||||
|
[radius]
|
||||||
|
radius-aurore.adm.auro.re
|
||||||
|
radius-fleming.adm.auro.re
|
||||||
|
radius-fleming-backup.adm.auro.re
|
||||||
|
radius-edc.adm.auro.re
|
||||||
|
radius-edc-backup.adm.auro.re
|
||||||
|
radius-gs.adm.auro.re
|
||||||
|
radius-gs-backup.adm.auro.re
|
||||||
|
radius-pacaterie.adm.auro.re
|
||||||
|
radius-pacaterie-backup.adm.auro.re
|
||||||
|
radius-rives.adm.auro.re
|
||||||
|
radius-rives-backup.adm.auro.re
|
||||||
|
|
||||||
|
[prometheus]
|
||||||
|
prometheus-ovh.adm.auro.re
|
||||||
|
prometheus-aurore.adm.auro.re
|
||||||
|
prometheus-rives.adm.auro.re
|
||||||
|
prometheus-gs.adm.auro.re
|
||||||
|
prometheus-edc.adm.auro.re
|
||||||
|
prometheus-pacaterie.adm.auro.re
|
||||||
|
prometheus-fleming.adm.auro.re
|
||||||
|
prometheus-federate.adm.auro.re
|
||||||
|
|
|
@ -99,6 +99,9 @@
|
||||||
prometheus_targets:
|
prometheus_targets:
|
||||||
- targets: |
|
- targets: |
|
||||||
{{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
|
{{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
|
||||||
|
prometheus_postgres_targets:
|
||||||
|
- targets: |
|
||||||
|
{{ groups['bdd'] + groups['radius'] | list | sort }}
|
||||||
prometheus_switch_snmp_targets:
|
prometheus_switch_snmp_targets:
|
||||||
- targets:
|
- targets:
|
||||||
- yggdrasil.switch.auro.re
|
- yggdrasil.switch.auro.re
|
||||||
|
@ -128,8 +131,11 @@
|
||||||
prometheus_targets:
|
prometheus_targets:
|
||||||
- targets: |
|
- targets: |
|
||||||
{{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
|
{{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
|
||||||
|
prometheus_postgres_targets:
|
||||||
|
- targets:
|
||||||
|
- bdd-ovh.adm.auro.re
|
||||||
prometheus_docker_targets:
|
prometheus_docker_targets:
|
||||||
- docker-ovh.adm.auro.re:8087
|
- docker-ovh.adm.auro.re
|
||||||
roles:
|
roles:
|
||||||
- prometheus
|
- prometheus
|
||||||
|
|
||||||
|
@ -151,6 +157,11 @@
|
||||||
roles:
|
roles:
|
||||||
- prometheus_federate
|
- prometheus_federate
|
||||||
|
|
||||||
|
# Postgres Exporters
|
||||||
|
- hosts: bdd,radius
|
||||||
|
roles:
|
||||||
|
- prometheus_postgres
|
||||||
|
|
||||||
|
|
||||||
# Monitor all hosts
|
# Monitor all hosts
|
||||||
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
|
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
|
||||||
|
|
|
@ -18,8 +18,29 @@
|
||||||
mode: u=r,g=r,o=
|
mode: u=r,g=r,o=
|
||||||
loop:
|
loop:
|
||||||
- prometheus.yml
|
- prometheus.yml
|
||||||
- alert.rules.yml
|
notify: Restart Prometheus
|
||||||
|
|
||||||
|
- name: Creates directory for alerts
|
||||||
|
file:
|
||||||
|
path: /etc/prometheus/alerts
|
||||||
|
state: directory
|
||||||
|
owner: prometheus
|
||||||
|
group: prometheus
|
||||||
|
mode: 0755
|
||||||
|
|
||||||
|
- name: Configure Prometheus alerts
|
||||||
|
template:
|
||||||
|
src: "{{ item }}.j2"
|
||||||
|
dest: "/etc/prometheus/alerts/{{ item }}"
|
||||||
|
owner: prometheus
|
||||||
|
group: prometheus
|
||||||
|
mode: u=r,g=r,o=
|
||||||
|
loop:
|
||||||
|
- server.rules.yml
|
||||||
|
- docker.rules.yml
|
||||||
- django.rules.yml
|
- django.rules.yml
|
||||||
|
- ups.rules.yml
|
||||||
|
- postgres.rules.yml
|
||||||
notify: Restart Prometheus
|
notify: Restart Prometheus
|
||||||
|
|
||||||
- name: Make Prometheus snmp-exporter listen on localhost only
|
- name: Make Prometheus snmp-exporter listen on localhost only
|
||||||
|
@ -75,6 +96,13 @@
|
||||||
mode: 0644
|
mode: 0644
|
||||||
when: prometheus_docker_targets is defined
|
when: prometheus_docker_targets is defined
|
||||||
|
|
||||||
|
- name: Configure Prometheus postgres monitoring
|
||||||
|
copy:
|
||||||
|
content: "{{ prometheus_postgres_targets | to_nice_json }}\n"
|
||||||
|
dest: /etc/prometheus/targets_postgres.json
|
||||||
|
mode: 0644
|
||||||
|
when: prometheus_postgres_targets is defined
|
||||||
|
|
||||||
- name: Activate prometheus service
|
- name: Activate prometheus service
|
||||||
systemd:
|
systemd:
|
||||||
name: prometheus
|
name: prometheus
|
||||||
|
|
50
roles/prometheus/templates/docker.rules.yml.j2
Normal file
50
roles/prometheus/templates/docker.rules.yml.j2
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
---
|
||||||
|
{{ ansible_managed | comment }}
|
||||||
|
|
||||||
|
{% macro raw(string) -%}
|
||||||
|
{{ "{{" }} {{ string }} {{ "}}" }}
|
||||||
|
{%- endmacro %}
|
||||||
|
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: docker.rules
|
||||||
|
rules:
|
||||||
|
|
||||||
|
- alert: ContainerDown
|
||||||
|
expr: docker_container_running_state != 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le container Docker est éteint / tombé
|
||||||
|
(container {{ raw('$labels.name') }})
|
||||||
|
|
||||||
|
- alert: ContainerFailed
|
||||||
|
expr: sum(increase(docker_container_restart_count[5m])) > 2
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le container Docker redémarre souvent
|
||||||
|
(container {{ raw('$labels.name') }})
|
||||||
|
|
||||||
|
- alert: ContainerHighCpuUsage
|
||||||
|
expr:
|
||||||
|
(
|
||||||
|
docker_container_cpu_used_total
|
||||||
|
/
|
||||||
|
docker_container_cpu_capacity_total
|
||||||
|
) * 100
|
||||||
|
> 30
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le container Docker utilise beaucoup de CPU
|
||||||
|
(container {{ raw('$labels.name') }},
|
||||||
|
valeur {{ raw('$value | printf "%.1f"') }})
|
||||||
|
|
||||||
|
...
|
219
roles/prometheus/templates/postgres.rules.yml.j2
Normal file
219
roles/prometheus/templates/postgres.rules.yml.j2
Normal file
|
@ -0,0 +1,219 @@
|
||||||
|
---
|
||||||
|
{{ ansible_managed | comment }}
|
||||||
|
|
||||||
|
{% macro raw(string) -%}
|
||||||
|
{{ "{{" }} {{ string }} {{ "}}" }}
|
||||||
|
{%- endmacro %}
|
||||||
|
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: postgres.rules
|
||||||
|
rules:
|
||||||
|
- alert: PostgresqlDown
|
||||||
|
expr: pg_up == 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Serveur PostgreSQL down
|
||||||
|
|
||||||
|
- alert: PostgresqlRestarted
|
||||||
|
expr: time() - pg_postmaster_start_time_seconds < 60
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Serveur PostgreSQL redémarré
|
||||||
|
|
||||||
|
- alert: PostgresqlExporterError
|
||||||
|
expr: pg_exporter_last_scrape_error > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Erreur dans l'exporter PostgreSQL
|
||||||
|
|
||||||
|
- alert: PostgresqlReplicationLag
|
||||||
|
expr:
|
||||||
|
pg_replication_lag > 30
|
||||||
|
and
|
||||||
|
ON(instance) pg_replication_is_replica == 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
La réplication PostgreSQL a du retard ({{ raw('$value') }} > 30s)
|
||||||
|
(base de données {{ raw('$labels.datname') }} )
|
||||||
|
|
||||||
|
- alert: PostgresqlTableNotVacuumed
|
||||||
|
expr:
|
||||||
|
time() - pg_stat_user_tables_last_autovacuum
|
||||||
|
> 60 * 60 * 24
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le démon autovacuum n'a pas été lancé depuis 24h
|
||||||
|
(base de données {{ raw('$labels.datname') }} )
|
||||||
|
|
||||||
|
- alert: PostgresqlTableNotAnalyzed
|
||||||
|
expr:
|
||||||
|
time() - pg_stat_user_tables_last_autoanalyze
|
||||||
|
> 60 * 60 * 24
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Table non-analysée depuis 24h
|
||||||
|
(base de données {{ raw('$labels.datname') }})
|
||||||
|
|
||||||
|
- alert: PostgresqlTooManyConnections
|
||||||
|
expr:
|
||||||
|
(
|
||||||
|
sum by (datname)
|
||||||
|
(pg_stat_activity_count{datname!~"template.*|postgres"})
|
||||||
|
) * 100
|
||||||
|
> pg_settings_max_connections * 80
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
PostgreSQL a trop de connexions
|
||||||
|
({{ raw('$value | printf "%.1f"') }} > 80%)
|
||||||
|
(base de données {{ raw('$labels.datname') }})
|
||||||
|
|
||||||
|
- alert: PostgresqlDeadLocks
|
||||||
|
expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
PostgreSQL a plus de 5 deadlocks.
|
||||||
|
(base de données {{ raw('$labels.datname') }} )
|
||||||
|
|
||||||
|
- alert: PostgresqlSlowQueries
|
||||||
|
expr: pg_slow_queries > 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Présence de requêtes lentes (slow-queries)
|
||||||
|
(base de données {{ raw('$labels.datname') }} )
|
||||||
|
|
||||||
|
- alert: PostgresqlHighRollbackRate
|
||||||
|
expr:
|
||||||
|
(
|
||||||
|
rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) /
|
||||||
|
rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
|
||||||
|
) * 100
|
||||||
|
> 7
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
PostgreSQL a un taux de retour en arrière (rollback) élevé
|
||||||
|
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} %)
|
||||||
|
|
||||||
|
- alert: PostgresqlWaleReplicationStopped
|
||||||
|
expr: rate(pg_xlog_position_bytes[1m]) == 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Réplication de PostgreSQL WALE stoppée
|
||||||
|
(base de données {{ raw('$labels.datname') }} )
|
||||||
|
|
||||||
|
- alert: PostgresqlHighRateStatementTimeout
|
||||||
|
expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Beaucoup de requêtes PostgreSQL sont timeout
|
||||||
|
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
|
||||||
|
|
||||||
|
- alert: PostgresqlHighRateDeadlock
|
||||||
|
expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
PostgreSQL a un fort taux de deadlock
|
||||||
|
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
|
||||||
|
|
||||||
|
# - alert: PostgresqlReplicationLagBytes
|
||||||
|
# expr:
|
||||||
|
# (pg_xlog_position_bytes and pg_replication_is_replica == 0)
|
||||||
|
# - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1)
|
||||||
|
# > 1e+09
|
||||||
|
# for: 0m
|
||||||
|
# labels:
|
||||||
|
# severity: critical
|
||||||
|
# annotations:
|
||||||
|
# summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} )
|
||||||
|
|
||||||
|
- alert: PostgresqlTooManyDeadTuples
|
||||||
|
expr:
|
||||||
|
(
|
||||||
|
(pg_stat_user_tables_n_dead_tup > 10000)
|
||||||
|
/ (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
|
||||||
|
) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Les tuples morts PostgreSQL sont trop volumineux
|
||||||
|
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
|
||||||
|
|
||||||
|
- alert: PostgresqlSplitBrain
|
||||||
|
expr: count(pg_replication_is_replica == 0) != 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Split Brain : trop de bases de données PostgreSQL primaires en mode lecture-écriture
|
||||||
|
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }} )
|
||||||
|
|
||||||
|
- alert: PostgresqlPromotedNode
|
||||||
|
expr:
|
||||||
|
pg_replication_is_replica
|
||||||
|
and
|
||||||
|
changes(pg_replication_is_replica[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Le serveur de secours PostgreSQL a été promu comme nœud principal
|
||||||
|
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }})
|
||||||
|
|
||||||
|
- alert: PostgresqlTooManyLocksAcquired
|
||||||
|
expr:
|
||||||
|
(
|
||||||
|
(sum (pg_locks_count))
|
||||||
|
/ (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
|
||||||
|
) * 100 > 20
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Trop de verrous acquis sur la base de données.
|
||||||
|
Si cette alerte se produit fréquemment, nous devrons peut-être augmenter
|
||||||
|
le paramètre postgres max_locks_per_transaction
|
||||||
|
(Valeur = {{ raw('$value | printf "%.1f"') }} )
|
||||||
|
|
||||||
|
...
|
||||||
|
|
|
@ -20,8 +20,7 @@ alerting:
|
||||||
|
|
||||||
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
|
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
|
||||||
rule_files:
|
rule_files:
|
||||||
- "alert.rules.yml" # Monitoring alerts, this is the file you may be searching!
|
- "alerts/*.yml" # Monitoring alerts, this is the file you may be searching!
|
||||||
- "django.rules.yml" # Custom rules specific for Django project monitoring
|
|
||||||
|
|
||||||
# A scrape configuration containing exactly one endpoint to scrape:
|
# A scrape configuration containing exactly one endpoint to scrape:
|
||||||
# Here it's Prometheus itself.
|
# Here it's Prometheus itself.
|
||||||
|
@ -101,4 +100,27 @@ scrape_configs:
|
||||||
file_sd_configs:
|
file_sd_configs:
|
||||||
- files:
|
- files:
|
||||||
- '/etc/prometheus/targets_docker.json'
|
- '/etc/prometheus/targets_docker.json'
|
||||||
|
relabel_configs:
|
||||||
|
# Keep the :8087 port out of the instance label; it is appended to the scrape address below
|
||||||
|
- source_labels: [__address__]
|
||||||
|
target_label: __param_target
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: instance
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: __address__
|
||||||
|
replacement: '$1:8087'
|
||||||
|
|
||||||
|
- job_name: postgresql
|
||||||
|
file_sd_configs:
|
||||||
|
- files:
|
||||||
|
- '/etc/prometheus/targets_postgres.json'
|
||||||
|
relabel_configs:
|
||||||
|
# Keep the :9187 port out of the instance label; it is appended to the scrape address below
|
||||||
|
- source_labels: [__address__]
|
||||||
|
target_label: __param_target
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: instance
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: __address__
|
||||||
|
replacement: '$1:9187'
|
||||||
...
|
...
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
|
|
||||||
groups:
|
groups:
|
||||||
|
|
||||||
- name: alert.rules
|
- name: server.rules
|
||||||
rules:
|
rules:
|
||||||
|
|
||||||
- alert: InstanceDown
|
- alert: InstanceDown
|
||||||
|
@ -149,78 +149,5 @@ groups:
|
||||||
summary: >
|
summary: >
|
||||||
Charge à {{ raw('$value') }}
|
Charge à {{ raw('$value') }}
|
||||||
|
|
||||||
- alert: UpsOutputSourceChanged
|
|
||||||
expr: upsOutputSource != 3
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
Source d'alimentation changée
|
|
||||||
|
|
||||||
- alert: UpsBatteryStatus
|
|
||||||
expr: upsBatteryStatus == 3
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
État de la batterie faible
|
|
||||||
|
|
||||||
- alert: UpsBatteryStatus
|
|
||||||
expr: upsBatteryStatus == 4
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
État de la batterie critique
|
|
||||||
|
|
||||||
- alert: UpsHighLoad
|
|
||||||
expr: upsOutputPercentLoad > 70
|
|
||||||
for: 3m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
Charge de {{ raw('$value | printf "%.1f"') }}%
|
|
||||||
|
|
||||||
- alert: UpsWrongInputVoltage
|
|
||||||
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
Tension d'entrée de {{ raw('$value') }}V
|
|
||||||
|
|
||||||
- alert: UpsWrongOutputVoltage
|
|
||||||
expr: >-
|
|
||||||
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
|
|
||||||
< 3 * stddev_over_time(upsOutputVoltage[1d])
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
Tension de sortie de {{ raw('$value') }}V
|
|
||||||
|
|
||||||
- alert: UpsTimeRemaining
|
|
||||||
expr: upsEstimatedMinutesRemaining < 8
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
Autonomie restante de {{ raw('$value') }} min
|
|
||||||
|
|
||||||
- alert: UpsTimeRemaining
|
|
||||||
expr: upsEstimatedMinutesRemaining < 5
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: >-
|
|
||||||
Autonomie restante de {{ raw('$value') }} min
|
|
||||||
|
|
||||||
...
|
...
|
87
roles/prometheus/templates/ups.rules.yml.j2
Normal file
87
roles/prometheus/templates/ups.rules.yml.j2
Normal file
|
@ -0,0 +1,87 @@
|
||||||
|
---
|
||||||
|
{{ ansible_managed | comment }}
|
||||||
|
|
||||||
|
{% macro raw(string) -%}
|
||||||
|
{{ "{{" }} {{ string }} {{ "}}" }}
|
||||||
|
{%- endmacro %}
|
||||||
|
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: ups.rules
|
||||||
|
rules:
|
||||||
|
|
||||||
|
- alert: UpsOutputSourceChanged
|
||||||
|
expr: upsOutputSource != 3
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Source d'alimentation changée
|
||||||
|
|
||||||
|
- alert: UpsBatteryStatus
|
||||||
|
expr: upsBatteryStatus == 3
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
État de la batterie faible
|
||||||
|
|
||||||
|
- alert: UpsBatteryStatus
|
||||||
|
expr: upsBatteryStatus == 4
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
État de la batterie critique
|
||||||
|
|
||||||
|
- alert: UpsHighLoad
|
||||||
|
expr: upsOutputPercentLoad > 70
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Charge de {{ raw('$value | printf "%.1f"') }}%
|
||||||
|
|
||||||
|
- alert: UpsWrongInputVoltage
|
||||||
|
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Tension d'entrée de {{ raw('$value') }}V
|
||||||
|
|
||||||
|
- alert: UpsWrongOutputVoltage
|
||||||
|
expr: >-
|
||||||
|
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
|
||||||
|
< 3 * stddev_over_time(upsOutputVoltage[1d])
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Tension de sortie de {{ raw('$value') }}V
|
||||||
|
|
||||||
|
- alert: UpsTimeRemaining
|
||||||
|
expr: upsEstimatedMinutesRemaining < 8
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Autonomie restante de {{ raw('$value') }} min
|
||||||
|
|
||||||
|
- alert: UpsTimeRemaining
|
||||||
|
expr: upsEstimatedMinutesRemaining < 5
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: >-
|
||||||
|
Autonomie restante de {{ raw('$value') }} min
|
||||||
|
|
||||||
|
...
|
|
@ -31,6 +31,7 @@ scrape_configs:
|
||||||
params:
|
params:
|
||||||
match[]:
|
match[]:
|
||||||
- '{job="servers"}'
|
- '{job="servers"}'
|
||||||
|
- '{job="postgresql"}'
|
||||||
- '{job="prometheus"}'
|
- '{job="prometheus"}'
|
||||||
- '{job="unifi_snmp"}'
|
- '{job="unifi_snmp"}'
|
||||||
- '{job="django"}'
|
- '{job="django"}'
|
||||||
|
|
5
roles/prometheus_postgres/handlers/main.yml
Normal file
5
roles/prometheus_postgres/handlers/main.yml
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
---
|
||||||
|
- name: Restart prometheus-postgres-exporter
|
||||||
|
service:
|
||||||
|
name: prometheus-postgres-exporter
|
||||||
|
state: restarted
|
39
roles/prometheus_postgres/tasks/main.yml
Normal file
39
roles/prometheus_postgres/tasks/main.yml
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
---
|
||||||
|
- name: Install Prometheus postgres-exporter
|
||||||
|
apt:
|
||||||
|
update_cache: true
|
||||||
|
name: prometheus-postgres-exporter
|
||||||
|
register: apt_result
|
||||||
|
retries: 3
|
||||||
|
until: apt_result is succeeded
|
||||||
|
|
||||||
|
- name: Make Prometheus postgres-exporter connect to databases using peercred
|
||||||
|
lineinfile:
|
||||||
|
path: /etc/default/prometheus-postgres-exporter
|
||||||
|
regexp: '^DATA_SOURCE_NAME='
|
||||||
|
line: |
|
||||||
|
DATA_SOURCE_NAME="user=postgres host=/var/run/postgresql/ sslmode=disable"
|
||||||
|
notify: Restart prometheus-postgres-exporter
|
||||||
|
|
||||||
|
- name: Make Prometheus postgres-exporter launched by postgres user
|
||||||
|
lineinfile:
|
||||||
|
path: /lib/systemd/system/prometheus-postgres-exporter.service
|
||||||
|
regexp: '^User='
|
||||||
|
line: |
|
||||||
|
User=postgres
|
||||||
|
notify: Restart prometheus-postgres-exporter
|
||||||
|
|
||||||
|
- name: Make Prometheus postgres-exporter listen on adm only
|
||||||
|
lineinfile:
|
||||||
|
path: /etc/default/prometheus-postgres-exporter
|
||||||
|
regexp: '^ARGS='
|
||||||
|
line: |
|
||||||
|
ARGS="--web.listen-address={{ ansible_hostname }}.adm.auro.re:9187"
|
||||||
|
notify: Restart prometheus-postgres-exporter
|
||||||
|
|
||||||
|
- name: Activate prometheus-postgres-exporter service
|
||||||
|
systemd:
|
||||||
|
name: prometheus-postgres-exporter
|
||||||
|
enabled: true
|
||||||
|
daemon_reload: true
|
||||||
|
state: started
|
|
@ -129,7 +129,7 @@
|
||||||
name:
|
name:
|
||||||
- postgresql
|
- postgresql
|
||||||
- postgresql-client-11=11.7-0+deb10u1
|
- postgresql-client-11=11.7-0+deb10u1
|
||||||
force: yes
|
force: true
|
||||||
|
|
||||||
- name: Install postgresql ansible module requirement(s)
|
- name: Install postgresql ansible module requirement(s)
|
||||||
pip:
|
pip:
|
||||||
|
|
Loading…
Reference in a new issue