Merge branch 'master' into update_ansible_lint
continuous-integration/drone/push Build is passing Details
continuous-integration/drone/pr Build is passing Details

pull/54/head
otthorn 3 years ago
commit dada40e005

23
hosts

@ -549,3 +549,26 @@ proxy.adm.auro.re
bdd.adm.auro.re
bdd-ovh.adm.auro.re
re2o-db.adm.auro.re
# RADIUS hosts. This group is also targeted by the postgres exporter play
# (hosts: bdd,radius) and is part of prometheus_postgres_targets.
[radius]
radius-aurore.adm.auro.re
radius-fleming.adm.auro.re
radius-fleming-backup.adm.auro.re
radius-edc.adm.auro.re
radius-edc-backup.adm.auro.re
radius-gs.adm.auro.re
radius-gs-backup.adm.auro.re
radius-pacaterie.adm.auro.re
radius-pacaterie-backup.adm.auro.re
radius-rives.adm.auro.re
radius-rives-backup.adm.auro.re
# Prometheus server hosts.
# NOTE(review): prometheus-federate presumably aggregates the other
# instances (see the prometheus_federate role) - confirm.
[prometheus]
prometheus-ovh.adm.auro.re
prometheus-aurore.adm.auro.re
prometheus-rives.adm.auro.re
prometheus-gs.adm.auro.re
prometheus-edc.adm.auro.re
prometheus-pacaterie.adm.auro.re
prometheus-fleming.adm.auro.re
prometheus-federate.adm.auro.re

@ -99,6 +99,9 @@
# Target lists handed to the prometheus role.
# Jinja filters bind tighter than "+", so the concatenation is parenthesised:
# without it only the second group would be sorted. "sort" already returns a
# list, which makes the former "list" filter unnecessary.
prometheus_targets:
  - targets: |
      {{ (groups['aurore_pve'] + groups['aurore_vm']) | sort }}
# Hosts running the postgres exporter: database servers and RADIUS servers.
prometheus_postgres_targets:
  - targets: |
      {{ (groups['bdd'] + groups['radius']) | sort }}
prometheus_switch_snmp_targets:
- targets:
- yggdrasil.switch.auro.re
@ -128,8 +131,11 @@
# Target lists handed to the prometheus role for the OVH site.
# Jinja filters bind tighter than "+", so the concatenation is parenthesised:
# without it only the second group would be sorted.
prometheus_targets:
  - targets: |
      {{ (groups['ovh_pve'] + groups['ovh_vm']) | sort }}
# Only one postgres exporter is scraped from this site.
prometheus_postgres_targets:
  - targets:
      - bdd-ovh.adm.auro.re
prometheus_docker_targets:
- docker-ovh.adm.auro.re:8087
- docker-ovh.adm.auro.re
roles:
- prometheus
@ -151,6 +157,11 @@
roles:
- prometheus_federate
# Postgres Exporters
# Applies the prometheus_postgres role to every host of the bdd and radius
# inventory groups.
- hosts: bdd,radius
roles:
- prometheus_postgres
# Monitor all hosts
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container

@ -18,8 +18,29 @@
mode: u=r,g=r,o=
loop:
- prometheus.yml
- alert.rules.yml
notify: Restart Prometheus
# The rule files are rendered into /etc/prometheus/alerts by the next task,
# so the directory has to exist first.
- name: Creates directory for alerts
  file:
    path: /etc/prometheus/alerts
    state: directory
    owner: prometheus
    group: prometheus
    # Quoted so the YAML parser keeps the octal string instead of turning it
    # into an integer.
    mode: "0755"
# Renders one rule template per topic into /etc/prometheus/alerts/ and
# notifies the "Restart Prometheus" handler when any of them changes.
- name: Configure Prometheus alerts
template:
src: "{{ item }}.j2"
dest: "/etc/prometheus/alerts/{{ item }}"
owner: prometheus
group: prometheus
mode: u=r,g=r,o=
loop:
- server.rules.yml
- docker.rules.yml
- django.rules.yml
- ups.rules.yml
- postgres.rules.yml
notify: Restart Prometheus
- name: Make Prometheus snmp-exporter listen on localhost only
@ -75,6 +96,13 @@
mode: 0644
when: prometheus_docker_targets is defined
# Writes the file_sd target list read by the "postgresql" scrape job.
# Skipped on Prometheus hosts that define no postgres targets.
- name: Configure Prometheus postgres monitoring
  copy:
    content: "{{ prometheus_postgres_targets | to_nice_json }}\n"
    dest: /etc/prometheus/targets_postgres.json
    # Quoted so the YAML parser keeps the octal string instead of turning it
    # into an integer.
    mode: "0644"
  when: prometheus_postgres_targets is defined
- name: Activate prometheus service
systemd:
name: prometheus

@ -0,0 +1,50 @@
---
{{ ansible_managed | comment }}
# raw() wraps its argument in literal double curly braces, so the expression
# is left for Prometheus to expand instead of being evaluated by Jinja while
# this template is rendered.
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
  - name: docker.rules
    rules:

      # Fires as soon as docker_container_running_state is not 1.
      - alert: ContainerDown
        expr: docker_container_running_state != 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            Le container Docker est éteint / tombé
            (container {{ raw('$labels.name') }})

      # More than 2 restarts within 5 minutes. The sum is grouped by
      # container name: a bare sum() drops every label and would leave
      # $labels.name empty in the summary.
      - alert: ContainerFailed
        expr: sum by (name) (increase(docker_container_restart_count[5m])) > 2
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            Le container Docker redémarre souvent
            (container {{ raw('$labels.name') }})

      # CPU usage above 30 % of the container's capacity. Given its own name:
      # it used to reuse "ContainerFailed" with identical labels, which made
      # the two rules indistinguishable.
      - alert: ContainerHighCpuUsage
        expr: >-
          (
          docker_container_cpu_used_total
          /
          docker_container_cpu_capacity_total
          ) * 100
          > 30
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: >-
            Le container Docker utilise beaucoup de CPU
            (container {{ raw('$labels.name') }},
            valeur {{ raw('$value | printf "%.1f"') }})
...

@ -0,0 +1,219 @@
---
{{ ansible_managed | comment }}
# raw() wraps its argument in literal double curly braces, so the expression
# is left for Prometheus to expand instead of being evaluated by Jinja while
# this template is rendered.
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: postgres.rules
rules:
# Fires as soon as pg_up reports 0.
- alert: PostgresqlDown
expr: pg_up == 0
for: 0m
labels:
severity: critical
annotations:
summary: Serveur PostgreSQL down
# Fires while the postmaster start time is less than 60 seconds old.
- alert: PostgresqlRestarted
expr: time() - pg_postmaster_start_time_seconds < 60
for: 0m
labels:
severity: critical
annotations:
summary: Serveur PostgreSQL redémarré
# Fires when pg_exporter_last_scrape_error is greater than 0.
- alert: PostgresqlExporterError
expr: pg_exporter_last_scrape_error > 0
for: 0m
labels:
severity: critical
annotations:
summary: Erreur dans l'exporter PostgreSQL
# Fires when pg_replication_lag exceeds 30 on an instance whose
# pg_replication_is_replica is 1.
- alert: PostgresqlReplicationLag
expr:
pg_replication_lag > 30
and
ON(instance) pg_replication_is_replica == 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
La réplication PostgreSQL lag ({{ raw('$value') }} > 30s)
(base de données {{ raw('$labels.datname') }} )
# Fires when the last autovacuum timestamp is more than 24 hours old.
# NOTE(review): the alert name misspells "Vacuumed"; it is left as is because
# alert names may be referenced elsewhere (routes, silences) - confirm before
# renaming.
- alert: PostgresqlTableNotVaccumed
expr:
time() - pg_stat_user_tables_last_autovacuum
> 60 * 60 * 24
for: 0m
labels:
severity: warning
annotations:
summary: >-
Le démon autovacuum n'a pas été lancé depuis 24h
(base de données {{ raw('$labels.datname') }} )
# Fires when the last autoanalyze timestamp is more than 24 hours old.
- alert: PostgresqlTableNotAnalyzed
expr:
time() - pg_stat_user_tables_last_autoanalyze
> 60 * 60 * 24
for: 0m
labels:
severity: warning
annotations:
summary: >-
Table non-analysée depuis 24h
(base de données {{ raw('$labels.datname') }})
# Share of max_connections used by each database of an instance, held above
# 80 % for 2 minutes.
# The aggregation keeps the instance label so the division can be matched
# against pg_settings_max_connections: a sum grouped by datname alone shares
# no label set with that metric, so the former comparison never returned a
# sample. The value is now a real percentage, as the summary states.
# NOTE(review): assumes one pg_settings_max_connections series per instance;
# max by (instance) guards against duplicates.
- alert: PostgresqlTooManyConnections
  expr: >-
    sum by (instance, datname)
    (pg_stat_activity_count{datname!~"template.*|postgres"})
    / on (instance) group_left ()
    max by (instance) (pg_settings_max_connections)
    * 100
    > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: >-
      PostgreSQL a trop de connexions
      ({{ raw('$value | printf "%.1f"') }} > 80%)
      (base de données {{ raw('$labels.datname') }})
# Fires when more than 5 deadlocks were counted within one minute on a
# database other than the template and postgres databases.
- alert: PostgresqlDeadLocks
expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
for: 0m
labels:
severity: warning
annotations:
summary: >-
PostgreSQL a plus de 5 deadlocks.
(base de données {{ raw('$labels.datname') }} )
# Fires when pg_slow_queries stays above 0 for 2 minutes.
- alert: PostgresqlSlowQueries
expr: pg_slow_queries > 0
for: 2m
labels:
severity: warning
annotations:
summary: >-
Présence de requêtes lentes (slow-queries)
(base de données {{ raw('$labels.datname') }} )
# Fires when rollbacks exceed 7 % of commits, both taken as 3-minute rates.
- alert: PostgresqlHighRollbackRate
expr:
(
rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) /
rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
) * 100
> 7
for: 0m
labels:
severity: warning
annotations:
summary: >-
PostgreSQL a un taux de retour en arrière (rollback) élevé
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} %)
# Fires when pg_xlog_position_bytes has not moved over the last minute.
- alert: PostgresqlWaleReplicationStopped
expr: rate(pg_xlog_position_bytes[1m]) == 0
for: 0m
labels:
severity: critical
annotations:
summary: >-
Réplication de PostgreSQL WALE stoppée
(base de données {{ raw('$labels.datname') }} )
# Fires when statement_timeout errors exceed a rate of 3 per second over one
# minute.
- alert: PostgresqlHighRateStatementTimeout
expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
for: 0m
labels:
severity: critical
annotations:
summary: >-
Beaucoup de requêtes PostgreSQL sont timeout
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
# Fires when more than one deadlock_detected error was counted within one
# minute.
- alert: PostgresqlHighRateDeadlock
expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
PostgreSQL a un fort taux de deadlock
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
# Disabled rule, kept for reference.
# NOTE(review): Jinja does not know YAML comments, so the raw() calls in the
# commented-out summary below are still evaluated when the template renders.
# - alert: PostgresqlReplicationLagBytes
# expr:
# (pg_xlog_position_bytes and pg_replication_is_replica == 0)
# - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1)
# > 1e+09
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} )
# Fires for tables holding more than 10000 dead tuples when those make up at
# least 10 % of all tuples, except on instances flagged as replicas.
- alert: PostgresqlTooManyDeadTuples
expr:
(
(pg_stat_user_tables_n_dead_tup > 10000)
/ (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
for: 2m
labels:
severity: warning
annotations:
summary: >-
Les tuples morts PostgreSQL sont trop volumineux
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value | printf "%.1f"') }} )
# Fires when the number of series with pg_replication_is_replica == 0 is not
# exactly 1.
# NOTE(review): count() drops every label, so $labels.datname renders empty,
# and the count spans all scraped instances - confirm that a single primary
# overall is the intended invariant.
- alert: PostgresqlSplitBrain
expr: count(pg_replication_is_replica == 0) != 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
Split Brain : trop de bases de données PostgreSQL primaires en mode lecture-écriture
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }} )
# Fires when pg_replication_is_replica changed value within the last minute.
- alert: PostgresqlPromotedNode
expr:
pg_replication_is_replica
and
changes(pg_replication_is_replica[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: >-
Le serveur de secours PostgreSQL a été promu comme nœud principal
(base de données {{ raw('$labels.datname') }}, valeur {{ raw('$value') }})
# Locks held on an instance, as a percentage of the lock table capacity
# (max_locks_per_transaction * max_connections), above 20 % for 2 minutes.
# The sum keeps the instance label and the division is matched on it: a bare
# sum() carries no labels and could never be matched against the labelled
# settings metrics, so the former expression returned nothing.
# The summary now speaks of locks ("verrous") rather than deadlocks.
# NOTE(review): assumes one settings series per instance - confirm.
- alert: PostgresqlTooManyLocksAcquired
  expr: >-
    (
    sum by (instance) (pg_locks_count)
    / on (instance)
    (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
    ) * 100 > 20
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: >-
      Trop de verrous acquis sur la base de données.
      Si cette alerte se produit fréquemment, nous devrons peut-être augmenter
      le paramètre postgres max_locks_per_transaction
      (Valeur = {{ raw('$value | printf "%.1f"') }} )
...

@ -20,8 +20,7 @@ alerting:
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "alert.rules.yml" # Monitoring alerts, this is the file you may be searching!
- "django.rules.yml" # Custom rules specific for Django project monitoring
- "alerts/*.yml" # All alert rules, one file per topic; this is the directory you may be looking for!
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
@ -101,4 +100,27 @@ scrape_configs:
file_sd_configs:
- files:
- '/etc/prometheus/targets_docker.json'
relabel_configs:
# Keep :8087 out of the instance label; the port is appended to the scrape address here instead
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:8087'
# Scrapes the postgres exporters listed in the file_sd target file below.
- job_name: postgresql
file_sd_configs:
- files:
- '/etc/prometheus/targets_postgres.json'
relabel_configs:
# Targets are listed without a port so that the instance label stays a bare
# host name; port 9187 is appended to the scrape address by the last rule.
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:9187'
...

@ -7,7 +7,7 @@
groups:
- name: alert.rules
- name: server.rules
rules:
- alert: InstanceDown
@ -149,78 +149,5 @@ groups:
summary: >
Charge à {{ raw('$value') }}
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 0m
labels:
severity: critical
annotations:
summary: >-
Source d'alimentation changée
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 3
for: 0m
labels:
severity: warning
annotations:
summary: >-
État de la batterie faible
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 4
for: 0m
labels:
severity: critical
annotations:
summary: >-
État de la batterie critique
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 3m
labels:
severity: critical
annotations:
summary: >-
Charge de {{ raw('$value | printf "%.1f"') }}%
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension d'entrée de {{ raw('$value') }}V
# Output voltage more than three standard deviations away from its mean over
# the last day, for 5 minutes.
# The former expression compared the input voltage and used "<", so it fired
# whenever the value was within the normal band. The leading
# "upsOutputVoltage and" keeps the measured voltage as the alert value,
# which is what the summary prints.
- alert: UpsWrongOutputVoltage
  expr: >-
    upsOutputVoltage
    and
    abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d]))
    > 3 * stddev_over_time(upsOutputVoltage[1d])
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      Tension de sortie de {{ raw('$value') }}V
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 8
for: 0m
labels:
severity: warning
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 5
for: 0m
labels:
severity: critical
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
...

@ -0,0 +1,87 @@
---
{{ ansible_managed | comment }}
# raw() wraps its argument in literal double curly braces, so the expression
# is left for Prometheus to expand instead of being evaluated by Jinja while
# this template is rendered.
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: ups.rules
rules:
# Fires as soon as upsOutputSource differs from 3.
# NOTE(review): 3 is presumably the "normal" source in the UPS MIB - confirm.
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 0m
labels:
severity: critical
annotations:
summary: >-
Source d'alimentation changée
# Warning when upsBatteryStatus equals 3 (reported as a low battery).
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 3
for: 0m
labels:
severity: warning
annotations:
summary: >-
État de la batterie faible
# Critical when upsBatteryStatus equals 4 (reported as a critical battery).
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 4
for: 0m
labels:
severity: critical
annotations:
summary: >-
État de la batterie critique
# Fires when the output load stays above 70 % for 3 minutes.
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 3m
labels:
severity: critical
annotations:
summary: >-
Charge de {{ raw('$value | printf "%.1f"') }}%
# Fires when the input voltage stays outside the 210-250 range for 5 minutes.
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension d'entrée de {{ raw('$value') }}V
# Output voltage more than three standard deviations away from its mean over
# the last day, for 5 minutes.
# The former expression compared the input voltage and used "<", so it fired
# whenever the value was within the normal band. The leading
# "upsOutputVoltage and" keeps the measured voltage as the alert value,
# which is what the summary prints.
- alert: UpsWrongOutputVoltage
  expr: >-
    upsOutputVoltage
    and
    abs(upsOutputVoltage - avg_over_time(upsOutputVoltage[1d]))
    > 3 * stddev_over_time(upsOutputVoltage[1d])
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: >-
      Tension de sortie de {{ raw('$value') }}V
# Warning when fewer than 8 minutes of estimated runtime remain.
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 8
for: 0m
labels:
severity: warning
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
# Critical when fewer than 5 minutes of estimated runtime remain.
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 5
for: 0m
labels:
severity: critical
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
...

@ -31,6 +31,7 @@ scrape_configs:
params:
match[]:
- '{job="servers"}'
- '{job="postgresql"}'
- '{job="prometheus"}'
- '{job="unifi_snmp"}'
- '{job="django"}'

@ -0,0 +1,5 @@
---
# Handler notified by the tasks that rewrite the exporter's configuration.
- name: Restart prometheus-postgres-exporter
  service:
    state: restarted
    name: prometheus-postgres-exporter

@ -0,0 +1,39 @@
---
# Installs the exporter package after refreshing the apt cache; the task is
# retried up to 3 times until apt reports success.
- name: Install Prometheus postgres-exporter
  apt:
    name: prometheus-postgres-exporter
    update_cache: true
  register: apt_result
  until: apt_result is succeeded
  retries: 3
# Points the exporter at the local PostgreSQL unix socket as user postgres.
# The value is a plain one-line string: the former literal block scalar
# carried a trailing newline inside "line".
- name: Make Prometheus postgres-exporter connect to databases using peercred
  lineinfile:
    path: /etc/default/prometheus-postgres-exporter
    regexp: '^DATA_SOURCE_NAME='
    line: 'DATA_SOURCE_NAME="user=postgres host=/var/run/postgresql/ sslmode=disable"'
  notify: Restart prometheus-postgres-exporter
# Runs the exporter service as the postgres system user.
# "line" is a plain one-line string (the former literal block scalar carried
# a trailing newline). If the unit has no User= line yet, the new one is
# inserted right after the [Service] header instead of at the end of the
# file, where it could land in another section.
# NOTE(review): this edits the packaged unit under /lib/systemd/system, which
# a package upgrade may overwrite; a drop-in override would be more durable.
- name: Make Prometheus postgres-exporter launched by postgres user
  lineinfile:
    path: /lib/systemd/system/prometheus-postgres-exporter.service
    regexp: '^User='
    insertafter: '^\[Service\]'
    line: User=postgres
  notify: Restart prometheus-postgres-exporter
# Binds the exporter to the host's name in the adm.auro.re zone, port 9187.
# The value is a plain one-line string: the former literal block scalar
# carried a trailing newline inside "line".
- name: Make Prometheus postgres-exporter listen on adm only
  lineinfile:
    path: /etc/default/prometheus-postgres-exporter
    regexp: '^ARGS='
    line: 'ARGS="--web.listen-address={{ ansible_hostname }}.adm.auro.re:9187"'
  notify: Restart prometheus-postgres-exporter
# Reloads systemd unit definitions, then enables the exporter at boot and
# makes sure it is running.
- name: Activate prometheus-postgres-exporter service
  systemd:
    daemon_reload: true
    name: prometheus-postgres-exporter
    state: started
    enabled: true

@ -129,7 +129,7 @@
name:
- postgresql
- postgresql-client-11=11.7-0+deb10u1
force: yes
force: true
- name: Install postgresql ansible module requirement(s)
pip:

Loading…
Cancel
Save