Splite alerts on some files

This commit is contained in:
pz2891 2021-04-11 15:58:35 +02:00
parent dd48302585
commit 9ebdf15bb9
6 changed files with 355 additions and 77 deletions

View file

@ -18,8 +18,33 @@
mode: u=r,g=r,o= mode: u=r,g=r,o=
loop: loop:
- prometheus.yml - prometheus.yml
- alert.rules.yml notify: Restart Prometheus
- name: Creates directory for alerts
file:
path: /etc/prometheus/alerts
state: directory
- name: Remove old files
file:
path: "/etc/prometheus/{{ item }}"
state: absent
with_items:
- alerts.rules.yml
- django.rules.yml - django.rules.yml
- name: Configure Prometheus alerts
template:
src: "{{ item }}.j2"
dest: "/etc/prometheus/alerts/{{ item }}"
owner: prometheus
group: prometheus
mode: u=r,g=r,o=
loop:
- server.rules.yml
- django.rules.yml
- ups.rules.yml
- postgres.rules.yml
notify: Restart Prometheus notify: Restart Prometheus
- name: Make Prometheus snmp-exporter listen on localhost only - name: Make Prometheus snmp-exporter listen on localhost only

View file

@ -0,0 +1,219 @@
---
{{ ansible_managed | comment }}
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: postgres.rules
rules:
- alert: PostgresqlDown
expr: pg_up == 0
for: 0m
labels:
severity: critical
annotations:
summary: Serveur Postgresql down (instance {{ raw('$labels.instance') }})
- alert: PostgresqlRestarted
expr: time() - pg_postmaster_start_time_seconds < 60
for: 0m
labels:
severity: critical
annotations:
summary: Serveur Postgresql redémarré (instance {{ raw('$labels.instance') }})
- alert: PostgresqlExporterError
expr: pg_exporter_last_scrape_error > 0
for: 0m
labels:
severity: critical
annotations:
summary: Erreur dans l'exporter Postgresql (instance {{ raw('$labels.instance') }})
- alert: PostgresqlReplicationLag
expr:
pg_replication_lag > 30
and
ON(instance) pg_replication_is_replica == 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
Réplication Postgresql lag ({{ raw('$value') }} > 30s)
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname')}} )
- alert: PostgresqlTableNotVaccumed
expr:
time() - pg_stat_user_tables_last_autovacuum
> 60 * 60 * 24
for: 0m
labels:
severity: warning
annotations:
summary: >-
La table n'a pas été aspirée depuis 24h
(Instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )
- alert: PostgresqlTableNotAnalyzed
expr:
time() - pg_stat_user_tables_last_autoanalyze
> 60 * 60 * 24
for: 0m
labels:
severity: warning
annotations:
summary: >-
Table non-analysée depuis 24h
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }})
- alert: PostgresqlTooManyConnections
expr:
(
sum by (datname)
(pg_stat_activity_count{datname!~"template.*|postgres"})
) * 100
> pg_settings_max_connections * 80
for: 2m
labels:
severity: warning
annotations:
summary: >-
PostgreSQL a trop de connexions
({{ raw('$value | printf "%.1f"') }} > 80%)
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }})
- alert: PostgresqlDeadLocks
expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
for: 0m
labels:
severity: warning
annotations:
summary: >-
PostgreSQL a des cadenas morts
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )
- alert: PostgresqlSlowQueries
expr: pg_slow_queries > 0
for: 2m
labels:
severity: warning
annotations:
summary: >-
Présence de requêtes lentes (slow-queries)
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )
- alert: PostgresqlHighRollbackRate
expr:
(
rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) /
rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
) * 100
> 2
for: 0m
labels:
severity: warning
annotations:
summary: >-
Postgresql a un taux de retour en arrière (rollback) élevé
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} %)
- alert: PostgresqlWaleReplicationStopped
expr: rate(pg_xlog_position_bytes[1m]) == 0
for: 0m
labels:
severity: critical
annotations:
summary: >-
Réplication de Postgresql WALE stopée
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }} )
- alert: PostgresqlHighRateStatementTimeout
expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
for: 0m
labels:
severity: critical
annotations:
summary: >-
Beaucoup de requêtes Postgresql sont timeout
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} )
- alert: PostgresqlHighRateDeadlock
expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
Postgresql a un fort taux de deadlock
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} )
# - alert: PostgresqlReplicationLagBytes
# expr:
# (pg_xlog_position_bytes and pg_replication_is_replica == 0)
# - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1)
# > 1e+09
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: La réplication Postgresql a des octets de retard (instance {{ raw('$labels.name') }}, value {{ raw('$value') }} )
- alert: PostgresqlTooManyDeadTuples
expr:
(
(pg_stat_user_tables_n_dead_tup > 10000)
/ (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
for: 2m
labels:
severity: warning
annotations:
summary: >-
Les tuples morts PostgreSQL sont trop volumineux
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value | printf "%.1f"') }} )
- alert: PostgresqlSplitBrain
expr: count(pg_replication_is_replica == 0) != 1
for: 0m
labels:
severity: critical
annotations:
summary: >-
Split Brain, trop de bases de données Postgresql primaires en mode lecture-écriture
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value') }} )
- alert: PostgresqlPromotedNode
expr:
pg_replication_is_replica
and
changes(pg_replication_is_replica[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: >-
Le serveur de secours PostgreSQL a été promu comme nœud principal
(instance {{ raw('$labels.instance') }}, database {{ raw('$labels.datname') }}, value {{ raw('$value') }})
- alert: PostgresqlTooManyLocksAcquired
expr:
(
(sum (pg_locks_count))
/ (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
) * 100 > 20
for: 2m
labels:
severity: critical
annotations:
summary: >-
Trop de verrous acquis sur la base de données.
Si cette alerte se produit fréquemment, nous devrons peut-être augmenter
le paramètre postgres max_locks_per_transaction
(instance {{ raw('$labels.instance') }}, value = {{ raw('$value | printf "%.1f"') }} )
...

View file

@ -20,8 +20,7 @@ alerting:
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files: rule_files:
- "alert.rules.yml" # Monitoring alerts, this is the file you may be searching! - "alerts/*.yml" # Monitoring alerts, this is the file you may be searching!
- "django.rules.yml" # Custom rules specific for Django project monitoring
# A scrape configuration containing exactly one endpoint to scrape: # A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself. # Here it's Prometheus itself.

View file

@ -7,7 +7,7 @@
groups: groups:
- name: alert.rules - name: server.rules
rules: rules:
- alert: InstanceDown - alert: InstanceDown
@ -149,78 +149,5 @@ groups:
summary: > summary: >
Charge à {{ raw('$value') }} Charge à {{ raw('$value') }}
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 0m
labels:
severity: critical
annotations:
summary: >-
Source d'alimentation changée
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 3
for: 0m
labels:
severity: warning
annotations:
summary: >-
État de la batterie faible
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 4
for: 0m
labels:
severity: critical
annotations:
summary: >-
État de la batterie critique
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 3m
labels:
severity: critical
annotations:
summary: >-
Charge de {{ raw('$value | printf "%.1f"') }}%
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension d'entrée de {{ raw('$value') }}V
- alert: UpsWrongOutputVoltage
expr: >-
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
< 3 * stddev_over_time(upsOutputVoltage[1d])
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension de sortie de {{ raw('$value') }}V
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 8
for: 0m
labels:
severity: warning
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 5
for: 0m
labels:
severity: critical
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
... ...

View file

@ -0,0 +1,87 @@
---
{{ ansible_managed | comment }}
{% macro raw(string) -%}
{{ "{{" }} {{ string }} {{ "}}" }}
{%- endmacro %}
groups:
- name: ups.rules
rules:
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 0m
labels:
severity: critical
annotations:
summary: >-
Source d'alimentation changée
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 3
for: 0m
labels:
severity: warning
annotations:
summary: >-
État de la batterie faible
- alert: UpsBatteryStatus
expr: upsBatteryStatus == 4
for: 0m
labels:
severity: critical
annotations:
summary: >-
État de la batterie critique
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 3m
labels:
severity: critical
annotations:
summary: >-
Charge de {{ raw('$value | printf "%.1f"') }}%
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension d'entrée de {{ raw('$value') }}V
- alert: UpsWrongOutputVoltage
expr: >-
abs(upsInputVoltage - avg_over_time(upsOutputVoltage[1d]))
< 3 * stddev_over_time(upsOutputVoltage[1d])
for: 5m
labels:
severity: warning
annotations:
summary: >-
Tension de sortie de {{ raw('$value') }}V
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 8
for: 0m
labels:
severity: warning
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
- alert: UpsTimeRemaining
expr: upsEstimatedMinutesRemaining < 5
for: 0m
labels:
severity: critical
annotations:
summary: >-
Autonomie restante de {{ raw('$value') }} min
...

View file

@ -0,0 +1,21 @@
---
- name: Install Prometheus postgres-exporter
apt:
update_cache: true
name: prometheus-postgres-exporter
register: apt_result
retries: 3
until: apt_result is succeeded
when:
- ansible_lsb.codename != 'bullseye'
# Doesn't work on Debian Stretch with the old prometheus package
- name: Make Prometheus node-exporter listen on adm only
lineinfile:
path: /etc/default/prometheus-node-exporter
regexp: '^ARGS='
line: |
ARGS="--web.listen-address={{ ansible_hostname }}.adm.auro.re:9100"
notify: Restart prometheus-node-exporter
git push --set-upstream origin add_ups_231