prometheus-bird-role → dedicated role + various alerts

This commit is contained in:
jeltz 2023-04-02 13:25:03 +02:00
parent 922b6894a7
commit 0807dc1d70
Signed by: jeltz
GPG key ID: 800882B66C0C3326
9 changed files with 145 additions and 67 deletions

View file

@ -1,10 +1,12 @@
from ansible.parsing.yaml.objects import AnsibleUnicode
class FilterModule:
    """Ansible filter plugin exposing this module's helper functions."""

    def filters(self):
        """Return the mapping of filter names to the callables behind them."""
        registry = {}
        registry["prometheus__convert_jobs"] = convert_jobs
        registry["interp"] = interp
        registry["interp_float"] = interp_float
        return registry
@ -12,6 +14,10 @@ def interp(string):
return AnsibleUnicode(f"{{{{ {string} }}}}")
def interp_float(string):
    """Wrap ``string`` in a literal ``{{ printf "%.0f" <string> }}`` expression.

    The doubled braces in the f-string yield literal ``{{`` and ``}}``, so the
    returned text contains a template expression that formats ``string`` with
    zero decimal places.
    """
    template = f'{{{{ printf "%.0f" {string} }}}}'
    return AnsibleUnicode(template)
def convert_jobs(config):
for name, job in config.items():

View file

@ -6,10 +6,22 @@
roles:
- prometheus_node
# Routers that receive the prometheus_bird role.
- hosts:
    - edge-1.rtr.infra.auro.re
    - edge-2.rtr.infra.auro.re
    - isp-1.rtr.infra.auro.re
    - isp-2.rtr.infra.auro.re
    - infra-1.rtr.infra.auro.re
    - infra-2.rtr.infra.auro.re
  roles:
    - prometheus_bird
- hosts:
- prometheus-1.monit.infra.auro.re
- prometheus-2.monit.infra.auro.re
vars:
prometheus__alertmanager_targets:
- docker-ovh.adm.auro.re:9093
prometheus__tsdb_retention_time: 90d
prometheus__scraping:
node:
@ -18,7 +30,25 @@
| flatten }}"
address:
port: 9100
prometheus:
targets:
- prometheus-1.monit.infra.auro.re
- prometheus-2.monit.infra.auro.re
address:
port: 9090
# Scrape job for the BIRD exporter (port 9324).
# NOTE(review): only the two edge routers are listed, while the
# prometheus_bird role is applied to six routers in this playbook
# (edge-*, isp-*, infra-*) — confirm whether the other four should be
# scraped as well.
bird:
targets:
- edge-1.rtr.infra.auro.re
- edge-2.rtr.infra.auro.re
address:
port: 9324
prometheus__alert_rules:
prometheus:
- alert: PrometheusTsdbCompactionFailed
expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
for: 0m
labels:
severity: critical
node:
- alert: MachineDown
expr: "up == 0"
@ -36,7 +66,7 @@
labels:
severity: warning
annotations:
summary: "Mémoire libre à {{ '$value' | interp }}%"
summary: "Mémoire libre à {{ '$value' | interp_float }}%"
- alert: HostSwapIsFillingUp
expr: "( 1 - ( node_memory_SwapFree_bytes
/ node_memory_SwapTotal_bytes ) )
@ -45,14 +75,14 @@
labels:
severity: critical
annotations:
summary: "Swap {{ '$value' | interp }}%"
summary: "Swap {{ '$value' | interp_float }}%"
- alert: HostPhysicalComponentTooHot
expr: "node_hwmon_temp_celsius > 79"
for: 3m
labels:
severity: critical
annotations:
summary: "{{ '$value' | interp }}°C :
summary: "{{ '$value' | interp_float }}°C :
{{ '$labels.chip' | interp }},
{{ '$labels.sensor' | interp }}"
- alert: HostNodeOvertemperatureAlarm
@ -63,6 +93,20 @@
annotations:
summary: "{{ '$labels.chip' | interp }},
{{ '$labels.sensor' | interp }}"
- alert: HostRaidArrayGotInactive
expr: 'node_md_state{state="inactive"} > 0'
for: 0m
labels:
severity: critical
annotations:
summary: "{{ '$labels.device' | interp }}"
# Fires as soon as an md array reports at least one failed member disk.
- alert: HostRaidDiskFailure
  expr: 'node_md_disks{state="failed"} > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    # The annotation key is `summary`, as on every other rule in this
    # file; `severity` is a label, and using it here left the alert
    # without a summary.
    # The array name is read from `device`, matching the sibling
    # HostRaidArrayGotInactive rule.
    # NOTE(review): confirm node_md_disks carries `device`, not `md_device`.
    summary: "{{ '$labels.device' | interp }}"
- alert: HostOomKillDetected
expr: "increase(node_vmstat_oom_kill[1m]) > 0"
for: 0m
@ -77,15 +121,51 @@
severity: warning
annotations:
summary: "{{ '$value' | interp }} erreurs corrigées"
# Fires when the EDAC uncorrectable-error counter grew during the last minute.
- alert: HostEdacUncorrectableErrorsDetected
  expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0"
  for: 0m
  labels:
    severity: warning
  annotations:
    # The text used to read "erreurs corrigées" (corrected errors), copied
    # from the correctable-errors rule; these errors are uncorrectable.
    summary: "{{ '$value' | interp }} erreurs non corrigibles"
- alert: OutOfDiskSpace
expr: "node_filesystem_free_bytes
/ node_filesystem_size_bytes * 100 < 10"
expr: "( node_filesystem_free_bytes
/ node_filesystem_size_bytes * 100 < 10 )
and on (instance, device, mountpoint)
node_filesystem_readonly == 0"
for: 5m
labels:
severity: critical
annotations:
summary: "{{ '$labels.mountpoint' | interp }} :
{{ '$value' | interp_float }}% libre"
- alert: HostConntrackLimit
expr: "( node_nf_conntrack_entries
/ node_nf_conntrack_entries_limit ) * 100 > 80"
for: 5m
labels:
severity: warning
annotations:
summary: "{{ '$labels.mountpoint' | interp }} :
{{ '$value' | interp }}% libre"
summary: "{{ '$value' | interp_float }}% complet"
- alert: HostClockSkew
expr: "(node_timex_offset_seconds > 0.05
and deriv(node_timex_offset_seconds[5m]) >= 0)
or (node_timex_offset_seconds < -0.05
and deriv(node_timex_offset_seconds[5m]) <= 0)"
for: 2m
labels:
severity: warning
- alert: HostClockNotSynchronising
expr: "min_over_time(node_timex_sync_status[1m]) == 0
and node_timex_maxerror_seconds >= 16"
for: 2m
labels:
severity: warning
- alert: HostRequiresReboot
expr: "node_reboot_required > 0"
for: 5m
labels:
severity: warning
- alert: OutOfInodes
expr: "node_filesystem_files_free
/ node_filesystem_files * 100 < 10"
@ -94,7 +174,7 @@
severity: warning
annotations:
summary: "{{ '$labels.mountpoint' | interp }} :
{{ '$value' | interp }}% libre"
{{ '$value' | interp_float }}% libre"
- alert: CpuUsage
expr: '( 100 - avg by (instance)
( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
@ -103,7 +183,7 @@
labels:
severity: warning
annotations:
summary: "{{ '$value' | interp }}%"
summary: "{{ '$value' | interp_float }}%"
- alert: SystemdServiceFailed
expr: 'node_systemd_unit_state{state="failed"} == 1'
for: 10m
@ -117,7 +197,7 @@
labels:
severity: warning
annotations:
summary: "{{ '$value' | interp }}"
summary: "{{ '$value' | interp_float }}"
- alert: UnhealthyDisk
expr: "smartmon_device_smart_healthy < 1"
for: 10m
@ -125,6 +205,24 @@
severity: critical
annotations:
summary: "{{ '$labels.disk' | interp }}"
# Fires when the average CPU steal rate of an instance stays above 10%
# for five minutes.
- alert: HostCpuStealNoisyNeighbor
  expr: 'avg by(instance)
    (rate(node_cpu_seconds_total{mode="steal"}[5m]))
    * 100 > 10'
  for: 5m
  labels:
    severity: warning
  annotations:
    # `avg by(instance)` keeps only the `instance` label, so the previous
    # `$labels.disk` always rendered empty; show the steal percentage.
    summary: "{{ '$value' | interp_float }}%"
bird:
# Fires immediately when a BIRD protocol reports bird_protocol_up == 0.
- alert: BirdProtocolDown
  expr: 'bird_protocol_up == 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "{{ '$labels.name' | interp }} : {{ '$labels.state' | interp }}"
roles:
- prometheus
@ -332,38 +430,7 @@
# - prometheus
# - update_motd
#
#- hosts: prometheus-federate.adm.auro.re
# vars:
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
# snmp_ilo_user: aurore
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
#
# prometheus_servers_targets:
# - prometheus-edc.adm.auro.re
# - prometheus-gs.adm.auro.re
# - prometheus-fleming.adm.auro.re
# - prometheus-pacaterie.adm.auro.re
# - prometheus-rives.adm.auro.re
# - prometheus-aurore.adm.auro.re
# - prometheus-ovh.adm.auro.re
#
# update_motd:
# prometheus_federate: >-
# Prometheus (en configuration fédération) est déployé (/etc/prometheus).
# roles:
# - prometheus_federate
# - update_motd
#
## Postgres Exporters
#- hosts: bdd,radius
# roles:
# - prometheus_postgres
#
## Monitor all hosts
#- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
# roles:
# - prometheus_node

View file

@ -11,5 +11,4 @@ bird__radv_dns_servers: []
bird__radv_max_interval: 5
bird__static_unreachable: []
bird__bgp_sessions: []
bird__prometheus_listen_address: 0.0.0.0:9324
...

View file

@ -3,9 +3,4 @@
systemd:
name: bird.service
state: reloaded
- name: Restart prometheus-bird-exporter
systemd:
name: prometheus-bird-exporter.service
state: restarted
...

View file

@ -1,9 +1,7 @@
---
- name: Install bird
apt:
name:
- bird2
- prometheus-bird-exporter
name: bird2
- name: Configure bird
template:
@ -15,26 +13,9 @@
notify:
- Reload bird
- name: Configure prometheus-bird-exporter
template:
src: prometheus-bird-exporter.j2
dest: /etc/default/prometheus-bird-exporter
owner: root
group: root
mode: u=rw,g=r,o=
notify:
- Restart prometheus-bird-exporter
- name: Enable and start bird
systemd:
name: bird.service
state: started
enabled: true
- name: Enable and start prometheus-bird-exporter
systemd:
name: prometheus-bird-exporter.service
state: started
enabled: true
...

View file

@ -0,0 +1,3 @@
---
# Default listen address for prometheus-bird-exporter.
# NOTE(review): presumably consumed by the prometheus-bird-exporter.j2
# template; ':9324' has no host part — confirm the exporter binds as
# intended.
bird__prometheus_listen_address: ':9324'
...

View file

@ -0,0 +1,6 @@
---
# Runs when a task notifies "Restart prometheus-bird-exporter".
- name: Restart prometheus-bird-exporter
  systemd:
    state: restarted
    name: prometheus-bird-exporter.service
...

View file

@ -0,0 +1,21 @@
---
# Install the exporter package.
- name: Install prometheus-bird-exporter
  apt:
    name: prometheus-bird-exporter

# Render the exporter's defaults file; a change notifies the restart
# handler.
- name: Configure prometheus-bird-exporter
  template:
    dest: /etc/default/prometheus-bird-exporter
    src: prometheus-bird-exporter.j2
    mode: u=rw,g=r,o=
    owner: root
    group: root
  notify:
    - Restart prometheus-bird-exporter

# Make sure the service is running now and enabled at boot.
- name: Enable and start prometheus-bird-exporter
  systemd:
    name: prometheus-bird-exporter.service
    enabled: true
    state: started
...