prometheus-bird-role → dedicated role + various alerts
This commit is contained in:
parent
922b6894a7
commit
0807dc1d70
9 changed files with 145 additions and 67 deletions
|
@ -1,10 +1,12 @@
|
||||||
from ansible.parsing.yaml.objects import AnsibleUnicode
|
from ansible.parsing.yaml.objects import AnsibleUnicode
|
||||||
|
|
||||||
|
|
||||||
class FilterModule:
|
class FilterModule:
|
||||||
def filters(self):
|
def filters(self):
|
||||||
return {
|
return {
|
||||||
"prometheus__convert_jobs": convert_jobs,
|
"prometheus__convert_jobs": convert_jobs,
|
||||||
"interp": interp,
|
"interp": interp,
|
||||||
|
"interp_float": interp_float,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -12,6 +14,10 @@ def interp(string):
|
||||||
return AnsibleUnicode(f"{{{{ {string} }}}}")
|
return AnsibleUnicode(f"{{{{ {string} }}}}")
|
||||||
|
|
||||||
|
|
||||||
|
def interp_float(string):
|
||||||
|
return AnsibleUnicode(f'{{{{ printf "%.0f" {string} }}}}')
|
||||||
|
|
||||||
|
|
||||||
def convert_jobs(config):
|
def convert_jobs(config):
|
||||||
|
|
||||||
for name, job in config.items():
|
for name, job in config.items():
|
||||||
|
|
|
@ -6,10 +6,22 @@
|
||||||
roles:
|
roles:
|
||||||
- prometheus_node
|
- prometheus_node
|
||||||
|
|
||||||
|
- hosts:
|
||||||
|
- edge-1.rtr.infra.auro.re
|
||||||
|
- edge-2.rtr.infra.auro.re
|
||||||
|
- isp-1.rtr.infra.auro.re
|
||||||
|
- isp-2.rtr.infra.auro.re
|
||||||
|
- infra-1.rtr.infra.auro.re
|
||||||
|
- infra-2.rtr.infra.auro.re
|
||||||
|
roles:
|
||||||
|
- prometheus_bird
|
||||||
|
|
||||||
- hosts:
|
- hosts:
|
||||||
- prometheus-1.monit.infra.auro.re
|
- prometheus-1.monit.infra.auro.re
|
||||||
- prometheus-2.monit.infra.auro.re
|
- prometheus-2.monit.infra.auro.re
|
||||||
vars:
|
vars:
|
||||||
|
prometheus__alertmanager_targets:
|
||||||
|
- docker-ovh.adm.auro.re:9093
|
||||||
prometheus__tsdb_retention_time: 90d
|
prometheus__tsdb_retention_time: 90d
|
||||||
prometheus__scraping:
|
prometheus__scraping:
|
||||||
node:
|
node:
|
||||||
|
@ -18,7 +30,25 @@
|
||||||
| flatten }}"
|
| flatten }}"
|
||||||
address:
|
address:
|
||||||
port: 9100
|
port: 9100
|
||||||
|
prometheus:
|
||||||
|
targets:
|
||||||
|
- prometheus-1.monit.infra.auro.re
|
||||||
|
- prometheus-2.monit.infra.auro.re
|
||||||
|
address:
|
||||||
|
port: 9090
|
||||||
|
bird:
|
||||||
|
targets:
|
||||||
|
- edge-1.rtr.infra.auro.re
|
||||||
|
- edge-2.rtr.infra.auro.re
|
||||||
|
address:
|
||||||
|
port: 9324
|
||||||
prometheus__alert_rules:
|
prometheus__alert_rules:
|
||||||
|
prometheus:
|
||||||
|
- alert: PrometheusTsdbCompactionFailed
|
||||||
|
expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
node:
|
node:
|
||||||
- alert: MachineDown
|
- alert: MachineDown
|
||||||
expr: "up == 0"
|
expr: "up == 0"
|
||||||
|
@ -36,7 +66,7 @@
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Mémoire libre à {{ '$value' | interp }}%"
|
summary: "Mémoire libre à {{ '$value' | interp_float }}%"
|
||||||
- alert: HostSwapIsFillingUp
|
- alert: HostSwapIsFillingUp
|
||||||
expr: "( 1 - ( node_memory_SwapFree_bytes
|
expr: "( 1 - ( node_memory_SwapFree_bytes
|
||||||
/ node_memory_SwapTotal_bytes ) )
|
/ node_memory_SwapTotal_bytes ) )
|
||||||
|
@ -45,14 +75,14 @@
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Swap {{ '$value' | interp }}%"
|
summary: "Swap {{ '$value' | interp_float }}%"
|
||||||
- alert: HostPhysicalComponentTooHot
|
- alert: HostPhysicalComponentTooHot
|
||||||
expr: "node_hwmon_temp_celsius > 79"
|
expr: "node_hwmon_temp_celsius > 79"
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$value' | interp }}°C :
|
summary: "{{ '$value' | interp_float }}°C :
|
||||||
{{ '$labels.chip' | interp }},
|
{{ '$labels.chip' | interp }},
|
||||||
{{ '$labels.sensor' | interp }}"
|
{{ '$labels.sensor' | interp }}"
|
||||||
- alert: HostNodeOvertemperatureAlarm
|
- alert: HostNodeOvertemperatureAlarm
|
||||||
|
@ -63,6 +93,20 @@
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.chip' | interp }},
|
summary: "{{ '$labels.chip' | interp }},
|
||||||
{{ '$labels.sensor' | interp }}"
|
{{ '$labels.sensor' | interp }}"
|
||||||
|
- alert: HostRaidArrayGotInactive
|
||||||
|
expr: 'node_md_state{state="inactive"} > 0'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "{{ '$labels.device' | interp }}"
|
||||||
|
- alert: HostRaidDiskFailure
|
||||||
|
expr: 'node_md_disks{state="failed"} > 0'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "{{ '$labels.device' | interp }}"
|
||||||
- alert: HostOomKillDetected
|
- alert: HostOomKillDetected
|
||||||
expr: "increase(node_vmstat_oom_kill[1m]) > 0"
|
expr: "increase(node_vmstat_oom_kill[1m]) > 0"
|
||||||
for: 0m
|
for: 0m
|
||||||
|
@ -77,15 +121,51 @@
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$value' | interp }} erreurs corrigées"
|
summary: "{{ '$value' | interp }} erreurs corrigées"
|
||||||
|
- alert: HostEdacUncorrectableErrorsDetected
|
||||||
|
expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0"
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "{{ '$value' | interp }} erreurs non corrigibles"
|
||||||
- alert: OutOfDiskSpace
|
- alert: OutOfDiskSpace
|
||||||
expr: "node_filesystem_free_bytes
|
expr: "( node_filesystem_free_bytes
|
||||||
/ node_filesystem_size_bytes * 100 < 10"
|
/ node_filesystem_size_bytes * 100 < 10 )
|
||||||
|
and on (instance, device, mountpoint)
|
||||||
|
node_filesystem_readonly == 0"
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "{{ '$labels.mountpoint' | interp }} :
|
||||||
|
{{ '$value' | interp_float }}% libre"
|
||||||
|
- alert: HostConntrackLimit
|
||||||
|
expr: "( node_nf_conntrack_entries
|
||||||
|
/ node_nf_conntrack_entries_limit ) * 100 > 80"
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.mountpoint' | interp }} :
|
summary: "{{ '$value' | interp_float }}% complet"
|
||||||
{{ '$value' | interp }}% libre"
|
- alert: HostClockSkew
|
||||||
|
expr: "(node_timex_offset_seconds > 0.05
|
||||||
|
and deriv(node_timex_offset_seconds[5m]) >= 0)
|
||||||
|
or (node_timex_offset_seconds < -0.05
|
||||||
|
and deriv(node_timex_offset_seconds[5m]) <= 0)"
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: HostClockNotSynchronising
|
||||||
|
expr: "min_over_time(node_timex_sync_status[1m]) == 0
|
||||||
|
and node_timex_maxerror_seconds >= 16"
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: HostRequiresReboot
|
||||||
|
expr: "node_reboot_required > 0"
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
- alert: OutOfInodes
|
- alert: OutOfInodes
|
||||||
expr: "node_filesystem_files_free
|
expr: "node_filesystem_files_free
|
||||||
/ node_filesystem_files * 100 < 10"
|
/ node_filesystem_files * 100 < 10"
|
||||||
|
@ -94,7 +174,7 @@
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.mountpoint' | interp }} :
|
summary: "{{ '$labels.mountpoint' | interp }} :
|
||||||
{{ '$value' | interp }}% libre"
|
{{ '$value' | interp_float }}% libre"
|
||||||
- alert: CpuUsage
|
- alert: CpuUsage
|
||||||
expr: '( 100 - avg by (instance)
|
expr: '( 100 - avg by (instance)
|
||||||
( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
|
( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
|
||||||
|
@ -103,7 +183,7 @@
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$value' | interp }}%"
|
summary: "{{ '$value' | interp_float }}%"
|
||||||
- alert: SystemdServiceFailed
|
- alert: SystemdServiceFailed
|
||||||
expr: 'node_systemd_unit_state{state="failed"} == 1'
|
expr: 'node_systemd_unit_state{state="failed"} == 1'
|
||||||
for: 10m
|
for: 10m
|
||||||
|
@ -117,7 +197,7 @@
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$value' | interp }}"
|
summary: "{{ '$value' | interp_float }}"
|
||||||
- alert: UnhealthyDisk
|
- alert: UnhealthyDisk
|
||||||
expr: "smartmon_device_smart_healthy < 1"
|
expr: "smartmon_device_smart_healthy < 1"
|
||||||
for: 10m
|
for: 10m
|
||||||
|
@ -125,6 +205,24 @@
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.disk' | interp }}"
|
summary: "{{ '$labels.disk' | interp }}"
|
||||||
|
- alert: HostCpuStealNoisyNeighbor
|
||||||
|
expr: 'avg by(instance)
|
||||||
|
(rate(node_cpu_seconds_total{mode="steal"}[5m]))
|
||||||
|
* 100 > 10'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "{{ '$value' | interp_float }}%"
|
||||||
|
bird:
|
||||||
|
- alert: BirdProtocolDown
|
||||||
|
expr: "bird_protocol_up == 0"
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "{{ '$labels.name' | interp }} :
|
||||||
|
{{ '$labels.state' | interp }}"
|
||||||
roles:
|
roles:
|
||||||
- prometheus
|
- prometheus
|
||||||
|
|
||||||
|
@ -332,38 +430,7 @@
|
||||||
# - prometheus
|
# - prometheus
|
||||||
# - update_motd
|
# - update_motd
|
||||||
#
|
#
|
||||||
#- hosts: prometheus-federate.adm.auro.re
|
|
||||||
# vars:
|
|
||||||
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
|
||||||
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
|
||||||
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
|
||||||
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
|
||||||
# snmp_ilo_user: aurore
|
|
||||||
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
|
||||||
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
|
||||||
#
|
|
||||||
# prometheus_servers_targets:
|
|
||||||
# - prometheus-edc.adm.auro.re
|
|
||||||
# - prometheus-gs.adm.auro.re
|
|
||||||
# - prometheus-fleming.adm.auro.re
|
|
||||||
# - prometheus-pacaterie.adm.auro.re
|
|
||||||
# - prometheus-rives.adm.auro.re
|
|
||||||
# - prometheus-aurore.adm.auro.re
|
|
||||||
# - prometheus-ovh.adm.auro.re
|
|
||||||
#
|
|
||||||
# update_motd:
|
|
||||||
# prometheus_federate: >-
|
|
||||||
# Prometheus (en configuration fédération) est déployé (/etc/prometheus).
|
|
||||||
# roles:
|
|
||||||
# - prometheus_federate
|
|
||||||
# - update_motd
|
|
||||||
#
|
|
||||||
## Postgres Exporters
|
## Postgres Exporters
|
||||||
#- hosts: bdd,radius
|
#- hosts: bdd,radius
|
||||||
# roles:
|
# roles:
|
||||||
# - prometheus_postgres
|
# - prometheus_postgres
|
||||||
#
|
|
||||||
## Monitor all hosts
|
|
||||||
#- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
|
|
||||||
# roles:
|
|
||||||
# - prometheus_node
|
|
||||||
|
|
|
@ -11,5 +11,4 @@ bird__radv_dns_servers: []
|
||||||
bird__radv_max_interval: 5
|
bird__radv_max_interval: 5
|
||||||
bird__static_unreachable: []
|
bird__static_unreachable: []
|
||||||
bird__bgp_sessions: []
|
bird__bgp_sessions: []
|
||||||
bird__prometheus_listen_address: 0.0.0.0:9324
|
|
||||||
...
|
...
|
||||||
|
|
|
@ -3,9 +3,4 @@
|
||||||
systemd:
|
systemd:
|
||||||
name: bird.service
|
name: bird.service
|
||||||
state: reloaded
|
state: reloaded
|
||||||
|
|
||||||
- name: Restart prometheus-bird-exporter
|
|
||||||
systemd:
|
|
||||||
name: prometheus-bird-exporter.service
|
|
||||||
state: restarted
|
|
||||||
...
|
...
|
||||||
|
|
|
@ -1,9 +1,7 @@
|
||||||
---
|
---
|
||||||
- name: Install bird
|
- name: Install bird
|
||||||
apt:
|
apt:
|
||||||
name:
|
name: bird2
|
||||||
- bird2
|
|
||||||
- prometheus-bird-exporter
|
|
||||||
|
|
||||||
- name: Configure bird
|
- name: Configure bird
|
||||||
template:
|
template:
|
||||||
|
@ -15,26 +13,9 @@
|
||||||
notify:
|
notify:
|
||||||
- Reload bird
|
- Reload bird
|
||||||
|
|
||||||
- name: Configure prometheus-bird-exporter
|
|
||||||
template:
|
|
||||||
src: prometheus-bird-exporter.j2
|
|
||||||
dest: /etc/default/prometheus-bird-exporter
|
|
||||||
owner: root
|
|
||||||
group: root
|
|
||||||
mode: u=rw,g=r,o=
|
|
||||||
notify:
|
|
||||||
- Restart prometheus-bird-exporter
|
|
||||||
|
|
||||||
- name: Enable and start bird
|
- name: Enable and start bird
|
||||||
systemd:
|
systemd:
|
||||||
name: bird.service
|
name: bird.service
|
||||||
state: started
|
state: started
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|
||||||
- name: Enable and start prometheus-bird-exporter
|
|
||||||
systemd:
|
|
||||||
name: prometheus-bird-exporter.service
|
|
||||||
state: started
|
|
||||||
enabled: true
|
|
||||||
|
|
||||||
...
|
...
|
||||||
|
|
3
roles/prometheus_bird/defaults/main.yml
Normal file
3
roles/prometheus_bird/defaults/main.yml
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
---
|
||||||
|
bird__prometheus_listen_address: ':9324'
|
||||||
|
...
|
6
roles/prometheus_bird/handlers/main.yml
Normal file
6
roles/prometheus_bird/handlers/main.yml
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
---
|
||||||
|
- name: Restart prometheus-bird-exporter
|
||||||
|
systemd:
|
||||||
|
name: prometheus-bird-exporter.service
|
||||||
|
state: restarted
|
||||||
|
...
|
21
roles/prometheus_bird/tasks/main.yml
Normal file
21
roles/prometheus_bird/tasks/main.yml
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
---
|
||||||
|
- name: Install prometheus-bird-exporter
|
||||||
|
apt:
|
||||||
|
name: prometheus-bird-exporter
|
||||||
|
|
||||||
|
- name: Configure prometheus-bird-exporter
|
||||||
|
template:
|
||||||
|
src: prometheus-bird-exporter.j2
|
||||||
|
dest: /etc/default/prometheus-bird-exporter
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: u=rw,g=r,o=
|
||||||
|
notify:
|
||||||
|
- Restart prometheus-bird-exporter
|
||||||
|
|
||||||
|
- name: Enable and start prometheus-bird-exporter
|
||||||
|
systemd:
|
||||||
|
name: prometheus-bird-exporter.service
|
||||||
|
state: started
|
||||||
|
enabled: true
|
||||||
|
...
|
Loading…
Reference in a new issue