prometheus-bird-role → dedicated role + various alerts
This commit is contained in:
parent
922b6894a7
commit
0807dc1d70
9 changed files with 145 additions and 67 deletions
|
@ -1,10 +1,12 @@
|
|||
from ansible.parsing.yaml.objects import AnsibleUnicode
|
||||
|
||||
|
||||
class FilterModule:
|
||||
def filters(self):
|
||||
return {
|
||||
"prometheus__convert_jobs": convert_jobs,
|
||||
"interp": interp,
|
||||
"interp_float": interp_float,
|
||||
}
|
||||
|
||||
|
||||
|
@ -12,6 +14,10 @@ def interp(string):
|
|||
return AnsibleUnicode(f"{{{{ {string} }}}}")
|
||||
|
||||
|
||||
def interp_float(string):
|
||||
return AnsibleUnicode(f'{{{{ printf "%.0f" {string} }}}}')
|
||||
|
||||
|
||||
def convert_jobs(config):
|
||||
|
||||
for name, job in config.items():
|
||||
|
|
|
@ -6,10 +6,22 @@
|
|||
roles:
|
||||
- prometheus_node
|
||||
|
||||
- hosts:
|
||||
- edge-1.rtr.infra.auro.re
|
||||
- edge-2.rtr.infra.auro.re
|
||||
- isp-1.rtr.infra.auro.re
|
||||
- isp-2.rtr.infra.auro.re
|
||||
- infra-1.rtr.infra.auro.re
|
||||
- infra-2.rtr.infra.auro.re
|
||||
roles:
|
||||
- prometheus_bird
|
||||
|
||||
- hosts:
|
||||
- prometheus-1.monit.infra.auro.re
|
||||
- prometheus-2.monit.infra.auro.re
|
||||
vars:
|
||||
prometheus__alertmanager_targets:
|
||||
- docker-ovh.adm.auro.re:9093
|
||||
prometheus__tsdb_retention_time: 90d
|
||||
prometheus__scraping:
|
||||
node:
|
||||
|
@ -18,7 +30,25 @@
|
|||
| flatten }}"
|
||||
address:
|
||||
port: 9100
|
||||
prometheus:
|
||||
targets:
|
||||
- prometheus-1.monit.infra.auro.re
|
||||
- prometheus-2.monit.infra.auro.re
|
||||
address:
|
||||
port: 9090
|
||||
bird:
|
||||
targets:
|
||||
- edge-1.rtr.infra.auro.re
|
||||
- edge-2.rtr.infra.auro.re
|
||||
address:
|
||||
port: 9324
|
||||
prometheus__alert_rules:
|
||||
prometheus:
|
||||
- alert: PrometheusTsdbCompactionFailed
|
||||
expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
node:
|
||||
- alert: MachineDown
|
||||
expr: "up == 0"
|
||||
|
@ -36,7 +66,7 @@
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Mémoire libre à {{ '$value' | interp }}%"
|
||||
summary: "Mémoire libre à {{ '$value' | interp_float }}%"
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: "( 1 - ( node_memory_SwapFree_bytes
|
||||
/ node_memory_SwapTotal_bytes ) )
|
||||
|
@ -45,14 +75,14 @@
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Swap {{ '$value' | interp }}%"
|
||||
summary: "Swap {{ '$value' | interp_float }}%"
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: "node_hwmon_temp_celsius > 79"
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "{{ '$value' | interp }}°C :
|
||||
summary: "{{ '$value' | interp_float }}°C :
|
||||
{{ '$labels.chip' | interp }},
|
||||
{{ '$labels.sensor' | interp }}"
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
|
@ -63,6 +93,20 @@
|
|||
annotations:
|
||||
summary: "{{ '$labels.chip' | interp }},
|
||||
{{ '$labels.sensor' | interp }}"
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr: 'node_md_state{state="inactive"} > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "{{ '$labels.device' | interp }}"
|
||||
- alert: HostRaidDiskFailure
|
||||
expr: 'node_md_disks{state="failed"} > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
# Annotation text for the alert; severity itself is set under labels.
# Uses the same `device` label as the sibling node_md_state alert.
summary: "{{ '$labels.device' | interp }}"
|
||||
- alert: HostOomKillDetected
|
||||
expr: "increase(node_vmstat_oom_kill[1m]) > 0"
|
||||
for: 0m
|
||||
|
@ -77,15 +121,51 @@
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ '$value' | interp }} erreurs corrigées"
|
||||
- alert: HostEdacUncorrectableErrorsDetected
|
||||
expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
# This alert counts uncorrectable EDAC errors, so the text must not say
# "corrected" like the correctable-errors alert does.
summary: "{{ '$value' | interp }} erreurs non corrigibles"
|
||||
- alert: OutOfDiskSpace
|
||||
expr: "node_filesystem_free_bytes
|
||||
/ node_filesystem_size_bytes * 100 < 10"
|
||||
expr: "( node_filesystem_free_bytes
|
||||
/ node_filesystem_size_bytes * 100 < 10 )
|
||||
and on (instance, device, mountpoint)
|
||||
node_filesystem_readonly == 0"
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "{{ '$labels.mountpoint' | interp }} :
|
||||
{{ '$value' | interp_float }}% libre"
|
||||
- alert: HostConntrackLimit
|
||||
expr: "( node_nf_conntrack_entries
|
||||
/ node_nf_conntrack_entries_limit ) * 100 > 80"
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ '$labels.mountpoint' | interp }} :
|
||||
{{ '$value' | interp }}% libre"
|
||||
summary: "{{ '$value' | interp_float }}% complet"
|
||||
- alert: HostClockSkew
|
||||
expr: "(node_timex_offset_seconds > 0.05
|
||||
and deriv(node_timex_offset_seconds[5m]) >= 0)
|
||||
or (node_timex_offset_seconds < -0.05
|
||||
and deriv(node_timex_offset_seconds[5m]) <= 0)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: "min_over_time(node_timex_sync_status[1m]) == 0
|
||||
and node_timex_maxerror_seconds >= 16"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: HostRequiresReboot
|
||||
expr: "node_reboot_required > 0"
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: OutOfInodes
|
||||
expr: "node_filesystem_files_free
|
||||
/ node_filesystem_files * 100 < 10"
|
||||
|
@ -94,7 +174,7 @@
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ '$labels.mountpoint' | interp }} :
|
||||
{{ '$value' | interp }}% libre"
|
||||
{{ '$value' | interp_float }}% libre"
|
||||
- alert: CpuUsage
|
||||
expr: '( 100 - avg by (instance)
|
||||
( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
|
||||
|
@ -103,7 +183,7 @@
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ '$value' | interp }}%"
|
||||
summary: "{{ '$value' | interp_float }}%"
|
||||
- alert: SystemdServiceFailed
|
||||
expr: 'node_systemd_unit_state{state="failed"} == 1'
|
||||
for: 10m
|
||||
|
@ -117,7 +197,7 @@
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ '$value' | interp }}"
|
||||
summary: "{{ '$value' | interp_float }}"
|
||||
- alert: UnhealthyDisk
|
||||
expr: "smartmon_device_smart_healthy < 1"
|
||||
for: 10m
|
||||
|
@ -125,6 +205,24 @@
|
|||
severity: critical
|
||||
annotations:
|
||||
summary: "{{ '$labels.disk' | interp }}"
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: 'avg by(instance)
|
||||
(rate(node_cpu_seconds_total{mode="steal"}[5m]))
|
||||
* 100 > 10'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
# The expression aggregates with avg by(instance), so no `disk` label is
# available here; report the steal percentage instead.
summary: "{{ '$value' | interp_float }}%"
|
||||
bird:
|
||||
- alert: BirdProtocolDown
|
||||
expr: "bird_protocol_up == 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "{{ '$labels.name' | interp }} :
|
||||
{{ '$labels.state' | interp }}"
|
||||
roles:
|
||||
- prometheus
|
||||
|
||||
|
@ -332,38 +430,7 @@
|
|||
# - prometheus
|
||||
# - update_motd
|
||||
#
|
||||
#- hosts: prometheus-federate.adm.auro.re
|
||||
# vars:
|
||||
# prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
||||
# snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
||||
# snmp_pdu_user: "{{ vault_snmp_pdu_user }}"
|
||||
# snmp_pdu_password: "{{ vault_snmp_pdu_password }}"
|
||||
# snmp_ilo_user: aurore
|
||||
# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}"
|
||||
# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}"
|
||||
#
|
||||
# prometheus_servers_targets:
|
||||
# - prometheus-edc.adm.auro.re
|
||||
# - prometheus-gs.adm.auro.re
|
||||
# - prometheus-fleming.adm.auro.re
|
||||
# - prometheus-pacaterie.adm.auro.re
|
||||
# - prometheus-rives.adm.auro.re
|
||||
# - prometheus-aurore.adm.auro.re
|
||||
# - prometheus-ovh.adm.auro.re
|
||||
#
|
||||
# update_motd:
|
||||
# prometheus_federate: >-
|
||||
# Prometheus (en configuration fédération) est déployé (/etc/prometheus).
|
||||
# roles:
|
||||
# - prometheus_federate
|
||||
# - update_motd
|
||||
#
|
||||
## Postgres Exporters
|
||||
#- hosts: bdd,radius
|
||||
# roles:
|
||||
# - prometheus_postgres
|
||||
#
|
||||
## Monitor all hosts
|
||||
#- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
|
||||
# roles:
|
||||
# - prometheus_node
|
||||
|
|
|
@ -11,5 +11,4 @@ bird__radv_dns_servers: []
|
|||
bird__radv_max_interval: 5
|
||||
bird__static_unreachable: []
|
||||
bird__bgp_sessions: []
|
||||
bird__prometheus_listen_address: 0.0.0.0:9324
|
||||
...
|
||||
|
|
|
@ -3,9 +3,4 @@
|
|||
systemd:
|
||||
name: bird.service
|
||||
state: reloaded
|
||||
|
||||
- name: Restart prometheus-bird-exporter
|
||||
systemd:
|
||||
name: prometheus-bird-exporter.service
|
||||
state: restarted
|
||||
...
|
||||
|
|
|
@ -1,9 +1,7 @@
|
|||
---
|
||||
- name: Install bird
|
||||
apt:
|
||||
name:
|
||||
- bird2
|
||||
- prometheus-bird-exporter
|
||||
name: bird2
|
||||
|
||||
- name: Configure bird
|
||||
template:
|
||||
|
@ -15,26 +13,9 @@
|
|||
notify:
|
||||
- Reload bird
|
||||
|
||||
- name: Configure prometheus-bird-exporter
|
||||
template:
|
||||
src: prometheus-bird-exporter.j2
|
||||
dest: /etc/default/prometheus-bird-exporter
|
||||
owner: root
|
||||
group: root
|
||||
mode: u=rw,g=r,o=
|
||||
notify:
|
||||
- Restart prometheus-bird-exporter
|
||||
|
||||
- name: Enable and start bird
|
||||
systemd:
|
||||
name: bird.service
|
||||
state: started
|
||||
enabled: true
|
||||
|
||||
- name: Enable and start prometheus-bird-exporter
|
||||
systemd:
|
||||
name: prometheus-bird-exporter.service
|
||||
state: started
|
||||
enabled: true
|
||||
|
||||
...
|
||||
|
|
3
roles/prometheus_bird/defaults/main.yml
Normal file
3
roles/prometheus_bird/defaults/main.yml
Normal file
|
@ -0,0 +1,3 @@
|
|||
---
|
||||
bird__prometheus_listen_address: ':9324'
|
||||
...
|
6
roles/prometheus_bird/handlers/main.yml
Normal file
6
roles/prometheus_bird/handlers/main.yml
Normal file
|
@ -0,0 +1,6 @@
|
|||
---
|
||||
- name: Restart prometheus-bird-exporter
|
||||
systemd:
|
||||
name: prometheus-bird-exporter.service
|
||||
state: restarted
|
||||
...
|
21
roles/prometheus_bird/tasks/main.yml
Normal file
21
roles/prometheus_bird/tasks/main.yml
Normal file
|
@ -0,0 +1,21 @@
|
|||
---
|
||||
- name: Install prometheus-bird-exporter
|
||||
apt:
|
||||
name: prometheus-bird-exporter
|
||||
|
||||
- name: Configure prometheus-bird-exporter
|
||||
template:
|
||||
src: prometheus-bird-exporter.j2
|
||||
dest: /etc/default/prometheus-bird-exporter
|
||||
owner: root
|
||||
group: root
|
||||
mode: u=rw,g=r,o=
|
||||
notify:
|
||||
- Restart prometheus-bird-exporter
|
||||
|
||||
- name: Enable and start prometheus-bird-exporter
|
||||
systemd:
|
||||
name: prometheus-bird-exporter.service
|
||||
state: started
|
||||
enabled: true
|
||||
...
|
Loading…
Reference in a new issue