From 0807dc1d70da4bd3e48699bc42344e777035ad70 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Sun, 2 Apr 2023 13:25:03 +0200 Subject: [PATCH] =?UTF-8?q?prometheus-bird-role=20=E2=86=92=20dedicated=20?= =?UTF-8?q?role=20+=20various=20alerts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- filter_plugins/prometheus.py | 6 + playbooks/prometheus.yml | 149 +++++++++++++----- roles/bird/defaults/main.yml | 1 - roles/bird/handlers/main.yml | 5 - roles/bird/tasks/main.yml | 21 +-- roles/prometheus_bird/defaults/main.yml | 3 + roles/prometheus_bird/handlers/main.yml | 6 + roles/prometheus_bird/tasks/main.yml | 21 +++ .../templates/prometheus-bird-exporter.j2 | 0 9 files changed, 145 insertions(+), 67 deletions(-) create mode 100644 roles/prometheus_bird/defaults/main.yml create mode 100644 roles/prometheus_bird/handlers/main.yml create mode 100644 roles/prometheus_bird/tasks/main.yml rename roles/{bird => prometheus_bird}/templates/prometheus-bird-exporter.j2 (100%) diff --git a/filter_plugins/prometheus.py b/filter_plugins/prometheus.py index 2494af5..a7e2cd2 100644 --- a/filter_plugins/prometheus.py +++ b/filter_plugins/prometheus.py @@ -1,10 +1,12 @@ from ansible.parsing.yaml.objects import AnsibleUnicode + class FilterModule: def filters(self): return { "prometheus__convert_jobs": convert_jobs, "interp": interp, + "interp_float": interp_float, } @@ -12,6 +14,10 @@ def interp(string): return AnsibleUnicode(f"{{{{ {string} }}}}") +def interp_float(string): + return AnsibleUnicode(f'{{{{ printf "%.0f" {string} }}}}') + + def convert_jobs(config): for name, job in config.items(): diff --git a/playbooks/prometheus.yml b/playbooks/prometheus.yml index 443f7d4..8a2c120 100755 --- a/playbooks/prometheus.yml +++ b/playbooks/prometheus.yml @@ -6,10 +6,22 @@ roles: - prometheus_node +- hosts: + - edge-1.rtr.infra.auro.re + - edge-2.rtr.infra.auro.re + - isp-1.rtr.infra.auro.re + - isp-2.rtr.infra.auro.re + - infra-1.rtr.infra.auro.re + - 
infra-2.rtr.infra.auro.re + roles: + - prometheus_bird + - hosts: - prometheus-1.monit.infra.auro.re - prometheus-2.monit.infra.auro.re vars: + prometheus__alertmanager_targets: + - docker-ovh.adm.auro.re:9093 prometheus__tsdb_retention_time: 90d prometheus__scraping: node: @@ -18,7 +30,25 @@ | flatten }}" address: port: 9100 + prometheus: + targets: + - prometheus-1.monit.infra.auro.re + - prometheus-2.monit.infra.auro.re + address: + port: 9090 + bird: + targets: + - edge-1.rtr.infra.auro.re + - edge-2.rtr.infra.auro.re + address: + port: 9324 prometheus__alert_rules: + prometheus: + - alert: PrometheusTsdbCompactionFailed + expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0" + for: 0m + labels: + severity: critical node: - alert: MachineDown expr: "up == 0" @@ -36,7 +66,7 @@ labels: severity: warning annotations: - summary: "Mémoire libre à {{ '$value' | interp }}%" + summary: "Mémoire libre à {{ '$value' | interp_float }}%" - alert: HostSwapIsFillingUp expr: "( 1 - ( node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes ) ) @@ -45,14 +75,14 @@ labels: severity: critical annotations: - summary: "Swap {{ '$value' | interp }}%" + summary: "Swap {{ '$value' | interp_float }}%" - alert: HostPhysicalComponentTooHot expr: "node_hwmon_temp_celsius > 79" for: 3m labels: severity: critical annotations: - summary: "{{ '$value' | interp }}°C : + summary: "{{ '$value' | interp_float }}°C : {{ '$labels.chip' | interp }}, {{ '$labels.sensor' | interp }}" - alert: HostNodeOvertemperatureAlarm @@ -63,6 +93,20 @@ annotations: summary: "{{ '$labels.chip' | interp }}, {{ '$labels.sensor' | interp }}" + - alert: HostRaidArrayGotInactive + expr: 'node_md_state{state="inactive"} > 0' + for: 0m + labels: + severity: critical + annotations: + summary: "{{ '$labels.device' | interp }}" + - alert: HostRaidDiskFailure + expr: 'node_md_disks{state="failed"} > 0' + for: 0m + labels: + severity: critical + annotations: + summary: "{{ '$labels.md_device' | interp }}" - 
alert: HostOomKillDetected expr: "increase(node_vmstat_oom_kill[1m]) > 0" for: 0m @@ -77,15 +121,51 @@ severity: warning annotations: summary: "{{ '$value' | interp }} erreurs corrigées" + - alert: HostEdacUncorrectableErrorsDetected + expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0" + for: 0m + labels: + severity: warning + annotations: + summary: "{{ '$value' | interp }} erreurs non corrigibles" - alert: OutOfDiskSpace - expr: "node_filesystem_free_bytes - / node_filesystem_size_bytes * 100 < 10" + expr: "( node_filesystem_free_bytes + / node_filesystem_size_bytes * 100 < 10 ) + and on (instance, device, mountpoint) + node_filesystem_readonly == 0" + for: 5m + labels: + severity: critical + annotations: + summary: "{{ '$labels.mountpoint' | interp }} : + {{ '$value' | interp_float }}% libre" + - alert: HostConntrackLimit + expr: "( node_nf_conntrack_entries + / node_nf_conntrack_entries_limit ) * 100 > 80" for: 5m labels: severity: warning annotations: - summary: "{{ '$labels.mountpoint' | interp }} : - {{ '$value' | interp }}% libre" + summary: "{{ '$value' | interp_float }}% complet" + - alert: HostClockSkew + expr: "(node_timex_offset_seconds > 0.05 + and deriv(node_timex_offset_seconds[5m]) >= 0) + or (node_timex_offset_seconds < -0.05 + and deriv(node_timex_offset_seconds[5m]) <= 0)" + for: 2m + labels: + severity: warning + - alert: HostClockNotSynchronising + expr: "min_over_time(node_timex_sync_status[1m]) == 0 + and node_timex_maxerror_seconds >= 16" + for: 2m + labels: + severity: warning + - alert: HostRequiresReboot + expr: "node_reboot_required > 0" + for: 5m + labels: + severity: warning - alert: OutOfInodes expr: "node_filesystem_files_free / node_filesystem_files * 100 < 10" @@ -94,7 +174,7 @@ severity: warning annotations: summary: "{{ '$labels.mountpoint' | interp }} : - {{ '$value' | interp }}% libre" + {{ '$value' | interp_float }}% libre" - alert: CpuUsage expr: '( 100 - avg by (instance) ( 
irate(node_cpu_seconds_total{mode="idle"}[5m]) ) @@ -103,7 +183,7 @@ labels: severity: warning annotations: - summary: "{{ '$value' | interp }}%" + summary: "{{ '$value' | interp_float }}%" - alert: SystemdServiceFailed expr: 'node_systemd_unit_state{state="failed"} == 1' for: 10m @@ -117,7 +197,7 @@ labels: severity: warning annotations: - summary: "{{ '$value' | interp }}" + summary: "{{ '$value' | interp_float }}" - alert: UnhealthyDisk expr: "smartmon_device_smart_healthy < 1" for: 10m @@ -125,6 +205,24 @@ severity: critical annotations: summary: "{{ '$labels.disk' | interp }}" + - alert: HostCpuStealNoisyNeighbor + expr: 'avg by(instance) + (rate(node_cpu_seconds_total{mode="steal"}[5m])) + * 100 > 10' + for: 5m + labels: + severity: warning + annotations: + summary: "{{ '$value' | interp_float }}%" + bird: + - alert: BirdProtocolDown + expr: "bird_protocol_up == 0" + for: 0m + labels: + severity: critical + annotations: + summary: "{{ '$labels.name' | interp }} : + {{ '$labels.state' | interp }}" roles: - prometheus @@ -332,38 +430,7 @@ # - prometheus # - update_motd # -#- hosts: prometheus-federate.adm.auro.re -# vars: -# prometheus_alertmanager: docker-ovh.adm.auro.re:9093 -# snmp_unifi_password: "{{ vault_snmp_unifi_password }}" -# snmp_pdu_user: "{{ vault_snmp_pdu_user }}" -# snmp_pdu_password: "{{ vault_snmp_pdu_password }}" -# snmp_ilo_user: aurore -# snmp_ilo_auth: "{{ vault_snmp_ilo_auth }}" -# snmp_ilo_priv: "{{ vault_snmp_ilo_priv }}" -# -# prometheus_servers_targets: -# - prometheus-edc.adm.auro.re -# - prometheus-gs.adm.auro.re -# - prometheus-fleming.adm.auro.re -# - prometheus-pacaterie.adm.auro.re -# - prometheus-rives.adm.auro.re -# - prometheus-aurore.adm.auro.re -# - prometheus-ovh.adm.auro.re -# -# update_motd: -# prometheus_federate: >- -# Prometheus (en configuration fédération) est déployé (/etc/prometheus). 
-# roles: -# - prometheus_federate -# - update_motd -# ## Postgres Exporters #- hosts: bdd,radius # roles: # - prometheus_postgres -# -## Monitor all hosts -#- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container -# roles: -# - prometheus_node diff --git a/roles/bird/defaults/main.yml b/roles/bird/defaults/main.yml index 1c72633..68e90f2 100644 --- a/roles/bird/defaults/main.yml +++ b/roles/bird/defaults/main.yml @@ -11,5 +11,4 @@ bird__radv_dns_servers: [] bird__radv_max_interval: 5 bird__static_unreachable: [] bird__bgp_sessions: [] -bird__prometheus_listen_address: 0.0.0.0:9324 ... diff --git a/roles/bird/handlers/main.yml b/roles/bird/handlers/main.yml index 8b3f2a0..7de17ff 100644 --- a/roles/bird/handlers/main.yml +++ b/roles/bird/handlers/main.yml @@ -3,9 +3,4 @@ systemd: name: bird.service state: reloaded - -- name: Restart prometheus-bird-exporter - systemd: - name: prometheus-bird-exporter.service - state: restarted ... diff --git a/roles/bird/tasks/main.yml b/roles/bird/tasks/main.yml index 0f7bda1..4b896ef 100644 --- a/roles/bird/tasks/main.yml +++ b/roles/bird/tasks/main.yml @@ -1,9 +1,7 @@ --- - name: Install bird apt: - name: - - bird2 - - prometheus-bird-exporter + name: bird2 - name: Configure bird template: @@ -15,26 +13,9 @@ notify: - Reload bird -- name: Configure prometheus-bird-exporter - template: - src: prometheus-bird-exporter.j2 - dest: /etc/default/prometheus-bird-exporter - owner: root - group: root - mode: u=rw,g=r,o= - notify: - - Restart prometheus-bird-exporter - - name: Enable and start bird systemd: name: bird.service state: started enabled: true - -- name: Enable and start prometheus-bird-exporter - systemd: - name: prometheus-bird-exporter.service - state: started - enabled: true - ... 
diff --git a/roles/prometheus_bird/defaults/main.yml b/roles/prometheus_bird/defaults/main.yml new file mode 100644 index 0000000..683acf1 --- /dev/null +++ b/roles/prometheus_bird/defaults/main.yml @@ -0,0 +1,3 @@ +--- +bird__prometheus_listen_address: ':9324' +... diff --git a/roles/prometheus_bird/handlers/main.yml b/roles/prometheus_bird/handlers/main.yml new file mode 100644 index 0000000..8873400 --- /dev/null +++ b/roles/prometheus_bird/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Restart prometheus-bird-exporter + systemd: + name: prometheus-bird-exporter.service + state: restarted +... diff --git a/roles/prometheus_bird/tasks/main.yml b/roles/prometheus_bird/tasks/main.yml new file mode 100644 index 0000000..d1dc0c2 --- /dev/null +++ b/roles/prometheus_bird/tasks/main.yml @@ -0,0 +1,21 @@ +--- +- name: Install prometheus-bird-exporter + apt: + name: prometheus-bird-exporter + +- name: Configure prometheus-bird-exporter + template: + src: prometheus-bird-exporter.j2 + dest: /etc/default/prometheus-bird-exporter + owner: root + group: root + mode: u=rw,g=r,o= + notify: + - Restart prometheus-bird-exporter + +- name: Enable and start prometheus-bird-exporter + systemd: + name: prometheus-bird-exporter.service + state: started + enabled: true +... diff --git a/roles/bird/templates/prometheus-bird-exporter.j2 b/roles/prometheus_bird/templates/prometheus-bird-exporter.j2 similarity index 100% rename from roles/bird/templates/prometheus-bird-exporter.j2 rename to roles/prometheus_bird/templates/prometheus-bird-exporter.j2