From b301ce5eaf015b4ac89cc67ead878a3c8052cd9f Mon Sep 17 00:00:00 2001
From: Jean-Marie Mineau
Date: Sun, 10 Oct 2021 01:33:08 +0200
Subject: [PATCH] add blackbox alerts

---
Review notes (free-form text between the three-dash line and the first
"diff --git" line is not part of the commit message):

  * Line breaks and indentation were restored from a whitespace-collapsed
    copy of this patch. Indentation of context lines is reconstructed;
    verify against the tree, or apply with `git apply --ignore-whitespace`.
  * prometheus-blackbox-exporter/tasks/main.yml: the new task was named
    "Add alert rules for node ...", a copy-paste leftover; it copies
    alerts-blackbox.yml, so the name now says "blackbox".
  * alerts-blackbox.yml: "expiracy" corrected to "expiry" in the
    CertExpLess10days title.
  * Both corrections edit added lines in place, so hunk sizes and the
    diffstat are unchanged. The post-image blob ids on the two affected
    "index" lines are stale; regenerate the patch if they matter.
  * NOTE(review): SiteUp fires as soon as probe_success == 1 on the
    "blackbox http-down" job (no "for:" delay, severity critical).
    Presumably that job lists targets that must stay unreachable - confirm.
  * NOTE(review): the certificate alerts use $labels.cname; confirm the
    "blackbox internal tls" job actually sets that label.
  * NOTE(review): the rules are installed as alertsblackbox.yml (no
    hyphen) although the source file is alerts-blackbox.yml; confirm the
    name matches what prometheus.yml lists under rule_files.

 .../files/alerts-blackbox.yml                 | 47 +++++++++++++++++++
 .../handlers/main.yml                         |  5 ++
 .../tasks/main.yml                            |  9 ++++
 .../files/alerts-node.yml                     |  2 +-
 roles/prometheus/tasks/main.yml               |  6 +--
 roles/prometheus/templates/prometheus.yml     |  2 +-
 6 files changed, 66 insertions(+), 5 deletions(-)
 create mode 100644 roles/prometheus-blackbox-exporter/files/alerts-blackbox.yml

diff --git a/roles/prometheus-blackbox-exporter/files/alerts-blackbox.yml b/roles/prometheus-blackbox-exporter/files/alerts-blackbox.yml
new file mode 100644
index 0000000..58d9546
--- /dev/null
+++ b/roles/prometheus-blackbox-exporter/files/alerts-blackbox.yml
@@ -0,0 +1,47 @@
+---
+groups:
+- name: BlackBoxAllInstances
+  rules:
+
+  - alert: SiteUp
+    expr: probe_success{job="blackbox http-down"} == 1
+    annotations:
+      title: '{{ $labels.instance }} is UP!'
+      description: '{{ $labels.instance }} is now up!'
+    labels:
+      value: "{{ $value }}"
+      severity: 'critical'
+
+  - alert: SiteDown
+    expr: probe_success{job="blackbox http-up"} == 0
+    for: 5m
+    annotations:
+      title: '{{ $labels.instance }} is Down'
+      description: >-
+        {{ $labels.instance }} has been down for more than 5 minutes.
+    labels:
+      value: "{{ $value }}"
+      severity: 'warning'
+
+  - alert: CertExpLess30days
+    expr: (probe_ssl_earliest_cert_expiry{job="blackbox internal tls"}-time()) < 2592000
+    annotations:
+      title: '{{ $labels.cname }} will expire soon'
+      description: >-
+        The certificate {{ $labels.cname }} on {{ $labels.instance }} will expire in
+        {{ $value | humanizeDuration }}, it's time to renew it.
+    labels:
+      value: "{{ $value }}"
+      severity: 'warning'
+
+  - alert: CertExpLess10days
+    expr: (probe_ssl_earliest_cert_expiry{job="blackbox internal tls"}-time()) < 864000
+    annotations:
+      title: '{{ $labels.cname }} expiry is imminent!'
+      description: >-
+        The certificate {{ $labels.cname }} on {{ $labels.instance }} will expire in
+        {{ $value | humanizeDuration }}!
+    labels:
+      value: "{{ $value }}"
+      severity: 'critical'
+...
diff --git a/roles/prometheus-blackbox-exporter/handlers/main.yml b/roles/prometheus-blackbox-exporter/handlers/main.yml
index a15d341..eda96a7 100644
--- a/roles/prometheus-blackbox-exporter/handlers/main.yml
+++ b/roles/prometheus-blackbox-exporter/handlers/main.yml
@@ -3,3 +3,8 @@
   systemd:
     name: prometheus-blackbox-exporter.service
     state: restarted
+
+- name: Restart prometheus
+  systemd:
+    name: prometheus
+    state: restarted
diff --git a/roles/prometheus-blackbox-exporter/tasks/main.yml b/roles/prometheus-blackbox-exporter/tasks/main.yml
index 83506e3..caf3464 100644
--- a/roles/prometheus-blackbox-exporter/tasks/main.yml
+++ b/roles/prometheus-blackbox-exporter/tasks/main.yml
@@ -85,3 +85,12 @@
     dest: "/etc/nginx/sites-enabled/internal-blackbox"
     state: link
     force: yes
+
+- name: Add alert rules for blackbox on the prometheus server
+  copy:
+    src: alerts-blackbox.yml
+    dest: /etc/prometheus/alertsblackbox.yml
+    owner: prometheus
+    group: prometheus
+    mode: u=rw,g=r,o=r
+  notify: Restart prometheus
diff --git a/roles/prometheus-node-exporter/files/alerts-node.yml b/roles/prometheus-node-exporter/files/alerts-node.yml
index 0fbedf8..1dd79b8 100644
--- a/roles/prometheus-node-exporter/files/alerts-node.yml
+++ b/roles/prometheus-node-exporter/files/alerts-node.yml
@@ -1,6 +1,6 @@
 ---
 groups:
-- name: AllInstances
+- name: NodeAllInstances
   rules:
 
   - alert: InstanceDown
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index 78c4c94..2acae58 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -71,9 +71,9 @@
     force: no
   notify: Restart prometheus
   loop:
-    - blackbox-http-external-down
-    - blackbox-http-external-up
-    - blackbox-https-internal
+    - blackbox-http-down
+    - blackbox-http-up
+    - blackbox-tls-internal
     - node
 
 - name: Copy the web-config folder
diff --git a/roles/prometheus/templates/prometheus.yml b/roles/prometheus/templates/prometheus.yml
index 6baa231..bcecce5 100644
--- a/roles/prometheus/templates/prometheus.yml
+++ b/roles/prometheus/templates/prometheus.yml
@@ -64,7 +64,7 @@ scrape_configs:
       cert_file: '/etc/prometheus/prometheus-{{ lan_address }}.crt'
       key_file: '/etc/prometheus/prometheus-{{ lan_address }}.key'
 
-{% for target_type in ('http-external-up', 'http-external-down') %}
+{% for target_type in ('http-up', 'http-down') %}
   - job_name: blackbox {{ target_type }}
     metrics_path: /probe
     params: