Browse Source

add blackbox alerts

monitoring
histausse 12 months ago
parent
commit
b301ce5eaf
Signed by: histausse
GPG Key ID: 67486F107F62E9E9
  1. 47
      roles/prometheus-blackbox-exporter/files/alerts-blackbox.yml
  2. 5
      roles/prometheus-blackbox-exporter/handlers/main.yml
  3. 9
      roles/prometheus-blackbox-exporter/tasks/main.yml
  4. 2
      roles/prometheus-node-exporter/files/alerts-node.yml
  5. 6
      roles/prometheus/tasks/main.yml
  6. 2
      roles/prometheus/templates/prometheus.yml

47
roles/prometheus-blackbox-exporter/files/alerts-blackbox.yml

@ -0,0 +1,47 @@
---
# Prometheus alerting rules for targets probed through the blackbox exporter.
# Each rule selects its targets through the `job` label of the scrape job.
groups:
  - name: BlackBoxAllInstances
    rules:
      # Fires as soon as a target of the "blackbox http-down" job answers a
      # probe successfully (probe_success == 1).
      - alert: SiteUp
        expr: probe_success{job="blackbox http-down"} == 1
        annotations:
          title: '{{ $labels.instance }} is UP!'
          description: '{{ $labels.instance }} is now up!'
        labels:
          # $value is always 1 here, so this label is stable across evaluations.
          value: "{{ $value }}"
          severity: 'critical'

      # Fires when a target of the "blackbox http-up" job has failed its probe
      # continuously for 5 minutes.
      - alert: SiteDown
        expr: probe_success{job="blackbox http-up"} == 0
        for: 5m
        annotations:
          title: '{{ $labels.instance }} is Down'
          description: >-
            {{ $labels.instance }} has been down for more than 5 minutes.
        labels:
          # $value is always 0 here, so this label is stable across evaluations.
          value: "{{ $value }}"
          severity: 'warning'

      # Certificate expires in less than 30 days (2592000 s = 30 * 86400 s).
      # NOTE(review): the `cname` label is not set anywhere in this file;
      # presumably it is added by the scrape configuration - confirm.
      - alert: CertExpLess30days
        expr: (probe_ssl_earliest_cert_expiry{job="blackbox internal tls"}-time()) < 2592000
        annotations:
          title: '{{ $labels.cname }} will expire soon'
          description: >-
            The certificate {{ $labels.cname }} on {{ $labels.instance }} will expire in
            {{ $value | humanizeDuration }}, it's time to renew it.
          # The remaining lifetime changes at every evaluation. It is kept as an
          # annotation (not a label) so the alert keeps a stable identity
          # instead of being re-created on each evaluation cycle.
          value: "{{ $value }}"
        labels:
          severity: 'warning'

      # Certificate expires in less than 10 days (864000 s = 10 * 86400 s).
      # NOTE(review): same assumption about the `cname` label as above.
      - alert: CertExpLess10days
        expr: (probe_ssl_earliest_cert_expiry{job="blackbox internal tls"}-time()) < 864000
        annotations:
          title: '{{ $labels.cname }} expiry is imminent!'
          description: >-
            The certificate {{ $labels.cname }} on {{ $labels.instance }} will expire in
            {{ $value | humanizeDuration }}!
          # Annotation rather than label: the value changes at every evaluation
          # and would otherwise change the alert's label set each time.
          value: "{{ $value }}"
        labels:
          severity: 'critical'
...

5
roles/prometheus-blackbox-exporter/handlers/main.yml

@ -3,3 +3,8 @@
systemd:
name: prometheus-blackbox-exporter.service
state: restarted
# Handler: restarts the `prometheus` systemd unit. It runs when a task
# notifies "Restart prometheus".
- name: Restart prometheus
  systemd:
    name: prometheus
    state: restarted

9
roles/prometheus-blackbox-exporter/tasks/main.yml

@ -85,3 +85,12 @@
dest: "/etc/nginx/sites-enabled/internal-blackbox"
state: link
force: yes
# Installs the blackbox alerting rules shipped with this role
# (files/alerts-blackbox.yml) into /etc/prometheus and triggers the
# "Restart prometheus" handler when the file changes.
# NOTE(review): the destination file name has no dash ("alertsblackbox.yml");
# confirm it matches the rule_files pattern of the Prometheus configuration.
- name: Add alert rules for blackbox on the prometheus server
  copy:
    src: alerts-blackbox.yml
    dest: /etc/prometheus/alertsblackbox.yml
    owner: prometheus
    group: prometheus
    # Symbolic mode, quoted so it is always handled as a string.
    mode: "u=rw,g=r,o=r"
  notify: Restart prometheus

2
roles/prometheus-node-exporter/files/alerts-node.yml

@ -1,6 +1,6 @@
---
groups:
- name: AllInstances
- name: NodeAllInstances
rules:
- alert: InstanceDown

6
roles/prometheus/tasks/main.yml

@ -71,9 +71,9 @@
force: no
notify: Restart prometheus
loop:
- blackbox-http-external-down
- blackbox-http-external-up
- blackbox-https-internal
- blackbox-http-down
- blackbox-http-up
- blackbox-tls-internal
- node
- name: Copy the web-config folder

2
roles/prometheus/templates/prometheus.yml

@ -64,7 +64,7 @@ scrape_configs:
cert_file: '/etc/prometheus/prometheus-{{ lan_address }}.crt'
key_file: '/etc/prometheus/prometheus-{{ lan_address }}.key'
{% for target_type in ('http-external-up', 'http-external-down') %}
{% for target_type in ('http-up', 'http-down') %}
- job_name: blackbox {{ target_type }}
metrics_path: /probe
params:

Loading…
Cancel
Save