From dd550700d0e7742a82e102645c2e86052c150b37 Mon Sep 17 00:00:00 2001
From: Jean-Marie Mineau
Date: Wed, 6 Oct 2021 13:24:58 +0200
Subject: [PATCH] add first config for alertmanager

---
 books/monitoring.yml                          |  1 +
 .../handlers/main.yml                         |  5 ++
 roles/prometheus-alert-manager/tasks/main.yml | 41 ++++++++++
 .../templates/prometheus-alertmanager         | 75 +++++++++++++++++++
 roles/prometheus/tasks/main.yml               |  1 -
 .../prometheus/templates/atrocious_nginx_stub |  2 +-
 roles/prometheus/templates/prometheus.yml     |  2 +-
 7 files changed, 124 insertions(+), 3 deletions(-)
 create mode 100644 roles/prometheus-alert-manager/handlers/main.yml
 create mode 100644 roles/prometheus-alert-manager/tasks/main.yml
 create mode 100644 roles/prometheus-alert-manager/templates/prometheus-alertmanager

diff --git a/books/monitoring.yml b/books/monitoring.yml
index 4982c2b..ce74f25 100644
--- a/books/monitoring.yml
+++ b/books/monitoring.yml
@@ -3,6 +3,7 @@
 - hosts: prometheus_servers
   roles:
     - prometheus
+    - prometheus-alert-manager
     - grafana
 
 - hosts: all, !tests,
diff --git a/roles/prometheus-alert-manager/handlers/main.yml b/roles/prometheus-alert-manager/handlers/main.yml
new file mode 100644
index 0000000..e1eb12d
--- /dev/null
+++ b/roles/prometheus-alert-manager/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart Alertmanager
+  systemd:
+    name: prometheus-alertmanager.service
+    state: restarted
diff --git a/roles/prometheus-alert-manager/tasks/main.yml b/roles/prometheus-alert-manager/tasks/main.yml
new file mode 100644
index 0000000..b9cf74e
--- /dev/null
+++ b/roles/prometheus-alert-manager/tasks/main.yml
@@ -0,0 +1,41 @@
+---
+- name: Install Prometheus Alert Manager
+  apt:
+    name:
+      - prometheus-alertmanager
+    state: latest
+    update_cache: true
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Setup the arguments for alertmanager
+  template:
+    src: prometheus-alertmanager
+    dest: /etc/default/prometheus-alertmanager
+    owner: root
+    group: root
+    mode: '0644'
+  notify: Restart Alertmanager
+  vars:
+    args:
+      - name: web.listen-address
+        value: "127.0.0.1:9093"
+
+#- name: Copy the CA cert
+#  copy:
+#    content: "{{ ca_cert }}"
+#    dest: /etc/?/ca.crt
+#  notify: Restart Alertmanager
+#
+#- name: Generate certificate
+#  include_role:
+#    name: generate-cert
+#  vars:
+#    directory: /etc/?/
+#    cname: "alertmanager-{{ lan_address }}"
+#    owner: ?
+#    group: ?
+#    key_mode: u=rw,g=,o=
+#    subject_alt_name: "IP:{{ lan_address }}"
+## Need an equivalent to notify here
diff --git a/roles/prometheus-alert-manager/templates/prometheus-alertmanager b/roles/prometheus-alert-manager/templates/prometheus-alertmanager
new file mode 100644
index 0000000..ff797fc
--- /dev/null
+++ b/roles/prometheus-alert-manager/templates/prometheus-alertmanager
@@ -0,0 +1,75 @@
+{{ ansible_managed | comment }}
+
+# Set the command-line arguments to pass to the server.
+{% if not args %}
+ARGS=""
+{% else %}
+ARGS="\
+{% for arg in args %}
+  --{{ arg.name }}={{ arg.value }} \
+{% endfor %}
+"
+{% endif %}
+
+# The alert manager supports the following options:
+
+#  --config.file="/etc/prometheus/alertmanager.yml"
+#    Alertmanager configuration file name.
+#  --storage.path="/var/lib/prometheus/alertmanager/"
+#    Base path for data storage.
+#  --data.retention=120h
+#    How long to keep data for.
+#  --alerts.gc-interval=30m
+#    Interval between alert GC.
+#  --log.level=info
+#    Only log messages with the given severity or above.
+#  --web.external-url=WEB.EXTERNAL-URL
+#    The URL under which Alertmanager is externally reachable (for example,
+#    if Alertmanager is served via a reverse proxy). Used for generating
+#    relative and absolute links back to Alertmanager itself. If the URL has
+#    a path portion, it will be used to prefix all HTTP endpoints served by
+#    Alertmanager. If omitted, relevant URL components will be derived
+#    automatically.
+#  --web.route-prefix=WEB.ROUTE-PREFIX
+#    Prefix for the internal routes of web endpoints. Defaults to path of
+#    --web.external-url.
+#  --web.listen-address=":9093"
+#    Address to listen on for the web interface and API.
+#  --web.ui-path="/usr/share/prometheus/alertmanager/ui/"
+#    Path to static UI directory.
+#  --template.default="/usr/share/prometheus/alertmanager/default.tmpl"
+#    Path to default notification template.
+#  --cluster.listen-address="0.0.0.0:9094"
+#    Listen address for cluster.
+#  --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS
+#    Explicit address to advertise in cluster.
+#  --cluster.peer=CLUSTER.PEER ...
+#    Initial peers (may be repeated).
+#  --cluster.peer-timeout=15s
+#    Time to wait between peers to send notifications.
+#  --cluster.gossip-interval=200ms
+#    Interval between sending gossip messages. By lowering this value (more
+#    frequent) gossip messages are propagated across the cluster more
+#    quickly at the expense of increased bandwidth.
+#  --cluster.pushpull-interval=1m0s
+#    Interval for gossip state syncs. Setting this interval lower (more
+#    frequent) will increase convergence speeds across larger clusters at
+#    the expense of increased bandwidth usage.
+#  --cluster.tcp-timeout=10s  Timeout for establishing a stream connection
+#    with a remote node for a full state sync, and for stream read and write
+#    operations.
+#  --cluster.probe-timeout=500ms
+#    Timeout to wait for an ack from a probed node before assuming it is
+#    unhealthy. This should be set to 99-percentile of RTT (round-trip time)
+#    on your network.
+#  --cluster.probe-interval=1s
+#    Interval between random node probes. Setting this lower (more frequent)
+#    will cause the cluster to detect failed nodes more quickly at the
+#    expense of increased bandwidth usage.
+#  --cluster.settle-timeout=1m0s
+#    Maximum time to wait for cluster connections to settle before
+#    evaluating notifications.
+#  --cluster.reconnect-interval=10s
+#    Interval between attempting to reconnect to lost peers.
+#  --cluster.reconnect-timeout=6h0m0s
+#    Length of time to attempt to reconnect to a lost peer.
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index e939f9a..5b62111 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -3,7 +3,6 @@
   apt:
     name:
       - prometheus
-      - prometheus-alertmanager
       - prometheus-pushgateway
     state: latest
     update_cache: true
diff --git a/roles/prometheus/templates/atrocious_nginx_stub b/roles/prometheus/templates/atrocious_nginx_stub
index da2e67b..c28eefb 100644
--- a/roles/prometheus/templates/atrocious_nginx_stub
+++ b/roles/prometheus/templates/atrocious_nginx_stub
@@ -8,6 +8,6 @@ server {
   ssl_verify_client on;
 
   location / {
-    proxy_pass 127.0.0.1:9090;
+    proxy_pass http://127.0.0.1:9090;
   }
 }
diff --git a/roles/prometheus/templates/prometheus.yml b/roles/prometheus/templates/prometheus.yml
index 0d59dac..98af4fa 100644
--- a/roles/prometheus/templates/prometheus.yml
+++ b/roles/prometheus/templates/prometheus.yml
@@ -14,7 +14,7 @@ global:
 alerting:
   alertmanagers:
   - static_configs:
-    - targets: ['localhost:9093']
+    - targets: ['{{ lan_address }}:9093']
 
 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files: