add first config for alertmanager

2021-10-06 13:24:58 +02:00 · 2021-10-06 13:24:58 +02:00 · dd550700d0
commit dd550700d0
parent 6e28abc1e7
7 changed files with 124 additions and 3 deletions
--- a/books/monitoring.yml
+++ b/books/monitoring.yml
@ -3,6 +3,7 @@
 - hosts: prometheus_servers
  roles:
    - prometheus
+    - prometheus-alert-manager
    - grafana

 - hosts: all, !tests,
--- a/roles/prometheus-alert-manager/handlers/main.yml
+++ b/roles/prometheus-alert-manager/handlers/main.yml
@ -0,0 +1,5 @@
+---
+- name: Restart Alertmanager  # handler: notified by this role's task that templates the defaults file
+  systemd:
+    name: prometheus-alertmanager.service
+    state: restarted
--- a/roles/prometheus-alert-manager/tasks/main.yml
+++ b/roles/prometheus-alert-manager/tasks/main.yml
@ -0,0 +1,41 @@
+---
+- name: Install Prometheus Alert Manager
+  apt:
+    name:
+      - prometheus-alertmanager
+    state: latest
+    update_cache: true
+  register: apt_result  # captured so the 'until' condition below can inspect the outcome
+  retries: 3
+  until: apt_result is succeeded  # repeat the apt step (bounded by 'retries') while it has not succeeded
+
+- name: Setup the arguments for alertmanager  # renders the ARGS line of the service defaults file
+  template:
+    src: prometheus-alertmanager
+    dest: /etc/default/prometheus-alertmanager
+    owner: root
+    group: root
+    mode: '0644'
+  notify: Restart Alertmanager
+  vars:
+    args:  # each entry is emitted by the template as one --<name>=<value> flag
+      - name: web.listen-address
+        value: "127.0.0.1:9093"  # NOTE(review): loopback only; prometheus.yml targets lan_address:9093 - confirm a proxy fronts it
+
+#- name: Copy the CA cert
+#  copy:
+#    content: "{{ ca_cert }}"
+#    dest: /etc/?/ca.crt
+#  notify: Restart Alertmanager
+#
+#- name: Generate certificate
+#  include_role:
+#    name: generate-cert
+#  vars:
+#    directory: /etc/?/
+#    cname: "alertmanager-{{ lan_address }}"
+#    owner: ?
+#    group: ?
+#    key_mode: u=rw,g=,o=
+#    subject_alt_name: "IP:{{ lan_address }}"
+## Need an equivalent to notify here
--- a/roles/prometheus-alert-manager/templates/prometheus-alertmanager
+++ b/roles/prometheus-alert-manager/templates/prometheus-alertmanager
@ -0,0 +1,75 @@
+{{ ansible_managed | comment }}
+
+# Set the command-line arguments to pass to the server.
+{# A missing or empty 'args' yields ARGS=""; otherwise one flag per entry, backslash-continued. #}{% if not args | default([]) %}
+ARGS=""
+{% else %}
+ARGS="\
+{% for arg in args %}
+      --{{ arg.name }}={{ arg.value }} \
+{% endfor %}
+"
+{% endif %}
+
+# The alert manager supports the following options:
+
+#  --config.file="/etc/prometheus/alertmanager.yml"
+#       Alertmanager configuration file name.
+#  --storage.path="/var/lib/prometheus/alertmanager/"
+#       Base path for data storage.
+#  --data.retention=120h
+#       How long to keep data for.
+#  --alerts.gc-interval=30m
+#       Interval between alert GC.
+#  --log.level=info
+#       Only log messages with the given severity or above.
+#  --web.external-url=WEB.EXTERNAL-URL
+#       The URL under which Alertmanager is externally reachable (for example,
+#       if Alertmanager is served via a reverse proxy). Used for generating
+#       relative and absolute links back to Alertmanager itself. If the URL has
+#       a path portion, it will be used to prefix all HTTP endpoints served by
+#       Alertmanager. If omitted, relevant URL components will be derived
+#       automatically.
+#  --web.route-prefix=WEB.ROUTE-PREFIX
+#       Prefix for the internal routes of web endpoints. Defaults to path of
+#       --web.external-url.
+#  --web.listen-address=":9093"
+#       Address to listen on for the web interface and API.
+#  --web.ui-path="/usr/share/prometheus/alertmanager/ui/"
+#       Path to static UI directory.
+#  --template.default="/usr/share/prometheus/alertmanager/default.tmpl"
+#       Path to default notification template.
+#  --cluster.listen-address="0.0.0.0:9094"
+#       Listen address for cluster.
+#  --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS
+#       Explicit address to advertise in cluster.
+#  --cluster.peer=CLUSTER.PEER ...
+#       Initial peers (may be repeated).
+#  --cluster.peer-timeout=15s
+#       Time to wait between peers to send notifications.
+#  --cluster.gossip-interval=200ms
+#       Interval between sending gossip messages. By lowering this value (more
+#       frequent) gossip messages are propagated across the cluster more
+#       quickly at the expense of increased bandwidth.
+#  --cluster.pushpull-interval=1m0s
+#       Interval for gossip state syncs. Setting this interval lower (more
+#       frequent) will increase convergence speeds across larger clusters at
+#       the expense of increased bandwidth usage.
+#  --cluster.tcp-timeout=10s  Timeout for establishing a stream connection
+#       with a remote node for a full state sync, and for stream read and write
+#       operations.
+#  --cluster.probe-timeout=500ms
+#       Timeout to wait for an ack from a probed node before assuming it is
+#       unhealthy. This should be set to 99-percentile of RTT (round-trip time)
+#       on your network.
+#  --cluster.probe-interval=1s
+#       Interval between random node probes. Setting this lower (more frequent)
+#       will cause the cluster to detect failed nodes more quickly at the
+#       expense of increased bandwidth usage.
+#  --cluster.settle-timeout=1m0s
+#       Maximum time to wait for cluster connections to settle before
+#       evaluating notifications.
+#  --cluster.reconnect-interval=10s
+#       Interval between attempting to reconnect to lost peers.
+#  --cluster.reconnect-timeout=6h0m0s
+#       Length of time to attempt to reconnect to a lost peer.
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@ -3,7 +3,6 @@
  apt:
    name:
      - prometheus
-      - prometheus-alertmanager
      - prometheus-pushgateway
    state: latest
    update_cache: true
--- a/roles/prometheus/templates/atrocious_nginx_stub
+++ b/roles/prometheus/templates/atrocious_nginx_stub
@ -8,6 +8,6 @@ server {
    ssl_verify_client on;

    location / {
-        proxy_pass 127.0.0.1:9090;
+        proxy_pass http://127.0.0.1:9090;
    }
 }
--- a/roles/prometheus/templates/prometheus.yml
+++ b/roles/prometheus/templates/prometheus.yml
@ -14,7 +14,7 @@ global:
 alerting:
  alertmanagers:
  - static_configs:
-    - targets: ['localhost:9093']
+    - targets: ['{{ lan_address }}:9093']

 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files: