add first config for alertmanager

2021-10-06 13:24:58 +02:00 · 2021-10-06 13:24:58 +02:00 · dd550700d0
commit dd550700d0
parent 6e28abc1e7
7 changed files with 124 additions and 3 deletions
--- a/books/monitoring.yml
+++ b/books/monitoring.yml
@ -3,6 +3,7 @@
 # Monitoring play for the prometheus_servers group: applies the prometheus,
 # prometheus-alert-manager and grafana roles, in the order listed.
 - hosts: prometheus_servers
  roles:
    - prometheus
    - prometheus-alert-manager
    - grafana
 - hosts: all, !tests,
--- a/roles/prometheus-alert-manager/handlers/main.yml
+++ b/roles/prometheus-alert-manager/handlers/main.yml
@ -0,0 +1,5 @@
 ---
 # Handler notified by this role's tasks: restarts the
 # prometheus-alertmanager.service systemd unit.
 - name: 'Restart Alertmanager'
   systemd:
     state: 'restarted'
     name: 'prometheus-alertmanager.service'
--- a/roles/prometheus-alert-manager/tasks/main.yml
+++ b/roles/prometheus-alert-manager/tasks/main.yml
@ -0,0 +1,41 @@
 ---
 # Refresh the apt cache and install (or upgrade to) the latest packaged
 # prometheus-alertmanager. The result is registered so the task can be
 # retried (retries: 3) until apt reports success.
 - name: Install Prometheus Alert Manager
   register: apt_result
   until: apt_result is succeeded
   retries: 3
   apt:
     update_cache: true
     state: latest
     name:
       - prometheus-alertmanager
 # Render /etc/default/prometheus-alertmanager from the role's
 # prometheus-alertmanager template. Every entry of `args` becomes one
 # --<name>=<value> flag inside the ARGS variable of that file; a change to
 # the file notifies the "Restart Alertmanager" handler.
 - name: Setup the arguments for alertmanager
   template:
     src: prometheus-alertmanager
     dest: /etc/default/prometheus-alertmanager
     owner: root
     group: root
     mode: '0644'
   notify: Restart Alertmanager
   vars:
     args:
       # NOTE(review): the web interface/API is bound to loopback only, while
       # roles/prometheus/templates/prometheus.yml targets <lan_address>:9093.
       # Confirm that something (e.g. a reverse proxy) listens on that
       # address; otherwise Prometheus cannot reach Alertmanager.
       - name: web.listen-address
         value: "127.0.0.1:9093"
 #- name: Copy the CA cert
 #  copy:
 #    content: "{{ ca_cert }}"
 #    dest: /etc/?/ca.crt
 #  notify: Restart Alertmanager
 #
 #- name: Generate certificate
 #  include_role:
 #    name: generate-cert
 #  vars:
 #    directory: /etc/?/
 #    cname: "alertmanager-{{ lan_address }}"
 #    owner: ?
 #    group: ?
 #    key_mode: u=rw,g=,o=
 #    subject_alt_name: "IP:{{ lan_address }}"
 ## Need an equivalent to notify here
--- a/roles/prometheus-alert-manager/templates/prometheus-alertmanager
+++ b/roles/prometheus-alert-manager/templates/prometheus-alertmanager
@ -0,0 +1,75 @@
 {{ ansible_managed | comment }}
 # Set the command-line arguments to pass to the server.
 {# `args` is a list of mappings with `name` and `value` keys, supplied by the
    calling task. An undefined `args` is treated like an empty list so the
    file still renders with ARGS="". #}
 {% set options = args | default([]) %}
 {% if options %}
 ARGS="\
 {% for option in options %}
      --{{ option.name }}={{ option.value }} \
 {% endfor %}
 "
 {% else %}
 ARGS=""
 {% endif %}
 # The alert manager supports the following options:
 #  --config.file="/etc/prometheus/alertmanager.yml"
 #       Alertmanager configuration file name.
 #  --storage.path="/var/lib/prometheus/alertmanager/"
 #       Base path for data storage.
 #  --data.retention=120h
 #       How long to keep data for.
 #  --alerts.gc-interval=30m
 #       Interval between alert GC.
 #  --log.level=info
 #       Only log messages with the given severity or above.
 #  --web.external-url=WEB.EXTERNAL-URL
 #       The URL under which Alertmanager is externally reachable (for example,
 #       if Alertmanager is served via a reverse proxy). Used for generating
 #       relative and absolute links back to Alertmanager itself. If the URL has
 #       a path portion, it will be used to prefix all HTTP endpoints served by
 #       Alertmanager. If omitted, relevant URL components will be derived
 #       automatically.
 #  --web.route-prefix=WEB.ROUTE-PREFIX
 #       Prefix for the internal routes of web endpoints. Defaults to path of
 #       --web.external-url.
 #  --web.listen-address=":9093"
 #       Address to listen on for the web interface and API.
 #  --web.ui-path="/usr/share/prometheus/alertmanager/ui/"
 #       Path to static UI directory.
 #  --template.default="/usr/share/prometheus/alertmanager/default.tmpl"
 #       Path to default notification template.
 #  --cluster.listen-address="0.0.0.0:9094"
 #       Listen address for cluster.
 #  --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS
 #       Explicit address to advertise in cluster.
 #  --cluster.peer=CLUSTER.PEER ...
 #       Initial peers (may be repeated).
 #  --cluster.peer-timeout=15s
 #       Time to wait between peers to send notifications.
 #  --cluster.gossip-interval=200ms
 #       Interval between sending gossip messages. By lowering this value (more
 #       frequent) gossip messages are propagated across the cluster more
 #       quickly at the expense of increased bandwidth.
 #  --cluster.pushpull-interval=1m0s
 #       Interval for gossip state syncs. Setting this interval lower (more
 #       frequent) will increase convergence speeds across larger clusters at
 #       the expense of increased bandwidth usage.
 #  --cluster.tcp-timeout=10s
 #       Timeout for establishing a stream connection with a remote node for a
 #       full state sync, and for stream read and write operations.
 #  --cluster.probe-timeout=500ms
 #       Timeout to wait for an ack from a probed node before assuming it is
 #       unhealthy. This should be set to 99-percentile of RTT (round-trip time)
 #       on your network.
 #  --cluster.probe-interval=1s
 #       Interval between random node probes. Setting this lower (more frequent)
 #       will cause the cluster to detect failed nodes more quickly at the
 #       expense of increased bandwidth usage.
 #  --cluster.settle-timeout=1m0s
 #       Maximum time to wait for cluster connections to settle before
 #       evaluating notifications.
 #  --cluster.reconnect-interval=10s
 #       Interval between attempting to reconnect to lost peers.
 #  --cluster.reconnect-timeout=6h0m0s
 #       Length of time to attempt to reconnect to a lost peer.
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@ -3,7 +3,6 @@
  apt:
    name:
      - prometheus
      - prometheus-alertmanager
      - prometheus-pushgateway
    state: latest
    update_cache: true
--- a/roles/prometheus/templates/atrocious_nginx_stub
+++ b/roles/prometheus/templates/atrocious_nginx_stub
@ -8,6 +8,6 @@ server {
    ssl_verify_client on;
    location / {
-        proxy_pass 127.0.0.1:9090;
+        proxy_pass http://127.0.0.1:9090;
    }
 }
--- a/roles/prometheus/templates/prometheus.yml
+++ b/roles/prometheus/templates/prometheus.yml
@ -14,7 +14,7 @@ global:
 alerting:
  alertmanagers:
  - static_configs:
-    - targets: ['localhost:9093']
+    - targets: ['{{ lan_address }}:9093']
 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files: