Browse Source

add first config for alertmanager

monitoring
histausse 1 year ago
parent
commit
dd550700d0
Signed by: histausse
GPG Key ID: 67486F107F62E9E9
  1. 1
      books/monitoring.yml
  2. 5
      roles/prometheus-alert-manager/handlers/main.yml
  3. 41
      roles/prometheus-alert-manager/tasks/main.yml
  4. 75
      roles/prometheus-alert-manager/templates/prometheus-alertmanager
  5. 1
      roles/prometheus/tasks/main.yml
  6. 2
      roles/prometheus/templates/atrocious_nginx_stub
  7. 2
      roles/prometheus/templates/prometheus.yml

1
books/monitoring.yml

@ -3,6 +3,7 @@
- hosts: prometheus_servers
roles:
- prometheus
- prometheus-alert-manager
- grafana
- hosts: all, !tests,

5
roles/prometheus-alert-manager/handlers/main.yml

@ -0,0 +1,5 @@
---
# Handlers for the prometheus-alert-manager role.

# Restarts the Alertmanager systemd unit. Runs only when a task in the play
# notifies "Restart Alertmanager".
- name: Restart Alertmanager
  systemd:
    state: restarted
    name: prometheus-alertmanager.service

41
roles/prometheus-alert-manager/tasks/main.yml

@ -0,0 +1,41 @@
---
# Tasks for the prometheus-alert-manager role: install the apt package and
# render the file holding the daemon's command-line arguments.

# Install the newest available prometheus-alertmanager package, refreshing the
# apt cache first. The result is registered so the task can be retried
# (3 retries) until apt reports success.
- name: Install Prometheus Alert Manager
  apt:
    update_cache: true
    state: latest
    name:
      - prometheus-alertmanager
  register: apt_result
  until: apt_result is succeeded
  retries: 3

# Render /etc/default/prometheus-alertmanager from the role template. Every
# entry of `args` is turned into a `--<name>=<value>` flag by the template;
# a change to the rendered file notifies the restart handler.
# NOTE(review): `args` is also the name of an Ansible task keyword, which may
# trigger a reserved-name warning; renaming it would require the same change
# in the template, so confirm before touching it.
- name: Setup the arguments for alertmanager
  vars:
    args:
      - name: web.listen-address
        value: "127.0.0.1:9093"
  template:
    src: prometheus-alertmanager
    dest: /etc/default/prometheus-alertmanager
    owner: root
    group: root
    mode: '0644'
  notify: Restart Alertmanager

# Disabled draft: TLS material for Alertmanager. The `?` placeholders
# (target directory, owner, group) have not been decided yet.
#
# - name: Copy the CA cert
#   copy:
#     content: "{{ ca_cert }}"
#     dest: /etc/?/ca.crt
#   notify: Restart Alertmanager
#
# - name: Generate certificate
#   include_role:
#     name: generate-cert
#   vars:
#     directory: /etc/?/
#     cname: "alertmanager-{{ lan_address }}"
#     owner: ?
#     group: ?
#     key_mode: u=rw,g=,o=
#     subject_alt_name: "IP:{{ lan_address }}"
#   # Need an equivalent to notify here

75
roles/prometheus-alert-manager/templates/prometheus-alertmanager

@ -0,0 +1,75 @@
{#-
  Renders the ARGS variable holding the command-line arguments passed to the
  Alertmanager server.

  Input: `args`, expected to be a list of mappings with `name` and `value`
  keys. Each entry is emitted as `--<name>=<value>` followed by a backslash
  line continuation inside the double-quoted ARGS string. When `args` is
  empty (or otherwise falsy) ARGS is set to the empty string instead.
-#}
{{ ansible_managed | comment }}
# Set the command-line arguments to pass to the server.
{% if not args %}
ARGS=""
{% else %}
ARGS="\
{% for arg in args %}
--{{ arg.name }}={{ arg.value }} \
{% endfor %}
"
{% endif %}
# The alert manager supports the following options:
# --config.file="/etc/prometheus/alertmanager.yml"
# Alertmanager configuration file name.
# --storage.path="/var/lib/prometheus/alertmanager/"
# Base path for data storage.
# --data.retention=120h
# How long to keep data for.
# --alerts.gc-interval=30m
# Interval between alert GC.
# --log.level=info
# Only log messages with the given severity or above.
# --web.external-url=WEB.EXTERNAL-URL
# The URL under which Alertmanager is externally reachable (for example,
# if Alertmanager is served via a reverse proxy). Used for generating
# relative and absolute links back to Alertmanager itself. If the URL has
# a path portion, it will be used to prefix all HTTP endpoints served by
# Alertmanager. If omitted, relevant URL components will be derived
# automatically.
# --web.route-prefix=WEB.ROUTE-PREFIX
# Prefix for the internal routes of web endpoints. Defaults to path of
# --web.external-url.
# --web.listen-address=":9093"
# Address to listen on for the web interface and API.
# --web.ui-path="/usr/share/prometheus/alertmanager/ui/"
# Path to static UI directory.
# --template.default="/usr/share/prometheus/alertmanager/default.tmpl"
# Path to default notification template.
# --cluster.listen-address="0.0.0.0:9094"
# Listen address for cluster.
# --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS
# Explicit address to advertise in cluster.
# --cluster.peer=CLUSTER.PEER ...
# Initial peers (may be repeated).
# --cluster.peer-timeout=15s
# Time to wait between peers to send notifications.
# --cluster.gossip-interval=200ms
# Interval between sending gossip messages. By lowering this value (more
# frequent) gossip messages are propagated across the cluster more
# quickly at the expense of increased bandwidth.
# --cluster.pushpull-interval=1m0s
# Interval for gossip state syncs. Setting this interval lower (more
# frequent) will increase convergence speeds across larger clusters at
# the expense of increased bandwidth usage.
# --cluster.tcp-timeout=10s Timeout for establishing a stream connection
# with a remote node for a full state sync, and for stream read and write
# operations.
# --cluster.probe-timeout=500ms
# Timeout to wait for an ack from a probed node before assuming it is
# unhealthy. This should be set to 99-percentile of RTT (round-trip time)
# on your network.
# --cluster.probe-interval=1s
# Interval between random node probes. Setting this lower (more frequent)
# will cause the cluster to detect failed nodes more quickly at the
# expense of increased bandwidth usage.
# --cluster.settle-timeout=1m0s
# Maximum time to wait for cluster connections to settle before
# evaluating notifications.
# --cluster.reconnect-interval=10s
# Interval between attempting to reconnect to lost peers.
# --cluster.reconnect-timeout=6h0m0s
# Length of time to attempt to reconnect to a lost peer.

1
roles/prometheus/tasks/main.yml

@ -3,7 +3,6 @@
apt:
name:
- prometheus
- prometheus-alertmanager
- prometheus-pushgateway
state: latest
update_cache: true

2
roles/prometheus/templates/atrocious_nginx_stub

@ -8,6 +8,6 @@ server {
ssl_verify_client on;
location / {
proxy_pass 127.0.0.1:9090;
proxy_pass http://127.0.0.1:9090;
}
}

2
roles/prometheus/templates/prometheus.yml

@ -14,7 +14,7 @@ global:
alerting:
alertmanagers:
- static_configs:
- targets: ['localhost:9093']
- targets: ['{{ lan_address }}:9093']
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:

Loading…
Cancel
Save