Prometheus role

Alexandre Iooss 2019-05-05 14:07:04 +02:00
parent 091bf9a0d8
commit c1c995e38d
GPG key ID: 6C79278F3FCDCC02 (no known key found for this signature in database)
9 changed files with 195 additions and 0 deletions

monitoring.yml

@@ -0,0 +1,22 @@
---
# Deploy Prometheus
- hosts: prometheus-pacaterie.adm.auro.re
  vars:
    # Prometheus targets.json
    prometheus_targets:
      - labels:
          job: node
        targets:  # TODO {{ ansible_play_batch }}
          - prometheus-pacaterie.adm.auro.re:9100
      - labels:
          job: prometheus
        targets:
          - localhost:9090
  roles:
    - prometheus
    - prometheus-alertmanager

# Monitor all hosts
- hosts: all
  roles:
    - prometheus-node
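
For illustration, one way the TODO above could be resolved (a sketch only, not part of this commit): build the node target list from the hosts of the play via ansible_play_batch instead of hard-coding it, appending the node-exporter port with a Jinja2 filter.

    # Hypothetical alternative to the hard-coded node target above
    # (illustration only, not part of this commit): every host in the
    # current play batch, with the node-exporter port appended.
    prometheus_targets:
      - labels:
          job: node
        targets: "{{ ansible_play_batch | map('regex_replace', '$', ':9100') | list }}"
      - labels:
          job: prometheus
        targets:
          - localhost:9090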


@@ -0,0 +1,5 @@
---
- name: Restart Prometheus Alertmanager
  service:
    name: prometheus-alertmanager
    state: restarted


@@ -0,0 +1,14 @@
---
- name: Install Prometheus Alertmanager
  apt:
    update_cache: true
    name: prometheus-alertmanager
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Configure Prometheus Alertmanager
  template:
    src: prometheus/alertmanager.yml.j2
    dest: /etc/prometheus/alertmanager.yml
  notify: Restart Prometheus Alertmanager


@@ -0,0 +1,58 @@
# {{ ansible_managed }}

global:
  # The smarthost and SMTP sender used for mail notifications.
  smtp_smarthost: 'proxy.auro.re:25'
  smtp_from: 'prometheus@auro.re'
  #smtp_auth_username: 'alertmanager'
  #smtp_auth_password: 'password'
  smtp_require_tls: false

# The directory from which notification templates are read.
templates:
  - '/etc/prometheus/alertmanager_templates/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  group_by: ['alertname', 'cluster', 'service']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' before sending the initial notification. This ensures
  # that alerts for the same group which start firing shortly after one
  # another are batched together in the first notification.
  group_wait: 30s

  # Once the first notification has been sent, wait 'group_interval' before
  # sending a batch of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has been sent successfully, wait 'repeat_interval' before
  # resending it.
  repeat_interval: 3h

  # A default receiver
  receiver: team-monitoring-mails

# Inhibition rules allow muting a set of alerts while another alert is
# firing. We use this to mute any warning-level notification if the same
# alert is already critical.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    # Apply inhibition only if the alertname (and cluster/service) is the same.
    equal: ['alertname', 'cluster', 'service']

receivers:
  - name: 'team-monitoring-mails'
    email_configs:
      - to: 'monitoring.aurore@lists.crans.org'
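
The inhibition rule above only matches alerts labelled severity: 'critical' and severity: 'warning'. As a purely hypothetical illustration (not part of this commit, and the metric names assume a recent node-exporter), a pair of rules like the following in the alert rules file would exercise it: once the critical alert fires, the warning alert carrying the same alertname/cluster/service labels is muted.

      # Hypothetical rules, for illustration only: same alertname,
      # different severity thresholds.
      - alert: DiskAlmostFull
        expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.10
        labels:
          severity: warning
      - alert: DiskAlmostFull
        expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.05
        labels:
          severity: critical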


@@ -0,0 +1,8 @@
---
- name: Install Prometheus node-exporter
  apt:
    update_cache: true
    name: prometheus-node-exporter
  register: apt_result
  retries: 3
  until: apt_result is succeeded


@@ -0,0 +1,5 @@
---
- name: Restart Prometheus
  service:
    name: prometheus
    state: restarted


@@ -0,0 +1,26 @@
---
- name: Install Prometheus
  apt:
    update_cache: true
    name: prometheus
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Configure Prometheus
  template:
    src: prometheus/prometheus.yml.j2
    dest: /etc/prometheus/prometheus.yml
  notify: Restart Prometheus

- name: Configure Prometheus alert rules
  template:
    src: prometheus/alert.rules.yml.j2
    dest: /etc/prometheus/alert.rules.yml
  notify: Restart Prometheus

# We don't need to restart Prometheus when updating nodes
- name: Configure Prometheus nodes
  copy:
    content: "{{ prometheus_targets | to_nice_json }}"
    dest: /etc/prometheus/targets.json
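
For reference, with the prometheus_targets value defined in monitoring.yml, the copy task above renders roughly the following /etc/prometheus/targets.json (to_nice_json output, shown here for illustration only). Prometheus re-reads this file on its own through file_sd_configs (see prometheus.yml.j2 below), which is why this is the only task without a restart handler.

    [
        {
            "labels": {
                "job": "node"
            },
            "targets": [
                "prometheus-pacaterie.adm.auro.re:9100"
            ]
        },
        {
            "labels": {
                "job": "prometheus"
            },
            "targets": [
                "localhost:9090"
            ]
        }
    ]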


@@ -0,0 +1,25 @@
# {{ ansible_managed }}
{# This file is itself a Jinja2 template, so the Prometheus alert templating below would conflict without a raw block #}
{% raw %}
groups:
  - name: example
    rules:
      # Alert for any instance that is unreachable for >5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Alert for any instance that has a median request latency >1s.
      - alert: APIHighRequestLatency
        expr: api_http_request_latencies_second{quantile="0.5"} > 1
        for: 10m
        annotations:
          summary: "High request latency on {{ $labels.instance }}"
          description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
{% endraw %}


@@ -0,0 +1,32 @@
# {{ ansible_managed }}

global:
  # scrape_interval is set to the global default (60s)
  # evaluation_interval is set to the global default (60s)
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'example'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Load rules once and periodically evaluate them according to the global
# 'evaluation_interval'.
rule_files:
  - "alert.rules.yml"

# A scrape configuration. Targets are loaded dynamically from targets.json
# (see file_sd_configs below).
scrape_configs:
  - job_name: dummy
    # This dynamically reloads the list of targets:
    # you don't need to restart Prometheus when updating targets.json.
    file_sd_configs:
      - files:
          - '/etc/prometheus/targets.json'
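
The job_name above is only a placeholder: for targets discovered through file_sd_configs, the job label set in targets.json takes precedence, so the scraped series should end up labelled with the real jobs. A sketch of the expected result, assuming both targets from targets.json are reachable:

    up{instance="prometheus-pacaterie.adm.auro.re:9100", job="node"} 1
    up{instance="localhost:9090", job="prometheus"} 1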