From c1c995e38d118747be700693b741ac803093d007 Mon Sep 17 00:00:00 2001
From: Alexandre Iooss <erdnaxe@crans.org>
Date: Sun, 5 May 2019 14:07:04 +0200
Subject: [PATCH] Prometheus role

---
 monitoring.yml                                | 22 +++++++
 .../prometheus-alertmanager/handlers/main.yml |  5 ++
 roles/prometheus-alertmanager/tasks/main.yml  | 14 +++++
 .../templates/prometheus/alertmanager.yml.j2  | 58 +++++++++++++++++++
 roles/prometheus-node/tasks/main.yml          |  8 +++
 roles/prometheus/handlers/main.yml            |  5 ++
 roles/prometheus/tasks/main.yml               | 26 +++++++++
 .../templates/prometheus/alert.rules.yml.j2   | 25 ++++++++
 .../templates/prometheus/prometheus.yml.j2    | 32 ++++++++++
 9 files changed, 195 insertions(+)
 create mode 100644 monitoring.yml
 create mode 100644 roles/prometheus-alertmanager/handlers/main.yml
 create mode 100644 roles/prometheus-alertmanager/tasks/main.yml
 create mode 100644 roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
 create mode 100644 roles/prometheus-node/tasks/main.yml
 create mode 100644 roles/prometheus/handlers/main.yml
 create mode 100644 roles/prometheus/tasks/main.yml
 create mode 100644 roles/prometheus/templates/prometheus/alert.rules.yml.j2
 create mode 100644 roles/prometheus/templates/prometheus/prometheus.yml.j2

diff --git a/monitoring.yml b/monitoring.yml
new file mode 100644
index 0000000..f148c9c
--- /dev/null
+++ b/monitoring.yml
@@ -0,0 +1,22 @@
+---
+# Deploy the Prometheus server and its Alertmanager on the monitoring host
+- hosts: prometheus-pacaterie.adm.auro.re
+  vars:
+    # Scrape targets; the prometheus role dumps this list as JSON to
+    prometheus_targets:  # /etc/prometheus/targets.json
+      - labels:
+          job: node
+        targets:  # TODO {{ ansible_play_batch }}
+          - prometheus-pacaterie.adm.auro.re:9100
+      - labels:
+          job: prometheus
+        targets:
+          - localhost:9090
+  roles:
+    - prometheus
+    - prometheus-alertmanager
+
+# Monitor all hosts: install the node exporter on every inventory host
+- hosts: all
+  roles:
+    - prometheus-node
diff --git a/roles/prometheus-alertmanager/handlers/main.yml b/roles/prometheus-alertmanager/handlers/main.yml
new file mode 100644
index 0000000..3ddbf93
--- /dev/null
+++ b/roles/prometheus-alertmanager/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart Prometheus Alertmanager
+  service:
+    name: prometheus-alertmanager
+    state: restarted  # notified by "Configure Prometheus Alertmanager"
diff --git a/roles/prometheus-alertmanager/tasks/main.yml b/roles/prometheus-alertmanager/tasks/main.yml
new file mode 100644
index 0000000..b65a295
--- /dev/null
+++ b/roles/prometheus-alertmanager/tasks/main.yml
@@ -0,0 +1,14 @@
+---
+- name: Install Prometheus Alertmanager
+  apt:
+    name: prometheus-alertmanager
+    update_cache: true  # refresh the package index before installing
+  register: apt_result
+  until: apt_result is succeeded
+  retries: 3  # re-run the install until the `until` condition holds
+
+- name: Configure Prometheus Alertmanager
+  template:
+    src: prometheus/alertmanager.yml.j2
+    dest: /etc/prometheus/alertmanager.yml
+  notify: Restart Prometheus Alertmanager  # handler fires only on change
diff --git a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
new file mode 100644
index 0000000..209e4d1
--- /dev/null
+++ b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
@@ -0,0 +1,58 @@
+# {{ ansible_managed }}
+
+global:
+  # SMTP relay (smarthost) and sender address used for e-mail notifications.
+  smtp_smarthost: 'proxy.auro.re:25'
+  smtp_from: 'prometheus@auro.re'
+  #smtp_auth_username: 'alertmanager'
+  #smtp_auth_password: 'password'
+  smtp_require_tls: false  # the relay is used without mandatory TLS
+
+# Glob of the notification template files that Alertmanager loads.
+templates:
+- '/etc/prometheus/alertmanager_templates/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['alertname', 'cluster', 'service']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This ensures that multiple alerts for the same group that start firing
+  # shortly after one another are batched together in the first
+  # notification.
+  group_wait: 30s
+
+  # When the first notification was sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If a notification has successfully been sent, wait 'repeat_interval'
+  # before resending it.
+  repeat_interval: 3h
+
+  # Default receiver; its name must exist in the 'receivers' list.
+  receiver: team-monitoring-mails
+
+
+# Inhibition rules allow muting a set of alerts while another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition only if alertname, cluster and service are all equal.
+  equal: ['alertname', 'cluster', 'service']
+
+
+receivers:
+- name: 'team-monitoring-mails'  # referenced by route.receiver
+  email_configs:
+  - to: 'monitoring.aurore@lists.crans.org'
+
diff --git a/roles/prometheus-node/tasks/main.yml b/roles/prometheus-node/tasks/main.yml
new file mode 100644
index 0000000..15c12b3
--- /dev/null
+++ b/roles/prometheus-node/tasks/main.yml
@@ -0,0 +1,8 @@
+---
+- name: Install Prometheus node-exporter
+  apt:
+    name: prometheus-node-exporter
+    update_cache: true  # refresh the package index before installing
+  register: apt_result
+  until: apt_result is succeeded
+  retries: 3  # re-run the install until the `until` condition holds
diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml
new file mode 100644
index 0000000..4214def
--- /dev/null
+++ b/roles/prometheus/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart Prometheus
+  service:
+    name: prometheus
+    state: restarted  # notified by the config and alert-rules template tasks
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
new file mode 100644
index 0000000..3f70542
--- /dev/null
+++ b/roles/prometheus/tasks/main.yml
@@ -0,0 +1,26 @@
+---
+- name: Install Prometheus
+  apt:
+    name: prometheus
+    update_cache: true  # refresh the package index before installing
+  register: apt_result
+  until: apt_result is succeeded
+  retries: 3  # re-run the install until the `until` condition holds
+
+- name: Configure Prometheus
+  template:
+    src: prometheus/prometheus.yml.j2
+    dest: /etc/prometheus/prometheus.yml
+  notify: Restart Prometheus  # handler fires only when the file changed
+
+- name: Configure Prometheus alert rules
+  template:
+    src: prometheus/alert.rules.yml.j2
+    dest: /etc/prometheus/alert.rules.yml  # listed in prometheus.yml rule_files
+  notify: Restart Prometheus  # handler fires only when the file changed
+
+# No restart needed: prometheus.yml reads this file through file_sd_configs
+- name: Configure Prometheus nodes
+  copy:
+    content: "{{ prometheus_targets | to_nice_json }}"
+    dest: /etc/prometheus/targets.json
diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
new file mode 100644
index 0000000..9e603a4
--- /dev/null
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -0,0 +1,25 @@
+# {{ ansible_managed }}
+{# The rules below use double-brace templating too; keep them in a raw block so Jinja2 leaves them untouched #}
+{% raw %}
+groups:
+- name: example
+  rules:
+
+  # Alert for any instance that is unreachable for >5 minutes.
+  - alert: InstanceDown
+    expr: up == 0
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance {{ $labels.instance }} down"
+      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
+
+  # Alert for any instance whose median request latency stays >1s for 10 minutes.
+  - alert: APIHighRequestLatency
+    expr: api_http_request_latencies_second{quantile="0.5"} > 1
+    for: 10m
+    annotations:
+      summary: "High request latency on {{ $labels.instance }}"
+      description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
+{% endraw %}
diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2
new file mode 100644
index 0000000..76573fa
--- /dev/null
+++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2
@@ -0,0 +1,32 @@
+# {{ ansible_managed }}
+
+global:
+  # scrape_interval is left at the global default (60s).
+  # evaluation_interval is left at the global default (60s).
+  # scrape_timeout is left at the global default (10s).
+
+  # Attach these labels to any time series or alerts when communicating with
+  # external systems (federation, remote storage, Alertmanager).
+  external_labels:
+    monitor: 'example'
+
+# Send alerts to the Alertmanager listening on localhost:9093
+alerting:
+  alertmanagers:
+  - static_configs:
+    - targets: ['localhost:9093']
+
+# Rules are loaded once, then evaluated every global 'evaluation_interval'.
+rule_files:
+  - "alert.rules.yml"
+
+# A single scrape configuration whose targets are not listed here but read
+# from a file-based service discovery file.
+scrape_configs:
+  - job_name: dummy
+    # The list of targets is reloaded dynamically from this file.
+    # You don't need to restart Prometheus when updating targets.json
+    file_sd_configs:
+      - files:
+        - '/etc/prometheus/targets.json'
+