From c1c995e38d118747be700693b741ac803093d007 Mon Sep 17 00:00:00 2001
From: Alexandre Iooss
Date: Sun, 5 May 2019 14:07:04 +0200
Subject: [PATCH] Prometheus role

---
 monitoring.yml                                | 22 +++++++
 .../prometheus-alertmanager/handlers/main.yml |  5 ++
 roles/prometheus-alertmanager/tasks/main.yml  | 14 +++++
 .../templates/prometheus/alertmanager.yml.j2  | 58 +++++++++++++++++++
 roles/prometheus-node/tasks/main.yml          |  8 +++
 roles/prometheus/handlers/main.yml            |  5 ++
 roles/prometheus/tasks/main.yml               | 26 +++++++++
 .../templates/prometheus/alert.rules.yml.j2   | 25 ++++++++
 .../templates/prometheus/prometheus.yml.j2    | 32 ++++++++++
 9 files changed, 195 insertions(+)
 create mode 100644 monitoring.yml
 create mode 100644 roles/prometheus-alertmanager/handlers/main.yml
 create mode 100644 roles/prometheus-alertmanager/tasks/main.yml
 create mode 100644 roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
 create mode 100644 roles/prometheus-node/tasks/main.yml
 create mode 100644 roles/prometheus/handlers/main.yml
 create mode 100644 roles/prometheus/tasks/main.yml
 create mode 100644 roles/prometheus/templates/prometheus/alert.rules.yml.j2
 create mode 100644 roles/prometheus/templates/prometheus/prometheus.yml.j2

diff --git a/monitoring.yml b/monitoring.yml
new file mode 100644
index 0000000..f148c9c
--- /dev/null
+++ b/monitoring.yml
@@ -0,0 +1,22 @@
+---
+# Deploy Prometheus
+- hosts: prometheus-pacaterie.adm.auro.re
+  vars:
+    # Prometheus targets.json
+    prometheus_targets:
+      - labels:
+          job: node
+        targets:  # TODO {{ ansible_play_batch }}
+          - prometheus-pacaterie.adm.auro.re:9100
+      - labels:
+          job: prometheus
+        targets:
+          - localhost:9090
+  roles:
+    - prometheus
+    - prometheus-alertmanager
+
+# Monitor all hosts
+- hosts: all
+  roles:
+    - prometheus-node
diff --git a/roles/prometheus-alertmanager/handlers/main.yml b/roles/prometheus-alertmanager/handlers/main.yml
new file mode 100644
index 0000000..3ddbf93
--- /dev/null
+++ b/roles/prometheus-alertmanager/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart Prometheus Alertmanager
+  service:
+    name: prometheus-alertmanager
+    state: restarted
diff --git a/roles/prometheus-alertmanager/tasks/main.yml b/roles/prometheus-alertmanager/tasks/main.yml
new file mode 100644
index 0000000..b65a295
--- /dev/null
+++ b/roles/prometheus-alertmanager/tasks/main.yml
@@ -0,0 +1,14 @@
+---
+- name: Install Prometheus Alertmanager
+  apt:
+    update_cache: true
+    name: prometheus-alertmanager
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Configure Prometheus Alertmanager
+  template:
+    src: prometheus/alertmanager.yml.j2
+    dest: /etc/prometheus/alertmanager.yml
+  notify: Restart Prometheus Alertmanager
diff --git a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
new file mode 100644
index 0000000..209e4d1
--- /dev/null
+++ b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
@@ -0,0 +1,58 @@
+# {{ ansible_managed }}
+
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: 'proxy.auro.re:25'
+  smtp_from: 'prometheus@auro.re'
+  #smtp_auth_username: 'alertmanager'
+  #smtp_auth_password: 'password'
+  smtp_require_tls: false
+
+# The directory from which notification templates are read.
+templates:
+- '/etc/prometheus/alertmanager_templates/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  group_by: ['alertname', 'cluster', 'service']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This ensures that multiple alerts for the same group that start
+  # firing shortly after one another are batched together on the first
+  # notification.
+  group_wait: 30s
+
+  # When the first notification was sent, wait 'group_interval' to send a batch
+  # of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If an alert has successfully been sent, wait 'repeat_interval' to
+  # resend it.
+  repeat_interval: 3h
+
+  # A default receiver
+  receiver: team-monitoring-mails
+
+
+# Inhibition rules allow muting a set of alerts given that another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition only if alertname, cluster and service are all equal.
+  equal: ['alertname', 'cluster', 'service']
+
+
+receivers:
+- name: 'team-monitoring-mails'
+  email_configs:
+  - to: 'monitoring.aurore@lists.crans.org'
+
diff --git a/roles/prometheus-node/tasks/main.yml b/roles/prometheus-node/tasks/main.yml
new file mode 100644
index 0000000..15c12b3
--- /dev/null
+++ b/roles/prometheus-node/tasks/main.yml
@@ -0,0 +1,8 @@
+---
+- name: Install Prometheus node-exporter
+  apt:
+    update_cache: true
+    name: prometheus-node-exporter
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml
new file mode 100644
index 0000000..4214def
--- /dev/null
+++ b/roles/prometheus/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Restart Prometheus
+  service:
+    name: prometheus
+    state: restarted
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
new file mode 100644
index 0000000..3f70542
--- /dev/null
+++ b/roles/prometheus/tasks/main.yml
@@ -0,0 +1,26 @@
+---
+- name: Install Prometheus
+  apt:
+    update_cache: true
+    name: prometheus
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Configure Prometheus
+  template:
+    src: prometheus/prometheus.yml.j2
+    dest: /etc/prometheus/prometheus.yml
+  notify: Restart Prometheus
+
+- name: Configure Prometheus alert rules
+  template:
+    src: prometheus/alert.rules.yml.j2
+    dest: /etc/prometheus/alert.rules.yml
+  notify: Restart Prometheus
+
+# We don't need to restart Prometheus when updating nodes
+- name: Configure Prometheus nodes
+  copy:
+    content: "{{ prometheus_targets | to_nice_json }}"
+    dest: /etc/prometheus/targets.json
diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
new file mode 100644
index 0000000..9e603a4
--- /dev/null
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -0,0 +1,25 @@
+# {{ ansible_managed }}
+{# As this is also Jinja2 it will conflict without a raw block #}
+{% raw %}
+groups:
+- name: example
+  rules:
+
+  # Alert for any instance that is unreachable for >5 minutes.
+  - alert: InstanceDown
+    expr: up == 0
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance {{ $labels.instance }} down"
+      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
+
+  # Alert for any instance that has a median request latency >1s.
+  - alert: APIHighRequestLatency
+    expr: api_http_request_latencies_second{quantile="0.5"} > 1
+    for: 10m
+    annotations:
+      summary: "High request latency on {{ $labels.instance }}"
+      description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
+{% endraw %}
diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2
new file mode 100644
index 0000000..76573fa
--- /dev/null
+++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2
@@ -0,0 +1,32 @@
+# {{ ansible_managed }}
+
+global:
+  # scrape_interval is set to the global default (60s)
+  # evaluation_interval is set to the global default (60s)
+  # scrape_timeout is set to the global default (10s).
+
+  # Attach these labels to any time series or alerts when communicating with
+  # external systems (federation, remote storage, Alertmanager).
+  external_labels:
+    monitor: 'example'
+
+# Alertmanager configuration
+alerting:
+  alertmanagers:
+  - static_configs:
+    - targets: ['localhost:9093']
+
+# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
+rule_files:
+  - "alert.rules.yml"
+
+# A single scrape configuration whose targets are read from a file.
+# The file is written by the role from the 'prometheus_targets' variable.
+scrape_configs:
+  - job_name: dummy
+    # This dynamically reloads the list of targets.
+    # You don't need to restart Prometheus when updating targets.json
+    file_sd_configs:
+      - files:
+        - '/etc/prometheus/targets.json'
+