Prometheus role
This commit is contained in:
parent
091bf9a0d8
commit
c1c995e38d
9 changed files with 195 additions and 0 deletions
22
monitoring.yml
Normal file
22
monitoring.yml
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
---
# Deploy the Prometheus server and Alertmanager on the monitoring host.
- name: Deploy Prometheus
  hosts: prometheus-pacaterie.adm.auro.re
  vars:
    # Target groups consumed by the prometheus role, which serialises this
    # list as JSON into /etc/prometheus/targets.json.
    prometheus_targets:
      - labels:
          job: node
        # TODO: derive this list from {{ ansible_play_batch }} instead of
        # hard-coding it.
        targets:
          - 'prometheus-pacaterie.adm.auro.re:9100'
      - labels:
          job: prometheus
        targets:
          - 'localhost:9090'
  roles:
    - prometheus
    - prometheus-alertmanager

# Install the node exporter on every host of the inventory.
- name: Monitor all hosts
  hosts: all
  roles:
    - prometheus-node
|
5
roles/prometheus-alertmanager/handlers/main.yml
Normal file
5
roles/prometheus-alertmanager/handlers/main.yml
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
---
# Handler triggered by tasks that notify "Restart Prometheus Alertmanager".
- name: Restart Prometheus Alertmanager
  service:
    state: restarted
    name: prometheus-alertmanager
|
14
roles/prometheus-alertmanager/tasks/main.yml
Normal file
14
roles/prometheus-alertmanager/tasks/main.yml
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
---
# Install the Alertmanager package; the apt call is retried up to 3 times
# until it reports success.
- name: Install Prometheus Alertmanager
  apt:
    update_cache: true
    name: prometheus-alertmanager
  register: apt_result
  retries: 3
  until: apt_result is succeeded

# Render the Alertmanager configuration and restart the service on change.
- name: Configure Prometheus Alertmanager
  template:
    src: prometheus/alertmanager.yml.j2
    dest: /etc/prometheus/alertmanager.yml
    # Explicit mode so the result does not depend on the remote umask.
    # NOTE(review): assumes the service only needs world-readable access and
    # the file holds no secrets (SMTP auth is commented out) - confirm.
    mode: "0644"
  notify: Restart Prometheus Alertmanager
|
|
@ -0,0 +1,58 @@
|
||||||
|
# {{ ansible_managed }}

global:
  # Mail relay and sender address used for e-mail notifications.
  smtp_smarthost: 'proxy.auro.re:25'
  smtp_from: 'prometheus@auro.re'
  #smtp_auth_username: 'alertmanager'
  #smtp_auth_password: 'password'
  smtp_require_tls: false

# Files from which notification templates are loaded.
templates:
  - '/etc/prometheus/alertmanager_templates/*.tmpl'

# Root route: every incoming alert enters the routing tree here.
route:
  # Alerts sharing the same values for these labels are grouped together.
  # For example, several alerts with cluster=A and alertname=LatencyHigh
  # end up in a single group.
  group_by:
    - 'alertname'
    - 'cluster'
    - 'service'

  # After an incoming alert creates a new group, wait at least this long
  # before sending the first notification, so that alerts of the same group
  # that start firing shortly afterwards are batched into it.
  group_wait: 30s

  # Once the first notification has been sent, wait this long before sending
  # a batch of new alerts that started firing for that group.
  group_interval: 5m

  # After a notification has been sent successfully, wait this long before
  # sending it again.
  repeat_interval: 3h

  # Default receiver.
  receiver: team-monitoring-mails


# Inhibition rules mute a set of alerts while another alert is firing.
# Here, warning-level notifications are muted when the same alert is already
# firing at critical level.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    # Only inhibit when alertname, cluster and service are all identical.
    equal:
      - 'alertname'
      - 'cluster'
      - 'service'


receivers:
  - name: 'team-monitoring-mails'
    email_configs:
      - to: 'monitoring.aurore@lists.crans.org'
|
||||||
|
|
8
roles/prometheus-node/tasks/main.yml
Normal file
8
roles/prometheus-node/tasks/main.yml
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
---
# Install the node exporter package; the apt call is retried up to 3 times
# until it reports success.
- name: Install Prometheus node-exporter
  apt:
    name: prometheus-node-exporter
    update_cache: true
  register: apt_result
  until: apt_result is succeeded
  retries: 3
|
5
roles/prometheus/handlers/main.yml
Normal file
5
roles/prometheus/handlers/main.yml
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
---
# Handler triggered by tasks that notify "Restart Prometheus".
- name: Restart Prometheus
  service:
    state: restarted
    name: prometheus
|
26
roles/prometheus/tasks/main.yml
Normal file
26
roles/prometheus/tasks/main.yml
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
---
# Install the Prometheus package; the apt call is retried up to 3 times
# until it reports success.
- name: Install Prometheus
  apt:
    update_cache: true
    name: prometheus
  register: apt_result
  retries: 3
  until: apt_result is succeeded

# Render the main configuration; a change triggers a service restart.
- name: Configure Prometheus
  template:
    src: prometheus/prometheus.yml.j2
    dest: /etc/prometheus/prometheus.yml
    # Explicit mode so the result does not depend on the remote umask.
    mode: "0644"
  notify: Restart Prometheus

# Render the alerting rules; a change triggers a service restart.
- name: Configure Prometheus alert rules
  template:
    src: prometheus/alert.rules.yml.j2
    dest: /etc/prometheus/alert.rules.yml
    mode: "0644"
  notify: Restart Prometheus

# No restart is notified here: per the comments in prometheus.yml.j2, the
# target list in targets.json is reloaded dynamically.
- name: Configure Prometheus nodes
  copy:
    content: "{{ prometheus_targets | to_nice_json }}"
    dest: /etc/prometheus/targets.json
    mode: "0644"
|
25
roles/prometheus/templates/prometheus/alert.rules.yml.j2
Normal file
25
roles/prometheus/templates/prometheus/alert.rules.yml.j2
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
# {{ ansible_managed }}
{# The rule annotations use the same delimiters as Jinja2, hence the raw block #}
{% raw %}
groups:
  - name: example
    rules:

      # Fires when an instance has been unreachable for more than 5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: 'Instance {{ $labels.instance }} down'
          description: >-
            {{ $labels.instance }} of job {{ $labels.job }} has been down
            for more than 5 minutes.

      # Fires when an instance's median request latency stays above 1s
      # for 10 minutes.
      - alert: APIHighRequestLatency
        expr: api_http_request_latencies_second{quantile="0.5"} > 1
        for: 10m
        annotations:
          summary: 'High request latency on {{ $labels.instance }}'
          description: >-
            {{ $labels.instance }} has a median request latency above 1s
            (current value: {{ $value }}s)
{% endraw %}
|
32
roles/prometheus/templates/prometheus/prometheus.yml.j2
Normal file
32
roles/prometheus/templates/prometheus/prometheus.yml.j2
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
# {{ ansible_managed }}

global:
  # scrape_interval, evaluation_interval and scrape_timeout are not set, so
  # the global defaults apply (60s, 60s and 10s respectively).

  # Labels attached to every time series or alert when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'example'

# Alertmanager instance that receives the alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'localhost:9093'

# Rules are loaded once and then evaluated periodically according to the
# global 'evaluation_interval'.
rule_files:
  - "alert.rules.yml"

# Single scrape job whose targets are read from a file.
scrape_configs:
  - job_name: dummy
    # The list of targets is reloaded dynamically, so Prometheus does not
    # need a restart when targets.json is updated.
    file_sd_configs:
      - files:
          - '/etc/prometheus/targets.json'
|
||||||
|
|
Loading…
Reference in a new issue