Prometheus role

Alexandre Iooss 2019-05-05 14:07:04 +02:00
parent 091bf9a0d8
commit c1c995e38d
GPG key ID: 6C79278F3FCDCC02 (no known key found for this signature in database)
9 changed files with 195 additions and 0 deletions

monitoring.yml

@@ -0,0 +1,22 @@
---
# Deploy Prometheus
- hosts: prometheus-pacaterie.adm.auro.re
  vars:
    # Prometheus targets.json
    prometheus_targets:
      - labels:
          job: node
        targets:  # TODO {{ ansible_play_batch }}
          - prometheus-pacaterie.adm.auro.re:9100
      - labels:
          job: prometheus
        targets:
          - localhost:9090
  roles:
    - prometheus
    - prometheus-alertmanager

# Monitor all hosts
- hosts: all
  roles:
    - prometheus-node
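
For illustration, one way the TODO above could be resolved (a sketch only, not part of this commit): build the node target list from the hosts of the play via ansible_play_batch instead of hard-coding it, appending the node-exporter port with a Jinja2 filter.

    # Hypothetical alternative to the hard-coded node target above
    # (illustration only, not part of this commit): every host in the
    # current play batch, with the node-exporter port appended.
    prometheus_targets:
      - labels:
          job: node
        targets: "{{ ansible_play_batch | map('regex_replace', '$', ':9100') | list }}"
      - labels:
          job: prometheus
        targets:
          - localhost:9090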


@@ -0,0 +1,5 @@
---
- name: Restart Prometheus Alertmanager
  service:
    name: prometheus-alertmanager
    state: restarted


@@ -0,0 +1,14 @@
---
- name: Install Prometheus Alertmanager
  apt:
    update_cache: true
    name: prometheus-alertmanager
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Configure Prometheus Alertmanager
  template:
    src: prometheus/alertmanager.yml.j2
    dest: /etc/prometheus/alertmanager.yml
  notify: Restart Prometheus Alertmanager


@@ -0,0 +1,58 @@
# {{ ansible_managed }}

global:
  # The smarthost and SMTP sender used for mail notifications.
  smtp_smarthost: 'proxy.auro.re:25'
  smtp_from: 'prometheus@auro.re'
  #smtp_auth_username: 'alertmanager'
  #smtp_auth_password: 'password'
  smtp_require_tls: false

# The directory from which notification templates are read.
templates:
  - '/etc/prometheus/alertmanager_templates/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  group_by: ['alertname', 'cluster', 'service']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' before sending the initial notification. This ensures
  # that alerts for the same group which start firing shortly after one
  # another are batched together in the first notification.
  group_wait: 30s

  # Once the first notification has been sent, wait 'group_interval' before
  # sending a batch of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has been sent successfully, wait 'repeat_interval' before
  # resending it.
  repeat_interval: 3h

  # A default receiver
  receiver: team-monitoring-mails

# Inhibition rules allow muting a set of alerts while another alert is
# firing. We use this to mute any warning-level notification if the same
# alert is already critical.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    # Apply inhibition only if the alertname (and cluster/service) is the same.
    equal: ['alertname', 'cluster', 'service']

receivers:
  - name: 'team-monitoring-mails'
    email_configs:
      - to: 'monitoring.aurore@lists.crans.org'
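
The inhibition rule above only matches alerts labelled severity: 'critical' and severity: 'warning'. As a purely hypothetical illustration (not part of this commit, and the metric names assume a recent node-exporter), a pair of rules like the following in the alert rules file would exercise it: once the critical alert fires, the warning alert carrying the same alertname/cluster/service labels is muted.

      # Hypothetical rules, for illustration only: same alertname,
      # different severity thresholds.
      - alert: DiskAlmostFull
        expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.10
        labels:
          severity: warning
      - alert: DiskAlmostFull
        expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.05
        labels:
          severity: critical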


@@ -0,0 +1,8 @@
---
- name: Install Prometheus node-exporter
  apt:
    update_cache: true
    name: prometheus-node-exporter
  register: apt_result
  retries: 3
  until: apt_result is succeeded


@@ -0,0 +1,5 @@
---
- name: Restart Prometheus
  service:
    name: prometheus
    state: restarted


@@ -0,0 +1,26 @@
---
- name: Install Prometheus
  apt:
    update_cache: true
    name: prometheus
  register: apt_result
  retries: 3
  until: apt_result is succeeded

- name: Configure Prometheus
  template:
    src: prometheus/prometheus.yml.j2
    dest: /etc/prometheus/prometheus.yml
  notify: Restart Prometheus

- name: Configure Prometheus alert rules
  template:
    src: prometheus/alert.rules.yml.j2
    dest: /etc/prometheus/alert.rules.yml
  notify: Restart Prometheus

# We don't need to restart Prometheus when updating nodes
- name: Configure Prometheus nodes
  copy:
    content: "{{ prometheus_targets | to_nice_json }}"
    dest: /etc/prometheus/targets.json
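
For reference, with the prometheus_targets value defined in monitoring.yml, the copy task above renders roughly the following /etc/prometheus/targets.json (to_nice_json output, shown here for illustration only). Prometheus re-reads this file on its own through file_sd_configs (see prometheus.yml.j2 below), which is why this is the only task without a restart handler.

    [
        {
            "labels": {
                "job": "node"
            },
            "targets": [
                "prometheus-pacaterie.adm.auro.re:9100"
            ]
        },
        {
            "labels": {
                "job": "prometheus"
            },
            "targets": [
                "localhost:9090"
            ]
        }
    ]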


@@ -0,0 +1,25 @@
# {{ ansible_managed }}
{# This file is itself a Jinja2 template, so the Prometheus alert templating below would conflict without a raw block #}
{% raw %}
groups:
  - name: example
    rules:
      # Alert for any instance that is unreachable for >5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Alert for any instance that has a median request latency >1s.
      - alert: APIHighRequestLatency
        expr: api_http_request_latencies_second{quantile="0.5"} > 1
        for: 10m
        annotations:
          summary: "High request latency on {{ $labels.instance }}"
          description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
{% endraw %}


@@ -0,0 +1,32 @@
# {{ ansible_managed }}

global:
  # scrape_interval is set to the global default (60s)
  # evaluation_interval is set to the global default (60s)
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'example'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Load rules once and periodically evaluate them according to the global
# 'evaluation_interval'.
rule_files:
  - "alert.rules.yml"

# A scrape configuration. Targets are loaded dynamically from targets.json
# (see file_sd_configs below).
scrape_configs:
  - job_name: dummy
    # This dynamically reloads the list of targets:
    # you don't need to restart Prometheus when updating targets.json.
    file_sd_configs:
      - files:
          - '/etc/prometheus/targets.json'
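
The job_name above is only a placeholder: for targets discovered through file_sd_configs, the job label set in targets.json takes precedence, so the scraped series should end up labelled with the real jobs. A sketch of the expected result, assuming both targets from targets.json are reachable:

    up{instance="prometheus-pacaterie.adm.auro.re:9100", job="node"} 1
    up{instance="localhost:9090", job="prometheus"} 1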