From 694501dfa3b712c4d5f6edfe0d0386bdfc2150de Mon Sep 17 00:00:00 2001
From: Alexandre Iooss
Date: Mon, 22 Jul 2019 20:56:43 +0200
Subject: [PATCH] Merge crans monitoring

---
 monitoring.yml                                |  17 +--
 .../templates/prometheus/alertmanager.yml.j2  |   9 +-
 roles/prometheus-node/tasks/main.yml          |  10 +-
 roles/prometheus/handlers/main.yml            |   5 +
 roles/prometheus/tasks/main.yml               |  30 ++++-
 .../templates/prometheus/alert.rules.yml.j2   |  59 ++++++++--
 .../templates/prometheus/django.rules.yml.j2  | 106 ++++++++++++++++++
 .../templates/prometheus/prometheus.yml.j2    |   2 +-
 .../templates/update-motd.d/05-service.j2     |   4 +
 9 files changed, 208 insertions(+), 34 deletions(-)
 create mode 100644 roles/prometheus/templates/prometheus/django.rules.yml.j2
 create mode 100755 roles/prometheus/templates/update-motd.d/05-service.j2

diff --git a/monitoring.yml b/monitoring.yml
index 2010d53..049d418 100644
--- a/monitoring.yml
+++ b/monitoring.yml
@@ -4,19 +4,10 @@
   vars:
     # Prometheus targets.json
     prometheus_targets:
-      - labels:
-          job: node
-        targets:
-          - mordred.adm.auro.re:9100
-          - ldap-replica-pacaterie.adm.auro.re:9100
-          - dhcp-pacaterie.adm.auro.re:9100
-          - dns-pacaterie.adm.auro.re:9100
-          - prometheus-pacaterie.adm.auro.re:9100
-          - radius-pacaterie.adm.auro.re:9100
-      - labels:
-          job: prometheus
-        targets:
-          - localhost:9090
+      - labels: {job: node}
+        targets: "{{ groups['pacaterie'] | map('replace', '.org', '.org:9100') | list | sort }}"
+      - labels: {job: prometheus}
+        targets: ['localhost:9090']
   roles:
     - prometheus
     - prometheus-alertmanager
diff --git a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2 b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
index 209e4d1..85a31c0 100644
--- a/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
+++ b/roles/prometheus-alertmanager/templates/prometheus/alertmanager.yml.j2
@@ -17,22 +17,23 @@ route:
   # The labels by which incoming alerts are grouped together. For example,
   # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
   # be batched into a single group.
-  group_by: ['alertname', 'cluster', 'service']
+  #group_by: ['alertname', 'cluster', 'service']
+  group_by: [] # do not group for text chat
 
   # When a new group of alerts is created by an incoming alert, wait at
   # least 'group_wait' to send the initial notification.
   # This way ensures that you get multiple alerts for the same group that start
   # firing shortly after another are batched together on the first
   # notification.
-  group_wait: 30s
+  group_wait: 1m
 
   # When the first notification was sent, wait 'group_interval' to send a batch
   # of new alerts that started firing for that group.
-  group_interval: 5m
+  group_interval: 1m
 
   # If an alert has successfully been sent, wait 'repeat_interval' to
   # resend them.
-  repeat_interval: 3h
+  repeat_interval: 12h
 
   # A default receiver
   receiver: team-monitoring-mails
diff --git a/roles/prometheus-node/tasks/main.yml b/roles/prometheus-node/tasks/main.yml
index 994166e..96ec332 100644
--- a/roles/prometheus-node/tasks/main.yml
+++ b/roles/prometheus-node/tasks/main.yml
@@ -7,12 +7,18 @@
   retries: 3
   until: apt_result is succeeded
 
+- name: Activate prometheus-node-exporter service
+  systemd:
+    name: prometheus-node-exporter
+    enabled: yes
+    state: started
+
 # Doesn't work on Debian Stretch
 - name: Make Prometheus node-exporter listen on adm only
   when:
-    - ansible_distribution_release != 'stretch'
+    - ansible_lsb.codename == 'buster'
   lineinfile:
     path: /etc/default/prometheus-node-exporter
     regexp: '^ARGS='
-    line: "ARGS=\"--web.listen-address={{ ansible_fqdn }}:9100\""
+    line: "ARGS=\"--web.listen-address={{ ansible_hostname }}.adm.auro.re:9100\""
   notify: Restart prometheus-node-exporter
diff --git a/roles/prometheus/handlers/main.yml b/roles/prometheus/handlers/main.yml
index 4214def..670847b 100644
--- a/roles/prometheus/handlers/main.yml
+++ b/roles/prometheus/handlers/main.yml
@@ -3,3 +3,8 @@
   service:
     name: prometheus
     state: restarted
+
+- name: Restart prometheus-snmp-exporter
+  service:
+    name: prometheus-snmp-exporter
+    state: restarted
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index 3f70542..68be2f9 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -2,7 +2,9 @@
 - name: Install Prometheus
   apt:
     update_cache: true
-    name: prometheus
+    name:
+      - prometheus
+      - prometheus-snmp-exporter
   register: apt_result
   retries: 3
   until: apt_result is succeeded
@@ -15,12 +17,34 @@
 
 - name: Configure Prometheus alert rules
   template:
-    src: prometheus/alert.rules.yml.j2
-    dest: /etc/prometheus/alert.rules.yml
+    src: "prometheus/{{ item }}.j2"
+    dest: "/etc/prometheus/{{ item }}"
   notify: Restart Prometheus
+  loop:
+    - alert.rules.yml
+    - django.rules.yml
+
+- name: Make Prometheus snmp-exporter listen on localhost only
+  lineinfile:
+    path: /etc/default/prometheus-snmp-exporter
+    regexp: '^ARGS='
+    line: "ARGS=\"--web.listen-address=127.0.0.1:9116\""
+  notify: Restart prometheus-snmp-exporter
 
 # We don't need to restart Prometheus when updating nodes
 - name: Configure Prometheus nodes
   copy:
     content: "{{ prometheus_targets | to_nice_json }}"
     dest: /etc/prometheus/targets.json
+
+- name: Activate prometheus service
+  systemd:
+    name: prometheus
+    enabled: yes
+    state: started
+
+- name: Indicate role in motd
+  template:
+    src: update-motd.d/05-service.j2
+    dest: /etc/update-motd.d/05-prometheus
+    mode: 0755
diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
index 9e603a4..0729cc7 100644
--- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2
+++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2
@@ -1,25 +1,62 @@
 # {{ ansible_managed }}
 {# As this is also Jinja2 it will conflict without a raw block #}
+{# Metric names can change depending on the Prometheus node-exporter version #}
 {% raw %}
 groups:
-- name: example
+- name: alert.rules
   rules:
-  # Alert for any instance that is unreachable for >5 minutes.
+  # Alert for any instance that is unreachable for >3 minutes.
   - alert: InstanceDown
     expr: up == 0
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      summary: "{{ $labels.instance }} est invisible depuis plus de 3 minutes !"
+
+  # Alert for out of memory
+  - alert: OutOfMemory
+    expr: ((node_memory_MemFree_bytes or node_memory_MemFree) + (node_memory_Cached_bytes or node_memory_Cached) + (node_memory_Buffers_bytes or node_memory_Buffers)) / (node_memory_MemTotal_bytes or node_memory_MemTotal) * 100 < 10
     for: 5m
     labels:
-      severity: page
+      severity: warning
     annotations:
-      summary: "Instance {{ $labels.instance }} down"
-      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
+      summary: "Mémoire libre de {{ $labels.instance }} à {{ $value }}%."
 
-  # Alert for any instance that has a median request latency >1s.
-  - alert: APIHighRequestLatency
-    expr: api_http_request_latencies_second{quantile="0.5"} > 1
-    for: 10m
+  # Alert for out of disk space
+  - alert: OutOfDiskSpace
+    expr: (node_filesystem_free_bytes{fstype="ext4"} or node_filesystem_free{fstype="ext4"}) / (node_filesystem_size_bytes{fstype="ext4"} or node_filesystem_size{fstype="ext4"}) * 100 < 10
+    for: 5m
+    labels:
+      severity: warning
     annotations:
-      summary: "High request latency on {{ $labels.instance }}"
-      description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
+      summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ $value }}%."
+
+  # Alert for out of inode space on disk
+  - alert: OutOfInodes
+    expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}."
+
+  # Alert for high CPU usage
+  - alert: CpuUsage
+    expr: ((100 - avg by (instance) (irate(node_cpu{mode="idle"}[5m])) * 100) or (100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 75
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "CPU sur {{ $labels.instance }} à {{ $value }}%."
+
+  # Check systemd unit (> buster)
+  - alert: SystemdServiceFailed
+    expr: node_systemd_unit_state{state="failed"} == 1
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
 {% endraw %}
diff --git a/roles/prometheus/templates/prometheus/django.rules.yml.j2 b/roles/prometheus/templates/prometheus/django.rules.yml.j2
new file mode 100644
index 0000000..fddd398
--- /dev/null
+++ b/roles/prometheus/templates/prometheus/django.rules.yml.j2
@@ -0,0 +1,106 @@
+# {{ ansible_managed }}
+{# As this is also Jinja2 it will conflict without a raw block #}
+{% raw %}
+groups:
+- name: django.rules
+  rules:
+  - record: job:django_http_requests_before_middlewares_total:sum_rate30s
+    expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job)
+  - record: job:django_http_requests_unknown_latency_total:sum_rate30s
+    expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job)
+  - record: job:django_http_ajax_requests_total:sum_rate30s
+    expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job)
+  - record: job:django_http_responses_before_middlewares_total:sum_rate30s
+    expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job)
+  - record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s
+    expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s]))
+      BY (job)
+  - record: job:django_http_requests_body_total_bytes:sum_rate30s
+    expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job)
+  - record: job:django_http_responses_streaming_total:sum_rate30s
+    expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job)
+  - record: job:django_http_responses_body_total_bytes:sum_rate30s
+    expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job)
+  - record: job:django_http_requests_total:sum_rate30s
+    expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job)
+  - record: job:django_http_requests_total_by_method:sum_rate30s
+    expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method)
+  - record: job:django_http_requests_total_by_transport:sum_rate30s
+    expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport)
+  - record: job:django_http_requests_total_by_view:sum_rate30s
+    expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job,
+      view)
+  - record: job:django_http_requests_total_by_view_transport_method:sum_rate30s
+    expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job,
+      view, transport, method)
+  - record: job:django_http_responses_total_by_templatename:sum_rate30s
+    expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename)
+  - record: job:django_http_responses_total_by_status:sum_rate30s
+    expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status)
+  - record: job:django_http_responses_total_by_charset:sum_rate30s
+    expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset)
+  - record: job:django_http_exceptions_total_by_type:sum_rate30s
+    expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type)
+  - record: job:django_http_exceptions_total_by_view:sum_rate30s
+    expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view)
+  - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
+    expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
+      BY (job, le))
+    labels:
+      quantile: "50"
+  - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
+    expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
+      BY (job, le))
+    labels:
+      quantile: "95"
+  - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
+    expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
+      BY (job, le))
+    labels:
+      quantile: "99"
+  - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
+    expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
+      BY (job, le))
+    labels:
+      quantile: "99.9"
+  - record: job:django_http_requests_latency_seconds:quantile_rate30s
+    expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
+      BY (job, le))
+    labels:
+      quantile: "50"
+  - record: job:django_http_requests_latency_seconds:quantile_rate30s
+    expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
+      BY (job, le))
+    labels:
+      quantile: "95"
+  - record: job:django_http_requests_latency_seconds:quantile_rate30s
+    expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
+      BY (job, le))
+    labels:
+      quantile: "99"
+  - record: job:django_http_requests_latency_seconds:quantile_rate30s
+    expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
+      BY (job, le))
+    labels:
+      quantile: "99.9"
+  - record: job:django_model_inserts_total:sum_rate1m
+    expr: sum(rate(django_model_inserts_total[1m])) BY (job, model)
+  - record: job:django_model_updates_total:sum_rate1m
+    expr: sum(rate(django_model_updates_total[1m])) BY (job, model)
+  - record: job:django_model_deletes_total:sum_rate1m
+    expr: sum(rate(django_model_deletes_total[1m])) BY (job, model)
+  - record: job:django_db_new_connections_total:sum_rate30s
+    expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor)
+  - record: job:django_db_new_connection_errors_total:sum_rate30s
+    expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor)
+  - record: job:django_db_execute_total:sum_rate30s
+    expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor)
+  - record: job:django_db_execute_many_total:sum_rate30s
+    expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor)
+  - record: job:django_db_errors_total:sum_rate30s
+    expr: sum(rate(django_db_errors_total[30s])) BY (alias, vendor, type)
+  - record: job:django_migrations_applied_total:max
+    expr: max(django_migrations_applied_total) BY (job, connection)
+  - record: job:django_migrations_unapplied_total:max
+    expr: max(django_migrations_unapplied_total) BY (job, connection)
+{% endraw %}
diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2
index 76573fa..8b8fecb 100644
--- a/roles/prometheus/templates/prometheus/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2
@@ -19,6 +19,7 @@ alerting:
 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
   - "alert.rules.yml"
+  - "django.rules.yml"
 
 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.
@@ -29,4 +30,3 @@ scrape_configs:
     file_sd_configs:
       - files:
           - '/etc/prometheus/targets.json'
-
diff --git a/roles/prometheus/templates/update-motd.d/05-service.j2 b/roles/prometheus/templates/update-motd.d/05-service.j2
new file mode 100755
index 0000000..f027dc4
--- /dev/null
+++ b/roles/prometheus/templates/update-motd.d/05-service.j2
@@ -0,0 +1,4 @@
+#!/bin/sh
+# {{ ansible_managed }}
+echo "> prometheus a été déployé sur cette machine."
+echo "  Voir /etc/prometheus/"