Merge crans monitoring

commit 694501dfa3 (parent e86cb23660)
9 changed files with 208 additions and 34 deletions
@@ -4,19 +4,10 @@
   vars:
     # Prometheus targets.json
     prometheus_targets:
-      - labels:
-          job: node
-        targets:
-          - mordred.adm.auro.re:9100
-          - ldap-replica-pacaterie.adm.auro.re:9100
-          - dhcp-pacaterie.adm.auro.re:9100
-          - dns-pacaterie.adm.auro.re:9100
-          - prometheus-pacaterie.adm.auro.re:9100
-          - radius-pacaterie.adm.auro.re:9100
-      - labels:
-          job: prometheus
-        targets:
-          - localhost:9090
+      - labels: {job: node}
+        targets: "{{ groups['pacaterie'] | map('replace', '.org', '.org:9100') | list | sort }}"
+      - labels: {job: prometheus}
+        targets: ['localhost:9090']
   roles:
     - prometheus
     - prometheus-alertmanager
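The new targets expression derives the node job's target list from the Ansible inventory instead of a hard-coded host list. A minimal sketch of what the filter chain renders, with a hypothetical two-host 'pacaterie' group (the real group members are not part of this diff):

    groups['pacaterie'] = ['dhcp.example.org', 'dns.example.org']          (hypothetical)
    rendered targets    = ['dhcp.example.org:9100', 'dns.example.org:9100']

Every occurrence of '.org' in an inventory name gets ':9100' (the node exporter port) appended by the replace filter, and the sort keeps the generated targets.json stable across runs.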
@@ -17,22 +17,23 @@ route:
   # The labels by which incoming alerts are grouped together. For example,
   # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
   # be batched into a single group.
-  group_by: ['alertname', 'cluster', 'service']
+  #group_by: ['alertname', 'cluster', 'service']
+  group_by: []  # do not group for text chat

   # When a new group of alerts is created by an incoming alert, wait at
   # least 'group_wait' to send the initial notification.
   # This way ensures that you get multiple alerts for the same group that start
   # firing shortly after another are batched together on the first
   # notification.
-  group_wait: 30s
+  group_wait: 1m

   # When the first notification was sent, wait 'group_interval' to send a batch
   # of new alerts that started firing for that group.
-  group_interval: 5m
+  group_interval: 1m

   # If an alert has successfully been sent, wait 'repeat_interval' to
   # resend them.
-  repeat_interval: 3h
+  repeat_interval: 12h

   # A default receiver
   receiver: team-monitoring-mails
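Taken together, these route changes tune Alertmanager for chat-style delivery; the effective route section now reads (assembled from the lines above):

    route:
      group_by: []  # do not group for text chat
      group_wait: 1m
      group_interval: 1m
      repeat_interval: 12h
      receiver: team-monitoring-mails

The inline comment states the intent: short batching windows (1m) and infrequent reminders (12h) suit notifications pushed to a text chat.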
@@ -7,12 +7,18 @@
     retries: 3
     until: apt_result is succeeded

+- name: Activate prometheus-node-exporter service
+  systemd:
+    name: prometheus-node-exporter
+    enabled: yes
+    state: started
+
 # Doesn't work on Debian Stretch
 - name: Make Prometheus node-exporter listen on adm only
   when:
-    - ansible_distribution_release != 'stretch'
+    - ansible_lsb.codename == 'buster'
   lineinfile:
     path: /etc/default/prometheus-node-exporter
     regexp: '^ARGS='
-    line: "ARGS=\"--web.listen-address={{ ansible_fqdn }}:9100\""
+    line: "ARGS=\"--web.listen-address={{ ansible_hostname }}.adm.auro.re:9100\""
   notify: Restart prometheus-node-exporter
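As a sketch, on a machine whose short hostname is mordred (one of the hosts in the old target list above), the lineinfile task leaves /etc/default/prometheus-node-exporter containing:

    ARGS="--web.listen-address=mordred.adm.auro.re:9100"

ansible_hostname is the short host name, so appending the fixed adm.auro.re suffix pins the exporter to its address on the adm network, whereas the previous ansible_fqdn value depended on how each machine resolved its own name.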
@@ -3,3 +3,8 @@
   service:
     name: prometheus
     state: restarted
+
+- name: Restart prometheus-snmp-exporter
+  service:
+    name: prometheus-snmp-exporter
+    state: restarted
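Handlers are matched by name, so the snmp-exporter task introduced elsewhere in this commit triggers this one with exactly:

    notify: Restart prometheus-snmp-exporter

Ansible runs a notified handler once, at the end of the play, however many tasks reported a change.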
@@ -2,7 +2,9 @@
 - name: Install Prometheus
   apt:
     update_cache: true
-    name: prometheus
+    name:
+      - prometheus
+      - prometheus-snmp-exporter
   register: apt_result
   retries: 3
   until: apt_result is succeeded
@@ -15,12 +17,34 @@

 - name: Configure Prometheus alert rules
   template:
-    src: prometheus/alert.rules.yml.j2
-    dest: /etc/prometheus/alert.rules.yml
+    src: "prometheus/{{ item }}.j2"
+    dest: "/etc/prometheus/{{ item }}"
   notify: Restart Prometheus
+  loop:
+    - alert.rules.yml
+    - django.rules.yml
+
+- name: Make Prometheus snmp-exporter listen on localhost only
+  lineinfile:
+    path: /etc/default/prometheus-snmp-exporter
+    regexp: '^ARGS='
+    line: "ARGS=\"--web.listen-address=127.0.0.1:9116\""
+  notify: Restart prometheus-snmp-exporter

 # We don't need to restart Prometheus when updating nodes
 - name: Configure Prometheus nodes
   copy:
     content: "{{ prometheus_targets | to_nice_json }}"
     dest: /etc/prometheus/targets.json
+
+- name: Activate prometheus service
+  systemd:
+    name: prometheus
+    enabled: yes
+    state: started
+
+- name: Indicate role in motd
+  template:
+    src: update-motd.d/05-service.j2
+    dest: /etc/update-motd.d/05-prometheus
+    mode: 0755
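The loop replaces one hard-coded template render with one render per rules file; a sketch of the two expanded iterations:

    item = alert.rules.yml   ->  src: prometheus/alert.rules.yml.j2    dest: /etc/prometheus/alert.rules.yml
    item = django.rules.yml  ->  src: prometheus/django.rules.yml.j2   dest: /etc/prometheus/django.rules.yml

Both iterations notify the same Restart Prometheus handler, so Prometheus is restarted at most once even when both files change.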
@@ -1,25 +1,62 @@
 # {{ ansible_managed }}
 {# As this is also Jinja2 it will conflict without a raw block #}
+{# Rule expressions differ depending on the Prometheus node exporter version #}
 {% raw %}
 groups:
-  - name: example
+  - name: alert.rules
     rules:

-      # Alert for any instance that is unreachable for >5 minutes.
+      # Alert for any instance that is unreachable for >3 minutes.
       - alert: InstanceDown
         expr: up == 0
-        for: 5m
-        labels:
-          severity: page
-        annotations:
-          summary: "Instance {{ $labels.instance }} down"
-          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
-
-      # Alert for any instance that has a median request latency >1s.
-      - alert: APIHighRequestLatency
-        expr: api_http_request_latencies_second{quantile="0.5"} > 1
-        for: 10m
-        annotations:
-          summary: "High request latency on {{ $labels.instance }}"
-          description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: "{{ $labels.instance }} has been unreachable for more than 3 minutes!"
+
+      # Alert for out of memory
+      - alert: OutOfMemory
+        expr: ((node_memory_MemFree_bytes or node_memory_MemFree) + (node_memory_Cached_bytes or node_memory_Cached) + (node_memory_Buffers_bytes or node_memory_Buffers)) / (node_memory_MemTotal_bytes or node_memory_MemTotal) * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Free memory on {{ $labels.instance }} at {{ $value }}%."
+
+      # Alert for out of disk space
+      - alert: OutOfDiskSpace
+        expr: (node_filesystem_free_bytes{fstype="ext4"} or node_filesystem_free{fstype="ext4"}) / (node_filesystem_size_bytes{fstype="ext4"} or node_filesystem_size{fstype="ext4"}) * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Free space on {{ $labels.mountpoint }} on {{ $labels.instance }} at {{ $value }}%."
+
+      # Alert for out of inode space on disk
+      - alert: OutOfInodes
+        expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Almost no inodes left ({{ $value }}% remaining) in {{ $labels.mountpoint }} on {{ $labels.instance }}."
+
+      # Alert for high CPU usage
+      - alert: CpuUsage
+        expr: ((100 - avg by (instance) (irate(node_cpu{mode="idle"}[5m])) * 100) or (100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 75
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "CPU on {{ $labels.instance }} at {{ $value }}%."
+
+      # Check systemd unit (> buster)
+      - alert: SystemdServiceFailed
+        expr: node_systemd_unit_state{state="failed"} == 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "{{ $labels.name }} has failed on {{ $labels.instance }}"
 {% endraw %}
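The recurring (new_name or old_name) pattern keeps a single rule valid across node exporter versions: release 0.16.0 renamed metrics to carry their unit (node_memory_MemFree became node_memory_MemFree_bytes, for instance), and PromQL's or operator yields the right-hand vector wherever the left-hand one has no samples. Minimal example:

    (node_memory_MemFree_bytes or node_memory_MemFree)   # whichever metric this exporter exposes

which is what the Jinja2 comment added at the top of the template is warning about.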
roles/prometheus/templates/prometheus/django.rules.yml.j2 (new file, 106 lines)
@@ -0,0 +1,106 @@
+# {{ ansible_managed }}
+{# As this is also Jinja2 it will conflict without a raw block #}
+{% raw %}
+groups:
+  - name: django.rules
+    rules:
+      - record: job:django_http_requests_before_middlewares_total:sum_rate30s
+        expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job)
+      - record: job:django_http_requests_unknown_latency_total:sum_rate30s
+        expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job)
+      - record: job:django_http_ajax_requests_total:sum_rate30s
+        expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job)
+      - record: job:django_http_responses_before_middlewares_total:sum_rate30s
+        expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job)
+      - record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s
+        expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s])) BY (job)
+      - record: job:django_http_requests_body_total_bytes:sum_rate30s
+        expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job)
+      - record: job:django_http_responses_streaming_total:sum_rate30s
+        expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job)
+      - record: job:django_http_responses_body_total_bytes:sum_rate30s
+        expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job)
+      - record: job:django_http_requests_total:sum_rate30s
+        expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job)
+      - record: job:django_http_requests_total_by_method:sum_rate30s
+        expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method)
+      - record: job:django_http_requests_total_by_transport:sum_rate30s
+        expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport)
+      - record: job:django_http_requests_total_by_view:sum_rate30s
+        expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, view)
+      - record: job:django_http_requests_total_by_view_transport_method:sum_rate30s
+        expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, view, transport, method)
+      - record: job:django_http_responses_total_by_templatename:sum_rate30s
+        expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename)
+      - record: job:django_http_responses_total_by_status:sum_rate30s
+        expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status)
+      - record: job:django_http_responses_total_by_charset:sum_rate30s
+        expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset)
+      - record: job:django_http_exceptions_total_by_type:sum_rate30s
+        expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type)
+      - record: job:django_http_exceptions_total_by_view:sum_rate30s
+        expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view)
+      - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
+        expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) BY (job, le))
+        labels:
+          quantile: "50"
+      - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
+        expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) BY (job, le))
+        labels:
+          quantile: "95"
+      - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
+        expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) BY (job, le))
+        labels:
+          quantile: "99"
+      - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
+        expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) BY (job, le))
+        labels:
+          quantile: "99.9"
+      - record: job:django_http_requests_latency_seconds:quantile_rate30s
+        expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s])) BY (job, le))
+        labels:
+          quantile: "50"
+      - record: job:django_http_requests_latency_seconds:quantile_rate30s
+        expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s])) BY (job, le))
+        labels:
+          quantile: "95"
+      - record: job:django_http_requests_latency_seconds:quantile_rate30s
+        expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s])) BY (job, le))
+        labels:
+          quantile: "99"
+      - record: job:django_http_requests_latency_seconds:quantile_rate30s
+        expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s])) BY (job, le))
+        labels:
+          quantile: "99.9"
+      - record: job:django_model_inserts_total:sum_rate1m
+        expr: sum(rate(django_model_inserts_total[1m])) BY (job, model)
+      - record: job:django_model_updates_total:sum_rate1m
+        expr: sum(rate(django_model_updates_total[1m])) BY (job, model)
+      - record: job:django_model_deletes_total:sum_rate1m
+        expr: sum(rate(django_model_deletes_total[1m])) BY (job, model)
+      - record: job:django_db_new_connections_total:sum_rate30s
+        expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor)
+      - record: job:django_db_new_connection_errors_total:sum_rate30s
+        expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor)
+      - record: job:django_db_execute_total:sum_rate30s
+        expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor)
+      - record: job:django_db_execute_many_total:sum_rate30s
+        expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor)
+      - record: job:django_db_errors_total:sum_rate30s
+        expr: sum(rate(django_db_errors_total[30s])) BY (alias, vendor, type)
+      - record: job:django_migrations_applied_total:max
+        expr: max(django_migrations_applied_total) BY (job, connection)
+      - record: job:django_migrations_unapplied_total:max
+        expr: max(django_migrations_unapplied_total) BY (job, connection)
+{% endraw %}
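These recording rules appear to be the standard set distributed with the django-prometheus exporter: each precomputes a 30s (or 1m) rate so dashboards can query a cheap recorded series instead of aggregating raw counters on every refresh. A usage sketch:

    job:django_http_responses_total_by_status:sum_rate30s{status="500"}

returns the per-job rate of HTTP 500 responses over the last 30 seconds.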
@@ -19,6 +19,7 @@ alerting:
 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
   - "alert.rules.yml"
+  - "django.rules.yml"

 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.
@@ -29,4 +30,3 @@ scrape_configs:
   file_sd_configs:
     - files:
       - '/etc/prometheus/targets.json'
-
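Both rule files can be checked before Prometheus reloads them, for example with the promtool binary that ships with Prometheus (standard usage, not part of this commit):

    promtool check rules /etc/prometheus/alert.rules.yml /etc/prometheus/django.rules.yml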
roles/prometheus/templates/update-motd.d/05-service.j2 (new file, executable, 4 lines)
@@ -0,0 +1,4 @@
+#!/bin/sh
+# {{ ansible_managed }}
+echo "> prometheus has been deployed on this machine."
+echo "  See /etc/prometheus/"