Global_monitoring #24

Merged
otthorn merged 12 commits from Global_monitoring into master 2021-02-18 18:03:05 +01:00
9 changed files with 102 additions and 16 deletions
Showing only changes of commit a5b4deacee - Show all commits

View file

@ -121,8 +121,9 @@
- prometheus-rives.adm.auro.re - prometheus-rives.adm.auro.re
- prometheus-aurore.adm.auro.re - prometheus-aurore.adm.auro.re
- prometheus-ovh.adm.auro.re - prometheus-ovh.adm.auro.re
- prometheus-federate.adm.auro.re
roles: roles:
- prometheus-federate - prometheus_federate
# Monitor all hosts # Monitor all hosts

View file

@ -0,0 +1,84 @@
---
# Install the Prometheus server together with its SNMP exporter.
- name: Install Prometheus
  apt:
    name:
      - prometheus
      - prometheus-snmp-exporter
    # Refresh the package index before installing.
    update_cache: true
  # Re-run the task up to 3 times until the apt result reports success.
  register: apt_result
  until: apt_result is succeeded
  retries: 3
# Render the main Prometheus configuration from its template and
# trigger the "Restart Prometheus" handler when the file changes.
- name: Configure Prometheus
  template:
    src: prometheus/prometheus.yml.j2
    dest: /etc/prometheus/prometheus.yml
    # Quoted so the mode is passed as an octal string instead of relying
    # on the YAML parser's implicit octal-integer handling.
    mode: "0644"
  notify: Restart Prometheus
# Render each rules file listed in the loop from "prometheus/<item>.j2"
# to "/etc/prometheus/<item>" and notify the restart handler on change.
- name: Configure Prometheus alert rules
  template:
    src: "prometheus/{{ item }}.j2"
    dest: "/etc/prometheus/{{ item }}"
    # Quoted so the mode is passed as an octal string instead of relying
    # on the YAML parser's implicit octal-integer handling.
    mode: "0644"
  notify: Restart Prometheus
  loop:
    - alert.rules.yml
    - django.rules.yml
# Replace the line matching ^ARGS= in the exporter's defaults file so the
# exporter is started with --web.listen-address=127.0.0.1:9116.
- name: Make Prometheus snmp-exporter listen on localhost only
  lineinfile:
    path: /etc/default/prometheus-snmp-exporter
    regexp: '^ARGS='
    # Single-quoted: the inner double quotes need no escaping.
    line: 'ARGS="--web.listen-address=127.0.0.1:9116"'
  notify: Restart prometheus-snmp-exporter
# This file stores SNMP OIDs.
# It is written owner-only (0600, owner "prometheus"), and the exporter
# restart handler is notified when it changes.
- name: Configure Prometheus snmp-exporter
  template:
    src: "prometheus/snmp.yml.j2"
    dest: "/etc/prometheus/snmp.yml"
    # Quoted so the mode is passed as an octal string instead of relying
    # on the YAML parser's implicit octal-integer handling.
    mode: "0600"
    owner: prometheus
  notify: Restart prometheus-snmp-exporter
# We don't need to restart Prometheus when updating nodes, so this task
# deliberately has no notify handler.
# Serializes prometheus_targets as pretty-printed JSON into targets.json.
- name: Configure Prometheus nodes
  copy:
    content: "{{ prometheus_targets | to_nice_json }}"
    dest: /etc/prometheus/targets.json
    # Quoted so the mode is passed as an octal string instead of relying
    # on the YAML parser's implicit octal-integer handling.
    mode: "0644"
# We don't need to restart Prometheus when updating nodes, so this task
# deliberately has no notify handler.
# Skipped entirely when prometheus_unifi_snmp_targets is not defined.
- name: Configure Prometheus Ubiquity Unifi SNMP devices
  copy:
    content: "{{ prometheus_unifi_snmp_targets | to_nice_json }}"
    dest: /etc/prometheus/targets_unifi_snmp.json
    # Quoted so the mode is passed as an octal string instead of relying
    # on the YAML parser's implicit octal-integer handling.
    mode: "0644"
  when: prometheus_unifi_snmp_targets is defined
# Write the UPS SNMP target list as JSON, wrapped in a single-element list
# of {'targets': ...}, followed by a trailing newline.
# Skipped entirely when prometheus_ups_snmp_targets is not defined.
- name: Configure Prometheus UPS SNMP devices
  copy:
    # Same expression shape as the docker monitoring task's content.
    content: "{{ [{'targets': prometheus_ups_snmp_targets }] | to_nice_json }}\n"
    dest: /etc/prometheus/targets_ups_snmp.json
    # Quoted so the mode is passed as an octal string instead of relying
    # on the YAML parser's implicit octal-integer handling.
    mode: "0644"
  when: prometheus_ups_snmp_targets is defined
# Write the docker target list as JSON, wrapped in a single-element list
# of {'targets': ...}, followed by a trailing newline.
# Skipped entirely when prometheus_docker_targets is not defined.
- name: Configure Prometheus docker monitoring
  copy:
    content: "{{ [{'targets': prometheus_docker_targets }] | to_nice_json }}\n"
    dest: /etc/prometheus/targets_docker.json
    # Quoted so the mode is passed as an octal string instead of relying
    # on the YAML parser's implicit octal-integer handling.
    mode: "0644"
  when: prometheus_docker_targets is defined
# Ensure the prometheus unit is running now and enabled at boot.
- name: Activate prometheus service
  systemd:
    name: prometheus
    state: started
    enabled: true
# Install an update-motd.d script rendered from the shared 05-service
# template; mode 0755 makes the script executable.
- name: Indicate role in motd
  template:
    src: update-motd.d/05-service.j2
    dest: /etc/update-motd.d/05-prometheus
    # Quoted so the mode is passed as an octal string instead of relying
    # on the YAML parser's implicit octal-integer handling.
    mode: "0755"

View file

@ -13,7 +13,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "{{ $labels.instance }} est invisible depuis plus de 3 minutes !" summary: "Federate : {{ $labels.exported_instance }} est invisible depuis plus de 3 minutes !"
# Alert for out of memory # Alert for out of memory
- alert: OutOfMemory - alert: OutOfMemory
@ -22,7 +22,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Mémoire libre de {{ $labels.instance }} à {{ humanize $value }}%." summary: "Federate : Mémoire libre de {{ $labels.exported_instance }} à {{ humanize $value }}%."
# Alert for out of disk space # Alert for out of disk space
- alert: OutOfDiskSpace - alert: OutOfDiskSpace
@ -31,7 +31,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ humanize $value }}%." summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.exported_instance }} à {{ humanize $value }}%."
# Alert for out of inode space on disk # Alert for out of inode space on disk
- alert: OutOfInodes - alert: OutOfInodes
@ -40,7 +40,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}." summary: "Federate : Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.exported_instance }}."
# Alert for high CPU usage # Alert for high CPU usage
- alert: CpuUsage - alert: CpuUsage
@ -49,7 +49,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "CPU sur {{ $labels.instance }} à {{ humanize $value }}%." summary: "Federate : CPU sur {{ $labels.exported_instance }} à {{ humanize $value }}%."
# Check systemd unit (> buster) # Check systemd unit (> buster)
- alert: SystemdServiceFailed - alert: SystemdServiceFailed
@ -58,7 +58,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}" summary: "Federate : {{ $labels.name }} a échoué sur {{ $labels.exported_instance }}"
# Check UPS # Check UPS
- alert: UpsOutputSourceChanged - alert: UpsOutputSourceChanged
@ -67,7 +67,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "La source d'alimentation de {{ $labels.instance }} a changé !" summary: "Federate : La source d'alimentation de {{ $labels.exported_instance }} a changé !"
- alert: UpsBatteryStatusWarning - alert: UpsBatteryStatusWarning
expr: upsBatteryStatus == 3 expr: upsBatteryStatus == 3
@ -75,7 +75,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "L'état de la batterie de {{ $labels.instance }} est faible !" summary: "Federate : L'état de la batterie de {{ $labels.exported_instance }} est faible !"
- alert: UpsBatteryStatusCritical - alert: UpsBatteryStatusCritical
expr: upsBatteryStatus == 4 expr: upsBatteryStatus == 4
@ -83,7 +83,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !" summary: "L'état de la batterie de {{ $labels.exported_instance }} est affaibli !"
- alert: UpsHighLoad - alert: UpsHighLoad
expr: upsOutputPercentLoad > 70 expr: upsOutputPercentLoad > 70
@ -91,7 +91,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !" summary: "Federate : La charge de {{ $labels.exported_instance }} est de {{ $value }}% !"
- alert: UpsWrongInputVoltage - alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
@ -99,7 +99,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V." summary: "Federate : La tension d'entrée de {{ $labels.exported_instance }} est de {{ $value }}V."
- alert: UpsWrongOutputVoltage - alert: UpsWrongOutputVoltage
expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
@ -107,7 +107,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V." summary: "Federate : La tension de sortie de {{ $labels.exported_instance }} est de {{ $value }}V."
- alert: UpsTimeRemainingWarning - alert: UpsTimeRemainingWarning
expr: upsEstimatedMinutesRemaining < 15 expr: upsEstimatedMinutesRemaining < 15
@ -115,7 +115,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min." summary: "Federate : L'autonomie restante sur {{ $labels.exported_instance }} est de {{ $value }} min."
- alert: UpsTimeRemainingCritical - alert: UpsTimeRemainingCritical
expr: upsEstimatedMinutesRemaining < 5 expr: upsEstimatedMinutesRemaining < 5
@ -123,7 +123,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min." summary: "Federate : L'autonomie restante sur {{ $labels.exported_instance }} est de {{ $value }} min."
{% endraw %} {% endraw %}

View file

@ -52,4 +52,5 @@ scrape_configs:
- '{job="ups_snmp"}' - '{job="ups_snmp"}'
- '{job="django"}' - '{job="django"}'
- '{job="docker"}' - '{job="docker"}'
- '{job="switch"}'