From 5dfadc0b52aed2af961014a471f60c8fa91d775e Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 10 Feb 2021 18:39:13 +0100 Subject: [PATCH 01/11] Add prometheus federate and ovh in hosts --- hosts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hosts b/hosts index 3578d95..4e9b264 100644 --- a/hosts +++ b/hosts @@ -63,6 +63,8 @@ vpn-ovh.adm.auro.re docker-ovh.adm.auro.re switchs-manager.adm.auro.re ldap-replica-ovh.adm.auro.re +prometheus-ovh.adm.auro.re +prometheus-federate.adm.auro.re [ovh_testing_vm] #re2o-test.adm.auro.re From b5dbe2c5c9f7347a7575666a24f9b813428b5396 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 10 Feb 2021 18:40:28 +0100 Subject: [PATCH 02/11] Prometheus-ovh role --- monitoring.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/monitoring.yml b/monitoring.yml index c31fe86..a47ca0d 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -92,6 +92,18 @@ roles: - prometheus +- hosts: prometheus-ovh.adm.auro.re + vars: + prometheus_alertmanager: docker-ovh.adm.auro.re:9093 + snmp_unifi_password: "{{ vault_snmp_unifi_password }}" + + # Prometheus targets.json + prometheus_targets: + - targets: | + {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} + roles: + - prometheus + # Monitor all hosts - hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container From 4308bedf8f2c4bf8046775d9d17bd10e14f457a4 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 10 Feb 2021 19:06:28 +0100 Subject: [PATCH 03/11] Monitoring of docker containers --- monitoring.yml | 2 ++ roles/prometheus/tasks/main.yml | 7 +++++++ roles/prometheus/templates/prometheus/prometheus.yml.j2 | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/monitoring.yml b/monitoring.yml index a47ca0d..98192b2 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -101,6 +101,8 @@ prometheus_targets: - targets: | {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} + prometheus_docker_targets: + - targets: docker-ovh.adm.auro.re:8087 roles: - prometheus diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 8697ef9..f215930 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -64,6 +64,13 @@ mode: 0644 when: prometheus_ups_snmp_targets is defined +- name: Configure Prometheus docker monitoring + copy: + content: "{{ [{'targets': prometheus_docker_targets }] | to_nice_json }}\n" + dest: /etc/prometheus/targets_docker.json + mode: 0644 + when: prometheus_docker_targets is defined + - name: Activate prometheus service systemd: name: prometheus diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2 index e35a0cf..75c8be9 100644 --- a/roles/prometheus/templates/prometheus/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2 @@ -81,3 +81,7 @@ scrape_configs: - target_label: __address__ replacement: 127.0.0.1:9116 + - job_name: docker + file_sd_configs: + - files: + - '/etc/prometheus/targets_docker.json' From 45d8ca80a4e382a180eaa520a37fab88ab1532cc Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 10 Feb 2021 20:12:04 +0100 Subject: [PATCH 04/11] OVH PVE and VM are now monitored by prometheus-ovh --- monitoring.yml | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/monitoring.yml b/monitoring.yml index 98192b2..10895bb 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -88,7 +88,7 @@ # Prometheus targets.json prometheus_targets: - targets: | - {{ 
groups['aurore_pve'] + groups['aurore_vm'] + groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} + {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }} roles: - prometheus @@ -102,11 +102,27 @@ - targets: | {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} prometheus_docker_targets: - - targets: docker-ovh.adm.auro.re:8087 + - docker-ovh.adm.auro.re:8087 roles: - prometheus +- hosts: prometheus-federate.adm.auro.re + vars: + prometheus_alertmanager: docker-ovh.adm.auro.re:9093 + snmp_unifi_password: "{{ vault_snmp_unifi_password }}" + + # Prometheus targets.json + prometheus_targets: + - prometheus-edc.adm.auro.re + - prometheus-gs.adm.auro.re + - prometheus-fleming.adm.auro.re + - prometheus-pacaterie.adm.auro.re + - prometheus-rives.adm.auro.re + roles: + - prometheus-federate + + # Monitor all hosts - hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container roles: From d8924abe6693eaf6da59491d978922395b46f1be Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 10 Feb 2021 20:42:37 +0100 Subject: [PATCH 05/11] Add prometheus-federate role --- monitoring.yml | 2 + roles/prometheus-federate/handlers/main.yml | 10 + roles/prometheus-federate/tasks/main.yml | 46 +++ .../templates/prometheus/alert.rules.yml.j2 | 129 ++++++ .../templates/prometheus/django.rules.yml.j2 | 106 +++++ .../templates/prometheus/prometheus.yml.j2 | 55 +++ .../templates/prometheus/snmp.yml.j2 | 387 ++++++++++++++++++ .../templates/update-motd.d/05-service.j2 | 4 + 8 files changed, 739 insertions(+) create mode 100644 roles/prometheus-federate/handlers/main.yml create mode 100644 roles/prometheus-federate/tasks/main.yml create mode 100644 roles/prometheus-federate/templates/prometheus/alert.rules.yml.j2 create mode 100644 roles/prometheus-federate/templates/prometheus/django.rules.yml.j2 create mode 100644 roles/prometheus-federate/templates/prometheus/prometheus.yml.j2 create mode 100644 roles/prometheus-federate/templates/prometheus/snmp.yml.j2 create mode 100755 roles/prometheus-federate/templates/update-motd.d/05-service.j2 diff --git a/monitoring.yml b/monitoring.yml index 10895bb..bcf4ef2 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -119,6 +119,8 @@ - prometheus-fleming.adm.auro.re - prometheus-pacaterie.adm.auro.re - prometheus-rives.adm.auro.re + - prometheus-aurore.adm.auro.re + - prometheus-ovh.adm.auro.re roles: - prometheus-federate diff --git a/roles/prometheus-federate/handlers/main.yml b/roles/prometheus-federate/handlers/main.yml new file mode 100644 index 0000000..670847b --- /dev/null +++ b/roles/prometheus-federate/handlers/main.yml @@ -0,0 +1,10 @@ +--- +- name: Restart Prometheus + service: + name: prometheus + state: restarted + +- name: Restart prometheus-snmp-exporter + service: + name: prometheus-snmp-exporter + state: restarted diff --git a/roles/prometheus-federate/tasks/main.yml b/roles/prometheus-federate/tasks/main.yml new file mode 100644 index 0000000..33feb90 --- /dev/null +++ b/roles/prometheus-federate/tasks/main.yml @@ -0,0 +1,46 @@ +--- +- name: Install Prometheus + apt: + update_cache: true + name: + - prometheus + register: apt_result + retries: 3 + until: apt_result is succeeded + +- name: Configure Prometheus + template: + src: prometheus/prometheus.yml.j2 + dest: /etc/prometheus/prometheus.yml + mode: 0644 + notify: Restart Prometheus + +- name: Configure Prometheus alert rules + template: + src: "prometheus/{{ item }}.j2" + dest: "/etc/prometheus/{{ item }}" + mode: 0644 + notify: Restart Prometheus + 
loop: + - alert.rules.yml + - django.rules.yml + +# We don't need to restart Prometheus when updating nodes +- name: Configure Prometheus Federate devices + copy: + content: "{{ [{'targets': prometheus_targets }] | to_nice_json }}" + dest: /etc/prometheus/targets.json + mode: 0644 + when: prometheus_targets is defined + +- name: Activate prometheus service + systemd: + name: prometheus + enabled: true + state: started + +- name: Indicate role in motd + template: + src: update-motd.d/05-service.j2 + dest: /etc/update-motd.d/05-prometheus + mode: 0755 diff --git a/roles/prometheus-federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus-federate/templates/prometheus/alert.rules.yml.j2 new file mode 100644 index 0000000..e2cb42c --- /dev/null +++ b/roles/prometheus-federate/templates/prometheus/alert.rules.yml.j2 @@ -0,0 +1,129 @@ +# {{ ansible_managed }} +{# As this is also Jinja2 it will conflict without a raw block #} +{# Depending of Prometheus Node exporter version, rules can change depending of version #} +{% raw %} +groups: +- name: alert.rules + rules: + + # Alert for any instance that is unreachable for >3 minutes. + - alert: InstanceDown + expr: up == 0 + for: 3m + labels: + severity: critical + annotations: + summary: "{{ $labels.instance }} est invisible depuis plus de 3 minutes !" + + # Alert for out of memory + - alert: OutOfMemory + expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Mémoire libre de {{ $labels.instance }} à {{ humanize $value }}%." + + # Alert for out of disk space + - alert: OutOfDiskSpace + expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ humanize $value }}%." + + # Alert for out of inode space on disk + - alert: OutOfInodes + expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}." + + # Alert for high CPU usage + - alert: CpuUsage + expr: (100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 75 + for: 10m + labels: + severity: warning + annotations: + summary: "CPU sur {{ $labels.instance }} à {{ humanize $value }}%." + + # Check systemd unit (> buster) + - alert: SystemdServiceFailed + expr: node_systemd_unit_state{state="failed"} == 1 + for: 10m + labels: + severity: warning + annotations: + summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}" + + # Check UPS + - alert: UpsOutputSourceChanged + expr: upsOutputSource != 3 + for: 1m + labels: + severity: warning + annotations: + summary: "La source d'alimentation de {{ $labels.instance }} a changé !" + + - alert: UpsBatteryStatusWarning + expr: upsBatteryStatus == 3 + for: 2m + labels: + severity: warning + annotations: + summary: "L'état de la batterie de {{ $labels.instance }} est faible !" + + - alert: UpsBatteryStatusCritical + expr: upsBatteryStatus == 4 + for: 10m + labels: + severity: warning + annotations: + summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !" 
+ + - alert: UpsHighLoad + expr: upsOutputPercentLoad > 70 + for: 5m + labels: + severity: critical + annotations: + summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !" + + - alert: UpsWrongInputVoltage + expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) + for: 10m + labels: + severity: warning + annotations: + summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V." + + - alert: UpsWrongOutputVoltage + expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) + for: 10m + labels: + severity: warning + annotations: + summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V." + + - alert: UpsTimeRemainingWarning + expr: upsEstimatedMinutesRemaining < 15 + for: 1m + labels: + severity: warning + annotations: + summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min." + + - alert: UpsTimeRemainingCritical + expr: upsEstimatedMinutesRemaining < 5 + for: 1m + labels: + severity: critical + annotations: + summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min." + + +{% endraw %} diff --git a/roles/prometheus-federate/templates/prometheus/django.rules.yml.j2 b/roles/prometheus-federate/templates/prometheus/django.rules.yml.j2 new file mode 100644 index 0000000..fddd398 --- /dev/null +++ b/roles/prometheus-federate/templates/prometheus/django.rules.yml.j2 @@ -0,0 +1,106 @@ +# {{ ansible_managed }} +{# As this is also Jinja2 it will conflict without a raw block #} +{% raw %} +groups: +- name: django.rules + rules: + - record: job:django_http_requests_before_middlewares_total:sum_rate30s + expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job) + - record: job:django_http_requests_unknown_latency_total:sum_rate30s + expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job) + - record: job:django_http_ajax_requests_total:sum_rate30s + expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job) + - record: job:django_http_responses_before_middlewares_total:sum_rate30s + expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job) + - record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s + expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s])) + BY (job) + - record: job:django_http_requests_body_total_bytes:sum_rate30s + expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job) + - record: job:django_http_responses_streaming_total:sum_rate30s + expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job) + - record: job:django_http_responses_body_total_bytes:sum_rate30s + expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job) + - record: job:django_http_requests_total:sum_rate30s + expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job) + - record: job:django_http_requests_total_by_method:sum_rate30s + expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method) + - record: job:django_http_requests_total_by_transport:sum_rate30s + expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport) + - record: job:django_http_requests_total_by_view:sum_rate30s + expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, + view) + - record: job:django_http_requests_total_by_view_transport_method:sum_rate30s + expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, + view, transport, method) + - record: 
job:django_http_responses_total_by_templatename:sum_rate30s + expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename) + - record: job:django_http_responses_total_by_status:sum_rate30s + expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status) + - record: job:django_http_responses_total_by_charset:sum_rate30s + expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset) + - record: job:django_http_exceptions_total_by_type:sum_rate30s + expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type) + - record: job:django_http_exceptions_total_by_view:sum_rate30s + expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view) + - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s + expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "50" + - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s + expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "95" + - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s + expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "99" + - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s + expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "99.9" + - record: job:django_http_requests_latency_seconds:quantile_rate30s + expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "50" + - record: job:django_http_requests_latency_seconds:quantile_rate30s + expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "95" + - record: job:django_http_requests_latency_seconds:quantile_rate30s + expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "99" + - record: job:django_http_requests_latency_seconds:quantile_rate30s + expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s])) + BY (job, le)) + labels: + quantile: "99.9" + - record: job:django_model_inserts_total:sum_rate1m + expr: sum(rate(django_model_inserts_total[1m])) BY (job, model) + - record: job:django_model_updates_total:sum_rate1m + expr: sum(rate(django_model_updates_total[1m])) BY (job, model) + - record: job:django_model_deletes_total:sum_rate1m + expr: sum(rate(django_model_deletes_total[1m])) BY (job, model) + - record: job:django_db_new_connections_total:sum_rate30s + expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor) + - record: job:django_db_new_connection_errors_total:sum_rate30s + expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor) + - record: job:django_db_execute_total:sum_rate30s + expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor) + - record: job:django_db_execute_many_total:sum_rate30s + expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor) + - record: job:django_db_errors_total:sum_rate30s + expr: sum(rate(django_db_errors_total[30s])) BY (alias, 
vendor, type) + - record: job:django_migrations_applied_total:max + expr: max(django_migrations_applied_total) BY (job, connection) + - record: job:django_migrations_unapplied_total:max + expr: max(django_migrations_unapplied_total) BY (job, connection) +{% endraw %} diff --git a/roles/prometheus-federate/templates/prometheus/prometheus.yml.j2 b/roles/prometheus-federate/templates/prometheus/prometheus.yml.j2 new file mode 100644 index 0000000..0d4c601 --- /dev/null +++ b/roles/prometheus-federate/templates/prometheus/prometheus.yml.j2 @@ -0,0 +1,55 @@ +# {{ ansible_managed }} + +global: + # scrape_interval is set to the global default (60s) + # evaluation_interval is set to the global default (60s) + # scrape_timeout is set to the global default (10s). + + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: 'example' + +# Alertmanager configuration +# Use prometheus alertmanager installed on the same machine +alerting: + alertmanagers: + - static_configs: + - targets: ['{{ prometheus_alertmanager }}'] + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: + - "alert.rules.yml" # Monitoring alerts, this is the file you may be searching! + - "django.rules.yml" # Custom rules specific for Django project monitoring + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: + # The .json in file_sd_configs is dynamically reloaded + + + - job_name: federate + scrape_interval: 15s + metrics_path: '/federate' + file_sd_configs: + - files: + - '/etc/prometheus/targets.json' + relabel_configs: + # Do not put :9100 in instance name, rather here + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - source_labels: [__param_target] + target_label: __address__ + replacement: '$1:9090' + params: + 'match[]': + - '{job="servers"}' + - '{job="prometheus"}' + - '{job="unifi_snmp"}' + - '{job="django"}' + - '{job="ups_snmp"}' + - '{job="django"}' + - '{job="docker"}' + diff --git a/roles/prometheus-federate/templates/prometheus/snmp.yml.j2 b/roles/prometheus-federate/templates/prometheus/snmp.yml.j2 new file mode 100644 index 0000000..d4dc51c --- /dev/null +++ b/roles/prometheus-federate/templates/prometheus/snmp.yml.j2 @@ -0,0 +1,387 @@ +# {{ ansible_managed }} +# TODOlist : +# - Faire fonctionner le monitoring des switchs défini ici +# * Configurer tous les switchs avec un compte SNMPv3 +# * Mettre l'inventaire des switchs dans Ansible +# - Optimiser les règles pour les bornes Unifi, +# on pourrait indexer avec les SSID + +eatonups: + walk: + - 1.3.6.1.2.1.33.1.2 + - 1.3.6.1.2.1.33.1.3 + - 1.3.6.1.2.1.33.1.4 + - 1.3.6.1.4.1.534.1.6 + get: + - 1.3.6.1.2.1.1.3.0 + metrics: + - name: sysUpTime + oid: 1.3.6.1.2.1.1.3 + type: gauge + help: The time (in hundredths of a second) since the network management portion + of the system was last re-initialized. - 1.3.6.1.2.1.1.3 + - name: upsBatteryStatus + oid: 1.3.6.1.2.1.33.1.2.1 + type: gauge + help: The indication of the capacity remaining in the UPS system's batteries - + 1.3.6.1.2.1.33.1.2.1 + - name: upsEstimatedMinutesRemaining + oid: 1.3.6.1.2.1.33.1.2.3 + type: gauge + help: An estimate of the time to battery charge depletion under the present load + conditions if the utility power is off and remains off, or if it were to be + lost and remain off. 
- 1.3.6.1.2.1.33.1.2.3 + - name: upsInputVoltage + oid: 1.3.6.1.2.1.33.1.3.3.1.3 + type: gauge + help: The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3 + indexes: + - labelname: upsInputLineIndex + type: gauge + - name: upsOutputSource + oid: 1.3.6.1.2.1.33.1.4.1 + type: gauge + help: The present source of output power - 1.3.6.1.2.1.33.1.4.1 + - name: upsOutputVoltage + oid: 1.3.6.1.2.1.33.1.4.4.1.2 + type: gauge + help: The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2 + indexes: + - labelname: upsOutputLineIndex + type: gauge + - name: upsOutputPower + oid: 1.3.6.1.2.1.33.1.4.4.1.4 + type: gauge + help: The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4 + indexes: + - labelname: upsOutputLineIndex + type: gauge + - name: upsOutputPercentLoad + oid: 1.3.6.1.2.1.33.1.4.4.1.5 + type: gauge + help: The percentage of the UPS power capacity presently being used on this output + line, i.e., the greater of the percent load of true power capacity and the percent + load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5 + indexes: + - labelname: upsOutputLineIndex + type: gauge + - name: xupsEnvRemoteTemp + oid: 1.3.6.1.4.1.534.1.6.5 + type: gauge + help: The reading of an EMP's temperature sensor. - 1.3.6.1.4.1.534.1.6.5 + - name: xupsEnvRemoteHumidity + oid: 1.3.6.1.4.1.534.1.6.6 + type: gauge + help: The reading of an EMP's humidity sensor. - 1.3.6.1.4.1.534.1.6.6 + version: 1 + auth: + community: public + + +procurve_switch: + walk: + - 1.3.6.1.2.1.31.1.1.1.10 + - 1.3.6.1.2.1.31.1.1.1.6 + get: + - 1.3.6.1.2.1.1.3.0 + - 1.3.6.1.2.1.1.5.0 + - 1.3.6.1.2.1.1.6.0 + metrics: + - name: sysUpTime + oid: 1.3.6.1.2.1.1.3 + type: gauge + help: The time (in hundredths of a second) since the network management portion + of the system was last re-initialized. 
- 1.3.6.1.2.1.1.3 + - name: sysName + oid: 1.3.6.1.2.1.1.5 + type: DisplayString + help: An administratively-assigned name for this managed node - 1.3.6.1.2.1.1.5 + - name: sysLocation + oid: 1.3.6.1.2.1.1.6 + type: DisplayString + help: The physical location of this node (e.g., 'telephone closet, 3rd floor') + - 1.3.6.1.2.1.1.6 + - name: ifHCOutOctets + oid: 1.3.6.1.2.1.31.1.1.1.10 + type: counter + help: The total number of octets transmitted out of the interface, including framing + characters - 1.3.6.1.2.1.31.1.1.1.10 + indexes: + - labelname: ifIndex + type: gauge + - name: ifHCInOctets + oid: 1.3.6.1.2.1.31.1.1.1.6 + type: counter + help: The total number of octets received on the interface, including framing + characters - 1.3.6.1.2.1.31.1.1.1.6 + indexes: + - labelname: ifIndex + type: gauge + version: 3 + auth: + username: prometheus + +ubiquiti_unifi: + walk: + - 1.3.6.1.4.1.41112.1.6 + get: + - 1.3.6.1.2.1.1.5.0 + - 1.3.6.1.2.1.1.6.0 + metrics: +# Pour faire une WifiMap un jour, on peut entrer la location dans la conf des bornes +# - name: sysLocation +# oid: 1.3.6.1.2.1.1.6 +# type: DisplayString +# help: The physical location of this node (e.g., 'telephone closet, 3rd floor') +# - 1.3.6.1.2.1.1.6 + - name: unifiVapIndex + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.1 + type: gauge + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.1' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapChannel + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.4 + type: gauge + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.4' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapEssId + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.6 + type: DisplayString + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.6' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapName + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.7 + type: DisplayString + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.7' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifi_vap_num_stations + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8 + type: gauge + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8' + indexes: + - labelname: unifi_vap_index + type: gauge + lookups: + - labels: [unifi_vap_index] + labelname: unifi_vap_essid + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.6 + type: DisplayString + - labels: [unifi_vap_index] + labelname: unifi_vap_radio + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9 + type: DisplayString + - labels: [] + labelname: unifi_vap_index +# - name: unifiVapNumStations +# oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8 +# type: gauge +# help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8' +# indexes: +# - labelname: unifiVapIndex +# type: gauge + - name: unifiVapRadio + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9 + type: DisplayString + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.9' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapRxBytes + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.10 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.10' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapRxCrypts + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.11 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.11' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapRxDropped + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.12 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.12' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapRxErrors + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.13 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.13' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapRxFrags + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.14 + 
type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.14' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapRxPackets + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.15 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.15' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapTxBytes + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.16 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.16' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapTxDropped + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.17 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.17' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapTxErrors + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.18 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.18' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapTxPackets + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.19 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.19' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapTxRetries + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.20 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.20' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapTxPower + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.21 + type: gauge + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.21' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapUp + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.22 + type: gauge + help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.22' + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiVapUsage + oid: 1.3.6.1.4.1.41112.1.6.1.2.1.23 + type: DisplayString + help: guest or regular user - 1.3.6.1.4.1.41112.1.6.1.2.1.23 + indexes: + - labelname: unifiVapIndex + type: gauge + - name: unifiIfIndex + oid: 1.3.6.1.4.1.41112.1.6.2.1.1.1 + type: gauge + help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.1' + indexes: + - labelname: unifiIfIndex + type: gauge + - name: unifiIfName + oid: 1.3.6.1.4.1.41112.1.6.2.1.1.5 + type: DisplayString + help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.5' + indexes: + - labelname: unifiIfIndex + type: gauge + - name: unifiIfRxBytes + oid: 1.3.6.1.4.1.41112.1.6.2.1.1.6 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.6' + indexes: + - labelname: unifiIfIndex + type: gauge + - name: unifiIfRxDropped + oid: 1.3.6.1.4.1.41112.1.6.2.1.1.7 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.7' + indexes: + - labelname: unifiIfIndex + type: gauge + - name: unifiIfRxError + oid: 1.3.6.1.4.1.41112.1.6.2.1.1.8 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.8' + indexes: + - labelname: unifiIfIndex + type: gauge + - name: unifiIfRxMulticast + oid: 1.3.6.1.4.1.41112.1.6.2.1.1.9 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.9' + indexes: + - labelname: unifiIfIndex + type: gauge + - name: unifiIfRxPackets + oid: 1.3.6.1.4.1.41112.1.6.2.1.1.10 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.10' + indexes: + - labelname: unifiIfIndex + type: gauge + - name: unifiIfTxBytes + oid: 1.3.6.1.4.1.41112.1.6.2.1.1.12 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.12' + indexes: + - labelname: unifiIfIndex + type: gauge + - name: unifiIfTxDropped + oid: 1.3.6.1.4.1.41112.1.6.2.1.1.13 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.13' + indexes: + - labelname: unifiIfIndex + type: gauge + - name: unifiIfTxError + oid: 1.3.6.1.4.1.41112.1.6.2.1.1.14 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.14' + indexes: + - labelname: unifiIfIndex + type: gauge + - name: unifiIfTxPackets + oid: 
1.3.6.1.4.1.41112.1.6.2.1.1.15 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.15' + indexes: + - labelname: unifiIfIndex + type: gauge + - name: unifiApSystemModel + oid: 1.3.6.1.4.1.41112.1.6.3.3 + type: DisplayString + help: ' - 1.3.6.1.4.1.41112.1.6.3.3' + - name: unifiApSystemUptime + oid: 1.3.6.1.4.1.41112.1.6.3.5 + type: counter + help: ' - 1.3.6.1.4.1.41112.1.6.3.5' + version: 3 + auth: + security_level: authPriv + username: snmp_prometheus + password: {{ snmp_unifi_password }} + auth_protocol: SHA + priv_protocol: AES + priv_password: {{ snmp_unifi_password }} diff --git a/roles/prometheus-federate/templates/update-motd.d/05-service.j2 b/roles/prometheus-federate/templates/update-motd.d/05-service.j2 new file mode 100755 index 0000000..f027dc4 --- /dev/null +++ b/roles/prometheus-federate/templates/update-motd.d/05-service.j2 @@ -0,0 +1,4 @@ +#!/bin/sh +# {{ ansible_managed }} +echo "> prometheus a été déployé sur cette machine." +echo " Voir /etc/prometheus/" From 6ec449c3b33e11fa5b9f9c2e1d7cea51f7700d7f Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 10 Feb 2021 20:43:43 +0100 Subject: [PATCH 06/11] Fix restarting prometheus snmp (not installed) --- roles/prometheus-federate/handlers/main.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/roles/prometheus-federate/handlers/main.yml b/roles/prometheus-federate/handlers/main.yml index 670847b..d648db2 100644 --- a/roles/prometheus-federate/handlers/main.yml +++ b/roles/prometheus-federate/handlers/main.yml @@ -4,7 +4,3 @@ name: prometheus state: restarted -- name: Restart prometheus-snmp-exporter - service: - name: prometheus-snmp-exporter - state: restarted From a5b4deaceeb63d2713073fe2765f21a58d47662d Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 17 Feb 2021 17:42:24 +0100 Subject: [PATCH 07/11] Rename federate role; update of alerts of federate prometheus; update of configuration of federate prometheus --- monitoring.yml | 3 +- roles/prometheus/tasks/main.yml.save | 84 +++++++++++++++++++ .../handlers/main.yml | 0 .../tasks/main.yml | 0 .../templates/prometheus/alert.rules.yml.j2 | 30 +++---- .../templates/prometheus/django.rules.yml.j2 | 0 .../templates/prometheus/prometheus.yml.j2 | 1 + .../templates/prometheus/snmp.yml.j2 | 0 .../templates/update-motd.d/05-service.j2 | 0 9 files changed, 102 insertions(+), 16 deletions(-) create mode 100644 roles/prometheus/tasks/main.yml.save rename roles/{prometheus-federate => prometheus_federate}/handlers/main.yml (100%) rename roles/{prometheus-federate => prometheus_federate}/tasks/main.yml (100%) rename roles/{prometheus-federate => prometheus_federate}/templates/prometheus/alert.rules.yml.j2 (65%) rename roles/{prometheus-federate => prometheus_federate}/templates/prometheus/django.rules.yml.j2 (100%) rename roles/{prometheus-federate => prometheus_federate}/templates/prometheus/prometheus.yml.j2 (98%) rename roles/{prometheus-federate => prometheus_federate}/templates/prometheus/snmp.yml.j2 (100%) rename roles/{prometheus-federate => prometheus_federate}/templates/update-motd.d/05-service.j2 (100%) diff --git a/monitoring.yml b/monitoring.yml index bcf4ef2..c81934c 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -121,8 +121,9 @@ - prometheus-rives.adm.auro.re - prometheus-aurore.adm.auro.re - prometheus-ovh.adm.auro.re + - prometheus-federate.adm.auro.re roles: - - prometheus-federate + - prometheus_federate # Monitor all hosts diff --git a/roles/prometheus/tasks/main.yml.save b/roles/prometheus/tasks/main.yml.save new file mode 100644 index 0000000..57945ce 
--- /dev/null +++ b/roles/prometheus/tasks/main.yml.save @@ -0,0 +1,84 @@ +--- +- name: Install Prometheus + apt: + update_cache: true + name: + - prometheus + - prometheus-snmp-exporter + register: apt_result + retries: 3 + until: apt_result is succeeded + +- name: Configure Prometheus + template: + src: prometheus/prometheus.yml.j2 + dest: /etc/prometheus/prometheus.yml + mode: 0644 + notify: Restart Prometheus + +- name: Configure Prometheus alert rules + template: + src: "prometheus/{{ item }}.j2" + dest: "/etc/prometheus/{{ item }}" + mode: 0644 + notify: Restart Prometheus + loop: + - alert.rules.yml + - django.rules.yml + +- name: Make Prometheus snmp-exporter listen on localhost only + lineinfile: + path: /etc/default/prometheus-snmp-exporter + regexp: '^ARGS=' + line: "ARGS=\"--web.listen-address=127.0.0.1:9116\"" + notify: Restart prometheus-snmp-exporter + +# This file store SNMP OIDs +- name: Configure Prometheus snmp-exporter + template: + src: "prometheus/snmp.yml.j2" + dest: "/etc/prometheus/snmp.yml" + mode: 0600 + owner: prometheus + notify: Restart prometheus-snmp-exporter + +# We don't need to restart Prometheus when updating nodes +- name: Configure Prometheus nodes + copy: + content: "{{ prometheus_targets | to_nice_json }}" + dest: /etc/prometheus/targets.json + mode: 0644 + +# We don't need to restart Prometheus when updating nodes +- name: Configure Prometheus Ubiquity Unifi SNMP devices + copy: + content: "{{ prometheus_unifi_snmp_targets | to_nice_json }}" + dest: /etc/prometheus/targets_unifi_snmp.json + mode: 0644 + when: prometheus_unifi_snmp_targets is defined + +- name: Configure Prometheus UPS SNMP devices + copy: + content: "{{ [{'targets': prometheus_ups_snmp_targets }]7yk[:Cp_g$#dT'yv!. | to_nice_json }}\n" + dest: /etc/prometheus/targets_ups_snmp.json + mode: 0644 + when: prometheus_ups_snmp_targets is defined + +- name: Configure Prometheus docker monitoring + copy: + content: "{{ [{'targets': prometheus_docker_targets }] | to_nice_json }}\n" + dest: /etc/prometheus/targets_docker.json + mode: 0644 + when: prometheus_docker_targets is defined + +- name: Activate prometheus service + systemd: + name: prometheus + enabled: true + state: started + +- name: Indicate role in motd + template: + src: update-motd.d/05-service.j2 + dest: /etc/update-motd.d/05-prometheus + mode: 0755 diff --git a/roles/prometheus-federate/handlers/main.yml b/roles/prometheus_federate/handlers/main.yml similarity index 100% rename from roles/prometheus-federate/handlers/main.yml rename to roles/prometheus_federate/handlers/main.yml diff --git a/roles/prometheus-federate/tasks/main.yml b/roles/prometheus_federate/tasks/main.yml similarity index 100% rename from roles/prometheus-federate/tasks/main.yml rename to roles/prometheus_federate/tasks/main.yml diff --git a/roles/prometheus-federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 similarity index 65% rename from roles/prometheus-federate/templates/prometheus/alert.rules.yml.j2 rename to roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 index e2cb42c..0fd14f5 100644 --- a/roles/prometheus-federate/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 @@ -13,7 +13,7 @@ groups: labels: severity: critical annotations: - summary: "{{ $labels.instance }} est invisible depuis plus de 3 minutes !" + summary: "Federate : {{ $labels.exported_instance }} est invisible depuis plus de 3 minutes !" 
# Alert for out of memory - alert: OutOfMemory @@ -22,7 +22,7 @@ groups: labels: severity: warning annotations: - summary: "Mémoire libre de {{ $labels.instance }} à {{ humanize $value }}%." + summary: "Federate : Mémoire libre de {{ $labels.exported_instance }} à {{ humanize $value }}%." # Alert for out of disk space - alert: OutOfDiskSpace @@ -31,7 +31,7 @@ groups: labels: severity: warning annotations: - summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ humanize $value }}%." + summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.exported_instance }} à {{ humanize $value }}%." # Alert for out of inode space on disk - alert: OutOfInodes @@ -40,7 +40,7 @@ groups: labels: severity: warning annotations: - summary: "Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.instance }}." + summary: "Federate : Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.exported_instance }}." # Alert for high CPU usage - alert: CpuUsage @@ -49,7 +49,7 @@ groups: labels: severity: warning annotations: - summary: "CPU sur {{ $labels.instance }} à {{ humanize $value }}%." + summary: "Federate : CPU sur {{ $labels.exported_instance }} à {{ humanize $value }}%." # Check systemd unit (> buster) - alert: SystemdServiceFailed @@ -58,8 +58,8 @@ groups: labels: severity: warning annotations: - summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}" - + summary: "Federate : {{ $labels.name }} a échoué sur {{ $labels.exported_instance }}" + # Check UPS - alert: UpsOutputSourceChanged expr: upsOutputSource != 3 @@ -67,7 +67,7 @@ groups: labels: severity: warning annotations: - summary: "La source d'alimentation de {{ $labels.instance }} a changé !" + summary: "Federate : La source d'alimentation de {{ $labels.exported_instance }} a changé !" - alert: UpsBatteryStatusWarning expr: upsBatteryStatus == 3 @@ -75,7 +75,7 @@ groups: labels: severity: warning annotations: - summary: "L'état de la batterie de {{ $labels.instance }} est faible !" + summary: "Federate : L'état de la batterie de {{ $labels.exported_instance }} est faible !" - alert: UpsBatteryStatusCritical expr: upsBatteryStatus == 4 @@ -83,7 +83,7 @@ groups: labels: severity: warning annotations: - summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !" + summary: "L'état de la batterie de {{ $labels.exported_instance }} est affaibli !" - alert: UpsHighLoad expr: upsOutputPercentLoad > 70 @@ -91,7 +91,7 @@ groups: labels: severity: critical annotations: - summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !" + summary: "Federate : La charge de {{ $labels.exported_instance }} est de {{ $value }}% !" - alert: UpsWrongInputVoltage expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) @@ -99,7 +99,7 @@ groups: labels: severity: warning annotations: - summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V." + summary: "Federate : La tension d'entrée de {{ $labels.exported_instance }} est de {{ $value }}V." - alert: UpsWrongOutputVoltage expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) @@ -107,7 +107,7 @@ groups: labels: severity: warning annotations: - summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V." + summary: "Federate : La tension de sortie de {{ $labels.exported_instance }} est de {{ $value }}V." 
- alert: UpsTimeRemainingWarning expr: upsEstimatedMinutesRemaining < 15 @@ -115,7 +115,7 @@ groups: labels: severity: warning annotations: - summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min." + summary: "Federate : L'autonomie restante sur {{ $labels.exported_instance }} est de {{ $value }} min." - alert: UpsTimeRemainingCritical expr: upsEstimatedMinutesRemaining < 5 @@ -123,7 +123,7 @@ groups: labels: severity: critical annotations: - summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min." + summary: "Federate : L'autonomie restante sur {{ $labels.exported_instance }} est de {{ $value }} min." {% endraw %} diff --git a/roles/prometheus-federate/templates/prometheus/django.rules.yml.j2 b/roles/prometheus_federate/templates/prometheus/django.rules.yml.j2 similarity index 100% rename from roles/prometheus-federate/templates/prometheus/django.rules.yml.j2 rename to roles/prometheus_federate/templates/prometheus/django.rules.yml.j2 diff --git a/roles/prometheus-federate/templates/prometheus/prometheus.yml.j2 b/roles/prometheus_federate/templates/prometheus/prometheus.yml.j2 similarity index 98% rename from roles/prometheus-federate/templates/prometheus/prometheus.yml.j2 rename to roles/prometheus_federate/templates/prometheus/prometheus.yml.j2 index 0d4c601..52e5a92 100644 --- a/roles/prometheus-federate/templates/prometheus/prometheus.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus/prometheus.yml.j2 @@ -52,4 +52,5 @@ scrape_configs: - '{job="ups_snmp"}' - '{job="django"}' - '{job="docker"}' + - '{job="switch"}' diff --git a/roles/prometheus-federate/templates/prometheus/snmp.yml.j2 b/roles/prometheus_federate/templates/prometheus/snmp.yml.j2 similarity index 100% rename from roles/prometheus-federate/templates/prometheus/snmp.yml.j2 rename to roles/prometheus_federate/templates/prometheus/snmp.yml.j2 diff --git a/roles/prometheus-federate/templates/update-motd.d/05-service.j2 b/roles/prometheus_federate/templates/update-motd.d/05-service.j2 similarity index 100% rename from roles/prometheus-federate/templates/update-motd.d/05-service.j2 rename to roles/prometheus_federate/templates/update-motd.d/05-service.j2 From 61001e09f52ba9bd34e094ff26981464a129d1b5 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 17 Feb 2021 18:08:39 +0100 Subject: [PATCH 08/11] Add alert for load usage --- roles/prometheus/templates/prometheus/alert.rules.yml.j2 | 9 +++++++++ .../templates/prometheus/alert.rules.yml.j2 | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 index e2cb42c..d4eec79 100644 --- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 @@ -59,6 +59,15 @@ groups: severity: warning annotations: summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}" + + # Check load of instance + - alert: LoadUsage + expr: node_load1 > 5 + for: 2m + labels: + severity: warning + annotations: + summary: "La charge de {{ $labels.instance }} est à {{ $value }} % !" 
# Check UPS - alert: UpsOutputSourceChanged diff --git a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 index 0fd14f5..d05b451 100644 --- a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 @@ -60,6 +60,15 @@ groups: annotations: summary: "Federate : {{ $labels.name }} a échoué sur {{ $labels.exported_instance }}" + # Check load of instance + - alert: LoadUsage + expr: node_load1 > 5 + for: 2m + labels: + severity: warning + annotations: + summary: "Federate : la charge de {{ $labels.exported_instance }} est à {{ $value }} % !" + # Check UPS - alert: UpsOutputSourceChanged expr: upsOutputSource != 3 From 0b90c9944b56698185a8e7fe9198efb1fcf0f9be Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 17 Feb 2021 18:15:31 +0100 Subject: [PATCH 09/11] Fix CI warning from last commit --- roles/prometheus_federate/handlers/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/roles/prometheus_federate/handlers/main.yml b/roles/prometheus_federate/handlers/main.yml index d648db2..4214def 100644 --- a/roles/prometheus_federate/handlers/main.yml +++ b/roles/prometheus_federate/handlers/main.yml @@ -3,4 +3,3 @@ service: name: prometheus state: restarted - From b278b02bc2f3d5f5119aba709aca93fcd6dc679c Mon Sep 17 00:00:00 2001 From: pz2891 Date: Wed, 17 Feb 2021 19:37:33 +0100 Subject: [PATCH 10/11] Remove percentage sign for load alert --- roles/prometheus/templates/prometheus/alert.rules.yml.j2 | 2 +- .../prometheus_federate/templates/prometheus/alert.rules.yml.j2 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 index d4eec79..028d5d0 100644 --- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 @@ -67,7 +67,7 @@ groups: labels: severity: warning annotations: - summary: "La charge de {{ $labels.instance }} est à {{ $value }} % !" + summary: "La charge de {{ $labels.instance }} est à {{ $value }} !" # Check UPS - alert: UpsOutputSourceChanged diff --git a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 index d05b451..f78df48 100644 --- a/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus_federate/templates/prometheus/alert.rules.yml.j2 @@ -67,7 +67,7 @@ groups: labels: severity: warning annotations: - summary: "Federate : la charge de {{ $labels.exported_instance }} est à {{ $value }} % !" + summary: "Federate : la charge de {{ $labels.exported_instance }} est à {{ $value }} !" 
# Check UPS - alert: UpsOutputSourceChanged From d7d0676f5e82349907b3230e41dc290275705159 Mon Sep 17 00:00:00 2001 From: pz2891 Date: Thu, 18 Feb 2021 17:53:15 +0100 Subject: [PATCH 11/11] Remove .save file; remove fo fleming prometheus --- monitoring.yml | 2 +- roles/prometheus/tasks/main.yml.save | 84 ---------------------------- 2 files changed, 1 insertion(+), 85 deletions(-) delete mode 100644 roles/prometheus/tasks/main.yml.save diff --git a/monitoring.yml b/monitoring.yml index c81934c..53bdae7 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -1,6 +1,6 @@ #!/usr/bin/env ansible-playbook --- -- hosts: prometheus-fleming.adm.auro.re,prometheus-fleming-fo.adm.auro.re +- hosts: prometheus-fleming.adm.auro.re vars: prometheus_alertmanager: docker-ovh.adm.auro.re:9093 snmp_unifi_password: "{{ vault_snmp_unifi_password }}" diff --git a/roles/prometheus/tasks/main.yml.save b/roles/prometheus/tasks/main.yml.save deleted file mode 100644 index 57945ce..0000000 --- a/roles/prometheus/tasks/main.yml.save +++ /dev/null @@ -1,84 +0,0 @@ ---- -- name: Install Prometheus - apt: - update_cache: true - name: - - prometheus - - prometheus-snmp-exporter - register: apt_result - retries: 3 - until: apt_result is succeeded - -- name: Configure Prometheus - template: - src: prometheus/prometheus.yml.j2 - dest: /etc/prometheus/prometheus.yml - mode: 0644 - notify: Restart Prometheus - -- name: Configure Prometheus alert rules - template: - src: "prometheus/{{ item }}.j2" - dest: "/etc/prometheus/{{ item }}" - mode: 0644 - notify: Restart Prometheus - loop: - - alert.rules.yml - - django.rules.yml - -- name: Make Prometheus snmp-exporter listen on localhost only - lineinfile: - path: /etc/default/prometheus-snmp-exporter - regexp: '^ARGS=' - line: "ARGS=\"--web.listen-address=127.0.0.1:9116\"" - notify: Restart prometheus-snmp-exporter - -# This file store SNMP OIDs -- name: Configure Prometheus snmp-exporter - template: - src: "prometheus/snmp.yml.j2" - dest: "/etc/prometheus/snmp.yml" - mode: 0600 - owner: prometheus - notify: Restart prometheus-snmp-exporter - -# We don't need to restart Prometheus when updating nodes -- name: Configure Prometheus nodes - copy: - content: "{{ prometheus_targets | to_nice_json }}" - dest: /etc/prometheus/targets.json - mode: 0644 - -# We don't need to restart Prometheus when updating nodes -- name: Configure Prometheus Ubiquity Unifi SNMP devices - copy: - content: "{{ prometheus_unifi_snmp_targets | to_nice_json }}" - dest: /etc/prometheus/targets_unifi_snmp.json - mode: 0644 - when: prometheus_unifi_snmp_targets is defined - -- name: Configure Prometheus UPS SNMP devices - copy: - content: "{{ [{'targets': prometheus_ups_snmp_targets }]7yk[:Cp_g$#dT'yv!. | to_nice_json }}\n" - dest: /etc/prometheus/targets_ups_snmp.json - mode: 0644 - when: prometheus_ups_snmp_targets is defined - -- name: Configure Prometheus docker monitoring - copy: - content: "{{ [{'targets': prometheus_docker_targets }] | to_nice_json }}\n" - dest: /etc/prometheus/targets_docker.json - mode: 0644 - when: prometheus_docker_targets is defined - -- name: Activate prometheus service - systemd: - name: prometheus - enabled: true - state: started - -- name: Indicate role in motd - template: - src: update-motd.d/05-service.j2 - dest: /etc/update-motd.d/05-prometheus - mode: 0755
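
Note on the generated federation targets: the "Configure Prometheus Federate devices" task above renders /etc/prometheus/targets.json from prometheus_targets with to_nice_json. A minimal sketch of the resulting file_sd file, assuming the host list defined for prometheus-federate.adm.auro.re in monitoring.yml and Ansible's default 4-space JSON indentation; the relabel_configs in prometheus.yml.j2 then rewrite each entry to <host>:9090 and scrape its /federate endpoint with the configured match[] selectors:

[
    {
        "targets": [
            "prometheus-edc.adm.auro.re",
            "prometheus-gs.adm.auro.re",
            "prometheus-fleming.adm.auro.re",
            "prometheus-pacaterie.adm.auro.re",
            "prometheus-rives.adm.auro.re",
            "prometheus-aurore.adm.auro.re",
            "prometheus-ovh.adm.auro.re",
            "prometheus-federate.adm.auro.re"
        ]
    }
]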