diff --git a/hosts b/hosts index 98a435e..eec54a0 100644 --- a/hosts +++ b/hosts @@ -29,13 +29,15 @@ stream.adm.auro.re re2o-server.adm.auro.re re2o-ldap.adm.auro.re re2o-db.adm.auro.re -pendragon.adm.auro.re services-bdd-local.adm.auro.re backup.adm.auro.re services-web.adm.auro.re mail.adm.auro.re wikijs.adm.auro.re +prometheus-aurore.adm.auro.re +[aurore_testing_vm] +pendragon.adm.auro.re ############################################################################### # OVH @@ -337,6 +339,7 @@ gf-5-1.borne.auro.re # Les Rives [rives_pve] thor.adm.auro.re +loki.adm.auro.re [rives_vm] dhcp-rives-backup.adm.auro.re @@ -345,6 +348,7 @@ dns-rives-backup.adm.auro.re radius-rives-backup.adm.auro.re routeur-rives-backup.adm.auro.re ldap-replica-rives.adm.auro.re +prometheus-rives.adm.auro.re [rives_unifi] r3-4-4.borne.auro.re @@ -396,29 +400,31 @@ ovh_vm [fleming:children] fleming_pve fleming_vm -#fleming_unifi +fleming_unifi # everything at pacaterie [pacaterie:children] pacaterie_pve pacaterie_vm -#pacaterie_unifi +pacaterie_unifi # everything at edc [edc:children] edc_pve edc_vm +edc_unifi # everything at georgesand [gs:children] gs_pve gs_vm +gs_unifi # everything at Les Rives [rives:children] rives_pve rives_vm - +rives_unifi ############################################################################### # Groups by type diff --git a/monitoring.yml b/monitoring.yml index 714baa6..c31fe86 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -14,7 +14,7 @@ roles: - prometheus -- hosts: prometheus-pacaterie.adm.auro.re,prometheus-pacaterie-fo.adm.auro.re +- hosts: prometheus-pacaterie.adm.auro.re vars: prometheus_alertmanager: docker-ovh.adm.auro.re:9093 snmp_unifi_password: "{{ vault_snmp_unifi_password }}" @@ -25,6 +25,8 @@ {{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }} prometheus_unifi_snmp_targets: - targets: "{{ groups['pacaterie_unifi'] | list | sort }}" + prometheus_ups_snmp_targets: + - ups-pn-1.ups.auro.re roles: - prometheus @@ -34,6 +36,9 @@ snmp_unifi_password: "{{ vault_snmp_unifi_password }}" # Prometheus targets.json + prometheus_ups_snmp_targets: + - ups-ec-1.ups.auro.re + prometheus_targets: - targets: | {{ groups['edc_pve'] + groups['edc_vm'] | list | sort }} @@ -53,10 +58,42 @@ {{ groups['gs_pve'] + groups['gs_vm'] | list | sort }} prometheus_unifi_snmp_targets: - targets: "{{ groups['gs_unifi'] | list | sort }}" + prometheus_ups_snmp_targets: + - ups-gk-1.ups.auro.re roles: - prometheus +- hosts: prometheus-rives.adm.auro.re + vars: + prometheus_alertmanager: docker-ovh.adm.auro.re:9093 + snmp_unifi_password: "{{ vault_snmp_unifi_password }}" + + # Prometheus targets.json + prometheus_ups_snmp_targets: + - ups-r3-1.ups.auro.re + + prometheus_targets: + - targets: | + {{ groups['rives_pve'] + groups['rives_vm'] | list | sort }} + prometheus_unifi_snmp_targets: + - targets: "{{ groups['rives_unifi'] | list | sort }}" + roles: + - prometheus + +- hosts: prometheus-aurore.adm.auro.re + vars: + prometheus_alertmanager: docker-ovh.adm.auro.re:9093 + snmp_unifi_password: "{{ vault_snmp_unifi_password }}" + + # Prometheus targets.json + prometheus_targets: + - targets: | + {{ groups['aurore_pve'] + groups['aurore_vm'] + groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} + roles: + - prometheus + + # Monitor all hosts -- hosts: all,!unifi,!ovh +- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container roles: - prometheus_node diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index 211aee3..8697ef9 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -55,6 +55,14 @@ content: "{{ prometheus_unifi_snmp_targets | to_nice_json }}" dest: /etc/prometheus/targets_unifi_snmp.json mode: 0644 + when: prometheus_unifi_snmp_targets is defined + +- name: Configure Prometheus UPS SNMP devices + copy: + content: "{{ [{'targets': prometheus_ups_snmp_targets }] | to_nice_json }}\n" + dest: /etc/prometheus/targets_ups_snmp.json + mode: 0644 + when: prometheus_ups_snmp_targets is defined - name: Activate prometheus service systemd: diff --git a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 index 2a10358..7ae1928 100644 --- a/roles/prometheus/templates/prometheus/alert.rules.yml.j2 +++ b/roles/prometheus/templates/prometheus/alert.rules.yml.j2 @@ -59,4 +59,71 @@ groups: severity: warning annotations: summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}" + + # Check UPS + - alert: UpsOutputSourceChanged + expr: upsOutputSource != 3 + for: 5m + labels: + severity: warning + annotations: + summary: "La source d'alimentation de {{ $labels.instance }} a changé !" + + - alert: UpsBatteryStatusWarning + expr: upsBatteryStatus == 3 + for: 5m + labels: + severity: warning + annotations: + summary: "L'état de la batterie de {{ $labels.instance }} est faible !" + + - alert: UpsBatteryStatusCritical + expr: upsBatteryStatus == 4 + for: 5m + labels: + severity: warning + annotations: + summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !" + + - alert: UpsHighLoad + expr: upsOutputPercentLoad > 70 + for: 5m + labels: + severity: critical + annotations: + summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !" + + - alert: UpsWrongInputVoltage + expr: (upsInputVoltage < 210) or (upsInputVoltage > 250) + for: 5m + labels: + severity: warning + annotations: + summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V." + + - alert: UpsWrongOutputVoltage + expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240) + for: 5m + labels: + severity: warning + annotations: + summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V." + + - alert: UpsTimeRemainingWarning + expr: upsEstimatedMinutesRemaining < 15 + for: 5m + labels: + severity: warning + annotations: + summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min." + + - alert: UpsTimeRemainingCritical + expr: upsEstimatedMinutesRemaining < 5 + for: 5m + labels: + severity: critical + annotations: + summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min." + + {% endraw %} diff --git a/roles/prometheus/templates/prometheus/prometheus.yml.j2 b/roles/prometheus/templates/prometheus/prometheus.yml.j2 index 31df6bd..e35a0cf 100644 --- a/roles/prometheus/templates/prometheus/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus/prometheus.yml.j2 @@ -65,3 +65,19 @@ scrape_configs: scheme: https static_configs: - targets: [] + + - job_name: ups_snmp + file_sd_configs: + - files: + - '/etc/prometheus/targets_ups_snmp.json' + metrics_path: /snmp + params: + module: [eatonups] + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: 127.0.0.1:9116 + diff --git a/roles/prometheus/templates/prometheus/snmp.yml.j2 b/roles/prometheus/templates/prometheus/snmp.yml.j2 index 84dcb65..5968095 100644 --- a/roles/prometheus/templates/prometheus/snmp.yml.j2 +++ b/roles/prometheus/templates/prometheus/snmp.yml.j2 @@ -6,6 +6,78 @@ # - Optimiser les règles pour les bornes Unifi, # on pourrait indexer avec les SSID +eatonups: + walk: + - 1.3.6.1.2.1.33.1.2 + - 1.3.6.1.2.1.33.1.3 + - 1.3.6.1.2.1.33.1.4 + - 1.3.6.1.4.1.534.1.6 + get: + - 1.3.6.1.2.1.1.3.0 + metrics: + - name: sysUpTime + oid: 1.3.6.1.2.1.1.3 + type: gauge + help: The time (in hundredths of a second) since the network management portion + of the system was last re-initialized. - 1.3.6.1.2.1.1.3 + - name: upsBatteryStatus + oid: 1.3.6.1.2.1.33.1.2.1 + type: gauge + help: The indication of the capacity remaining in the UPS system's batteries - + 1.3.6.1.2.1.33.1.2.1 + - name: upsEstimatedMinutesRemaining + oid: 1.3.6.1.2.1.33.1.2.3 + type: gauge + help: An estimate of the time to battery charge depletion under the present load + conditions if the utility power is off and remains off, or if it were to be + lost and remain off. - 1.3.6.1.2.1.33.1.2.3 + - name: upsInputVoltage + oid: 1.3.6.1.2.1.33.1.3.3.1.3 + type: gauge + help: The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3 + indexes: + - labelname: upsInputLineIndex + type: gauge + - name: upsOutputSource + oid: 1.3.6.1.2.1.33.1.4.1 + type: gauge + help: The present source of output power - 1.3.6.1.2.1.33.1.4.1 + - name: upsOutputVoltage + oid: 1.3.6.1.2.1.33.1.4.4.1.2 + type: gauge + help: The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2 + indexes: + - labelname: upsOutputLineIndex + type: gauge + - name: upsOutputPower + oid: 1.3.6.1.2.1.33.1.4.4.1.4 + type: gauge + help: The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4 + indexes: + - labelname: upsOutputLineIndex + type: gauge + - name: upsOutputPercentLoad + oid: 1.3.6.1.2.1.33.1.4.4.1.5 + type: gauge + help: The percentage of the UPS power capacity presently being used on this output + line, i.e., the greater of the percent load of true power capacity and the percent + load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5 + indexes: + - labelname: upsOutputLineIndex + type: gauge + - name: xupsEnvRemoteTemp + oid: 1.3.6.1.4.1.534.1.6.5 + type: gauge + help: The reading of an EMP's temperature sensor. - 1.3.6.1.4.1.534.1.6.5 + - name: xupsEnvRemoteHumidity + oid: 1.3.6.1.4.1.534.1.6.6 + type: gauge + help: The reading of an EMP's humidity sensor. - 1.3.6.1.4.1.534.1.6.6 + version: 1 + auth: + community: public + + procurve_switch: walk: - 1.3.6.1.2.1.31.1.1.1.10 diff --git a/test.sh b/test.sh deleted file mode 100755 index 3e77d04..0000000 --- a/test.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -for ip in `cat hosts|grep pacaterie.adm.auro.re`; do - ssh-copy-id $ip -done -