Merge pull request 'monitoring_pdu' (#3) from monitoring_pdu into master
Some checks failed
continuous-integration/drone/push Build is failing

Reviewed-on: Aurore/ansible#3
This commit is contained in:
otthorn 2021-01-29 20:32:12 +01:00
commit 3da5dde917
7 changed files with 212 additions and 11 deletions

14
hosts
View file

@ -29,13 +29,15 @@ stream.adm.auro.re
re2o-server.adm.auro.re re2o-server.adm.auro.re
re2o-ldap.adm.auro.re re2o-ldap.adm.auro.re
re2o-db.adm.auro.re re2o-db.adm.auro.re
pendragon.adm.auro.re
services-bdd-local.adm.auro.re services-bdd-local.adm.auro.re
backup.adm.auro.re backup.adm.auro.re
services-web.adm.auro.re services-web.adm.auro.re
mail.adm.auro.re mail.adm.auro.re
wikijs.adm.auro.re wikijs.adm.auro.re
prometheus-aurore.adm.auro.re
[aurore_testing_vm]
pendragon.adm.auro.re
############################################################################### ###############################################################################
# OVH # OVH
@ -337,6 +339,7 @@ gf-5-1.borne.auro.re
# Les Rives # Les Rives
[rives_pve] [rives_pve]
thor.adm.auro.re thor.adm.auro.re
loki.adm.auro.re
[rives_vm] [rives_vm]
dhcp-rives-backup.adm.auro.re dhcp-rives-backup.adm.auro.re
@ -345,6 +348,7 @@ dns-rives-backup.adm.auro.re
radius-rives-backup.adm.auro.re radius-rives-backup.adm.auro.re
routeur-rives-backup.adm.auro.re routeur-rives-backup.adm.auro.re
ldap-replica-rives.adm.auro.re ldap-replica-rives.adm.auro.re
prometheus-rives.adm.auro.re
[rives_unifi] [rives_unifi]
r3-4-4.borne.auro.re r3-4-4.borne.auro.re
@ -396,29 +400,31 @@ ovh_vm
[fleming:children] [fleming:children]
fleming_pve fleming_pve
fleming_vm fleming_vm
#fleming_unifi fleming_unifi
# everything at pacaterie # everything at pacaterie
[pacaterie:children] [pacaterie:children]
pacaterie_pve pacaterie_pve
pacaterie_vm pacaterie_vm
#pacaterie_unifi pacaterie_unifi
# everything at edc # everything at edc
[edc:children] [edc:children]
edc_pve edc_pve
edc_vm edc_vm
edc_unifi
# everything at georgesand # everything at georgesand
[gs:children] [gs:children]
gs_pve gs_pve
gs_vm gs_vm
gs_unifi
# everything at Les Rives # everything at Les Rives
[rives:children] [rives:children]
rives_pve rives_pve
rives_vm rives_vm
rives_unifi
############################################################################### ###############################################################################
# Groups by type # Groups by type

View file

@ -14,7 +14,7 @@
roles: roles:
- prometheus - prometheus
- hosts: prometheus-pacaterie.adm.auro.re,prometheus-pacaterie-fo.adm.auro.re - hosts: prometheus-pacaterie.adm.auro.re
vars: vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093 prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}" snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
@ -25,6 +25,8 @@
{{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }} {{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }}
prometheus_unifi_snmp_targets: prometheus_unifi_snmp_targets:
- targets: "{{ groups['pacaterie_unifi'] | list | sort }}" - targets: "{{ groups['pacaterie_unifi'] | list | sort }}"
prometheus_ups_snmp_targets:
- ups-pn-1.ups.auro.re
roles: roles:
- prometheus - prometheus
@ -34,6 +36,9 @@
snmp_unifi_password: "{{ vault_snmp_unifi_password }}" snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# Prometheus targets.json # Prometheus targets.json
prometheus_ups_snmp_targets:
- ups-ec-1.ups.auro.re
prometheus_targets: prometheus_targets:
- targets: | - targets: |
{{ groups['edc_pve'] + groups['edc_vm'] | list | sort }} {{ groups['edc_pve'] + groups['edc_vm'] | list | sort }}
@ -53,10 +58,42 @@
{{ groups['gs_pve'] + groups['gs_vm'] | list | sort }} {{ groups['gs_pve'] + groups['gs_vm'] | list | sort }}
prometheus_unifi_snmp_targets: prometheus_unifi_snmp_targets:
- targets: "{{ groups['gs_unifi'] | list | sort }}" - targets: "{{ groups['gs_unifi'] | list | sort }}"
prometheus_ups_snmp_targets:
- ups-gk-1.ups.auro.re
roles: roles:
- prometheus - prometheus
# Prometheus server for the Les Rives site: scrapes the rives_pve and
# rives_vm inventory groups, the rives_unifi devices (SNMP) and one UPS (SNMP).
- hosts: prometheus-rives.adm.auro.re
  vars:
    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"

    # Prometheus targets.json
    prometheus_ups_snmp_targets:
      - ups-r3-1.ups.auro.re
    prometheus_targets:
      # The parentheses matter: a Jinja filter binds tighter than `+`, so
      # without them only the last group would be passed through `list | sort`
      # and the concatenated target list would not be sorted as a whole.
      - targets: |
          {{ (groups['rives_pve'] + groups['rives_vm']) | list | sort }}
    prometheus_unifi_snmp_targets:
      - targets: "{{ groups['rives_unifi'] | list | sort }}"
  roles:
    - prometheus
# Prometheus server scraping the aurore_* and ovh_* hypervisor and VM groups.
# No Unifi or UPS SNMP targets are declared for this play.
- hosts: prometheus-aurore.adm.auro.re
  vars:
    prometheus_alertmanager: docker-ovh.adm.auro.re:9093
    snmp_unifi_password: "{{ vault_snmp_unifi_password }}"

    # Prometheus targets.json
    prometheus_targets:
      # The parentheses matter: a Jinja filter binds tighter than `+`, so
      # without them only groups['ovh_vm'] would be passed through
      # `list | sort` and the merged target list would stay unsorted.
      - targets: |
          {{ (groups['aurore_pve'] + groups['aurore_vm'] + groups['ovh_pve'] + groups['ovh_vm']) | list | sort }}
  roles:
    - prometheus
# Monitor all hosts # Monitor all hosts
- hosts: all,!unifi,!ovh - hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
roles: roles:
- prometheus_node - prometheus_node

View file

@ -55,6 +55,14 @@
content: "{{ prometheus_unifi_snmp_targets | to_nice_json }}" content: "{{ prometheus_unifi_snmp_targets | to_nice_json }}"
dest: /etc/prometheus/targets_unifi_snmp.json dest: /etc/prometheus/targets_unifi_snmp.json
mode: 0644 mode: 0644
when: prometheus_unifi_snmp_targets is defined
# Render the list of UPS hosts as a one-entry target-group JSON file at
# /etc/prometheus/targets_ups_snmp.json. Skipped on hosts that do not define
# prometheus_ups_snmp_targets.
- name: Configure Prometheus UPS SNMP devices
  copy:
    content: "{{ [{'targets': prometheus_ups_snmp_targets }] | to_nice_json }}\n"
    dest: /etc/prometheus/targets_ups_snmp.json
    # Quoted so the mode is passed as the string "0644" rather than being
    # re-typed by the YAML parser as an octal/decimal integer.
    mode: "0644"
  when: prometheus_ups_snmp_targets is defined
- name: Activate prometheus service - name: Activate prometheus service
systemd: systemd:

View file

@ -59,4 +59,71 @@ groups:
severity: warning severity: warning
annotations: annotations:
summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}" summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
# Check UPS
# Warn when a UPS has reported an output source other than 3 for 5 minutes
# (3 is `normal` in the UPS-MIB upsOutputSource enumeration).
- alert: UpsOutputSourceChanged
  expr: 'upsOutputSource != 3'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
# Warn when upsBatteryStatus has been 3 for 5 minutes
# (3 is `batteryLow` in the UPS-MIB upsBatteryStatus enumeration).
- alert: UpsBatteryStatusWarning
  expr: 'upsBatteryStatus == 3'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "L'état de la batterie de {{ $labels.instance }} est faible !"
# Fires when upsBatteryStatus has been 4 for 5 minutes
# (4 is `batteryDepleted` in the UPS-MIB upsBatteryStatus enumeration).
- alert: UpsBatteryStatusCritical
  expr: 'upsBatteryStatus == 4'
  for: 5m
  labels:
    # Was `warning`, which contradicted the alert name and made a depleted
    # battery indistinguishable from the batteryLow warning above.
    severity: critical
  annotations:
    summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !"
# Critical when an output line has reported more than 70 % load for 5 minutes.
# $value in the summary is the measured upsOutputPercentLoad.
- alert: UpsHighLoad
  expr: 'upsOutputPercentLoad > 70'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
# Warn when the input voltage has been outside the 210-250 range for 5 minutes.
# `or` is the PromQL set union: any series matching either bound is reported.
- alert: UpsWrongInputVoltage
  expr: '(upsInputVoltage < 210) or (upsInputVoltage > 250)'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
# Warn when the output voltage has been outside the 220-240 range for 5 minutes
# (a tighter window than the one applied to the input voltage).
- alert: UpsWrongOutputVoltage
  expr: '(upsOutputVoltage < 220) or (upsOutputVoltage > 240)'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
# Warn when the estimated battery runtime has been below 15 minutes
# for 5 minutes. $value in the summary is the reported number of minutes.
- alert: UpsTimeRemainingWarning
  expr: 'upsEstimatedMinutesRemaining < 15'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
# Critical when the estimated battery runtime drops below 5 minutes.
- alert: UpsTimeRemainingCritical
  expr: 'upsEstimatedMinutesRemaining < 5'
  # Was 5m: with fewer than 5 minutes of runtime left, waiting 5 minutes
  # before firing means the alert can only arrive once the battery is
  # already exhausted. 1m still filters a single bad scrape.
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
{% endraw %} {% endraw %}

View file

@ -65,3 +65,19 @@ scrape_configs:
scheme: https scheme: https
static_configs: static_configs:
- targets: [] - targets: []
# Scrape UPS devices through an SNMP proxy exporter, using its `eatonups`
# module. Targets are read from the JSON file listed under file_sd_configs.
- job_name: ups_snmp
  file_sd_configs:
    - files:
        - '/etc/prometheus/targets_ups_snmp.json'
  metrics_path: /snmp
  params:
    module:
      - eatonups
  relabel_configs:
    # Pass the discovered device address as the `target` URL parameter...
    - source_labels:
        - __address__
      target_label: __param_target
    # ...keep that device name as the `instance` label...
    - source_labels:
        - __param_target
      target_label: instance
    # ...and send the actual HTTP request to the exporter on localhost
    # (presumably snmp_exporter, given the port and the /snmp path).
    - target_label: __address__
      replacement: 127.0.0.1:9116

View file

@ -6,6 +6,78 @@
# - Optimiser les règles pour les bornes Unifi, # - Optimiser les règles pour les bornes Unifi,
# on pourrait indexer avec les SSID # on pourrait indexer avec les SSID
# SNMP module for UPS units (queried with SNMP v1, community `public`).
# Walks the .33.1.2 / .33.1.3 / .33.1.4 subtrees (battery, input and output
# metrics below) plus the 1.3.6.1.4.1.534.1.6 subtree (xupsEnv* sensors),
# and fetches sysUpTime with a single GET.
eatonups:
  walk:
    - 1.3.6.1.2.1.33.1.2
    - 1.3.6.1.2.1.33.1.3
    - 1.3.6.1.2.1.33.1.4
    - 1.3.6.1.4.1.534.1.6
  get:
    - 1.3.6.1.2.1.1.3.0
  metrics:
    # --- system ---
    - name: sysUpTime
      oid: 1.3.6.1.2.1.1.3
      type: gauge
      help: The time (in hundredths of a second) since the network management portion
        of the system was last re-initialized. - 1.3.6.1.2.1.1.3
    # --- battery ---
    - name: upsBatteryStatus
      oid: 1.3.6.1.2.1.33.1.2.1
      type: gauge
      help: The indication of the capacity remaining in the UPS system's batteries -
        1.3.6.1.2.1.33.1.2.1
    - name: upsEstimatedMinutesRemaining
      oid: 1.3.6.1.2.1.33.1.2.3
      type: gauge
      help: An estimate of the time to battery charge depletion under the present load
        conditions if the utility power is off and remains off, or if it were to be
        lost and remain off. - 1.3.6.1.2.1.33.1.2.3
    # --- input (one series per input line) ---
    - name: upsInputVoltage
      oid: 1.3.6.1.2.1.33.1.3.3.1.3
      type: gauge
      help: The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3
      indexes:
        - labelname: upsInputLineIndex
          type: gauge
    # --- output (table metrics yield one series per output line) ---
    - name: upsOutputSource
      oid: 1.3.6.1.2.1.33.1.4.1
      type: gauge
      help: The present source of output power - 1.3.6.1.2.1.33.1.4.1
    - name: upsOutputVoltage
      oid: 1.3.6.1.2.1.33.1.4.4.1.2
      type: gauge
      help: The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2
      indexes:
        - labelname: upsOutputLineIndex
          type: gauge
    - name: upsOutputPower
      oid: 1.3.6.1.2.1.33.1.4.4.1.4
      type: gauge
      help: The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4
      indexes:
        - labelname: upsOutputLineIndex
          type: gauge
    - name: upsOutputPercentLoad
      oid: 1.3.6.1.2.1.33.1.4.4.1.5
      type: gauge
      help: The percentage of the UPS power capacity presently being used on this output
        line, i.e., the greater of the percent load of true power capacity and the percent
        load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5
      indexes:
        - labelname: upsOutputLineIndex
          type: gauge
    # --- environment sensors ---
    - name: xupsEnvRemoteTemp
      oid: 1.3.6.1.4.1.534.1.6.5
      type: gauge
      help: The reading of an EMP's temperature sensor. - 1.3.6.1.4.1.534.1.6.5
    - name: xupsEnvRemoteHumidity
      oid: 1.3.6.1.4.1.534.1.6.6
      type: gauge
      help: The reading of an EMP's humidity sensor. - 1.3.6.1.4.1.534.1.6.6
  version: 1
  auth:
    community: public
procurve_switch: procurve_switch:
walk: walk:
- 1.3.6.1.2.1.31.1.1.1.10 - 1.3.6.1.2.1.31.1.1.1.10

View file

@ -1,5 +0,0 @@
#!/bin/bash
# Install the local SSH public key on every host listed in the `hosts`
# inventory file whose line contains "pacaterie.adm.auro.re".
#
# grep -F matches the pattern literally, so the dots are no longer regex
# wildcards; grep reads the file directly instead of going through `cat`.
# The command substitution is intentionally left unquoted so that each
# whitespace-separated word becomes one loop item, as in the original.
for ip in $(grep -F 'pacaterie.adm.auro.re' hosts); do
    ssh-copy-id "$ip"
done