monitoring_pdu #3

Merged
otthorn merged 9 commits from monitoring_pdu into master 2021-01-29 20:32:13 +01:00
7 changed files with 212 additions and 11 deletions

14
hosts
View file

@ -29,13 +29,15 @@ stream.adm.auro.re
re2o-server.adm.auro.re
re2o-ldap.adm.auro.re
re2o-db.adm.auro.re
pendragon.adm.auro.re
services-bdd-local.adm.auro.re
backup.adm.auro.re
services-web.adm.auro.re
mail.adm.auro.re
wikijs.adm.auro.re
prometheus-aurore.adm.auro.re
[aurore_testing_vm]
pendragon.adm.auro.re
###############################################################################
# OVH
@ -337,6 +339,7 @@ gf-5-1.borne.auro.re
# Les Rives
[rives_pve]
thor.adm.auro.re
loki.adm.auro.re
[rives_vm]
dhcp-rives-backup.adm.auro.re
@ -345,6 +348,7 @@ dns-rives-backup.adm.auro.re
radius-rives-backup.adm.auro.re
routeur-rives-backup.adm.auro.re
ldap-replica-rives.adm.auro.re
prometheus-rives.adm.auro.re
[rives_unifi]
r3-4-4.borne.auro.re
@ -396,29 +400,31 @@ ovh_vm
[fleming:children]
fleming_pve
fleming_vm
#fleming_unifi
fleming_unifi
# everything at pacaterie
[pacaterie:children]
pacaterie_pve
pacaterie_vm
#pacaterie_unifi
pacaterie_unifi
# everything at edc
[edc:children]
edc_pve
edc_vm
edc_unifi
# everything at georgesand
[gs:children]
gs_pve
gs_vm
gs_unifi
# everything at Les Rives
[rives:children]
rives_pve
rives_vm
rives_unifi
###############################################################################
# Groups by type

View file

@ -14,7 +14,7 @@
roles:
- prometheus
- hosts: prometheus-pacaterie.adm.auro.re,prometheus-pacaterie-fo.adm.auro.re
- hosts: prometheus-pacaterie.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
@ -25,6 +25,8 @@
{{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }}
prometheus_unifi_snmp_targets:
- targets: "{{ groups['pacaterie_unifi'] | list | sort }}"
prometheus_ups_snmp_targets:
- ups-pn-1.ups.auro.re
roles:
- prometheus
@ -34,6 +36,9 @@
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# Prometheus targets.json
prometheus_ups_snmp_targets:
- ups-ec-1.ups.auro.re
prometheus_targets:
- targets: |
{{ groups['edc_pve'] + groups['edc_vm'] | list | sort }}
@ -53,10 +58,42 @@
{{ groups['gs_pve'] + groups['gs_vm'] | list | sort }}
prometheus_unifi_snmp_targets:
- targets: "{{ groups['gs_unifi'] | list | sort }}"
prometheus_ups_snmp_targets:
- ups-gk-1.ups.auro.re
roles:
- prometheus
- hosts: prometheus-rives.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# Prometheus targets.json
prometheus_ups_snmp_targets:
- ups-r3-1.ups.auro.re
prometheus_targets:
- targets: |
{{ groups['rives_pve'] + groups['rives_vm'] | list | sort }}
prometheus_unifi_snmp_targets:
- targets: "{{ groups['rives_unifi'] | list | sort }}"
roles:
- prometheus
- hosts: prometheus-aurore.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# Prometheus targets.json
prometheus_targets:
- targets: |
{{ groups['aurore_pve'] + groups['aurore_vm'] + groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
roles:
- prometheus
# Monitor all hosts
- hosts: all,!unifi,!ovh
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
roles:
- prometheus_node

View file

@ -55,6 +55,14 @@
content: "{{ prometheus_unifi_snmp_targets | to_nice_json }}"
dest: /etc/prometheus/targets_unifi_snmp.json
mode: 0644
when: prometheus_unifi_snmp_targets is defined
- name: Configure Prometheus UPS SNMP devices
copy:
content: "{{ [{'targets': prometheus_ups_snmp_targets }] | to_nice_json }}\n"
dest: /etc/prometheus/targets_ups_snmp.json
mode: 0644
when: prometheus_ups_snmp_targets is defined
- name: Activate prometheus service
systemd:

View file

@ -59,4 +59,71 @@ groups:
severity: warning
annotations:
summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
# Check UPS
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 5m
labels:
severity: warning
annotations:
summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
- alert: UpsBatteryStatusWarning
expr: upsBatteryStatus == 3
for: 5m
labels:
severity: warning
annotations:
summary: "L'état de la batterie de {{ $labels.instance }} est faible !"
- alert: UpsBatteryStatusCritical
expr: upsBatteryStatus == 4
for: 5m
labels:
severity: warning
annotations:
summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !"
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 5m
labels:
severity: critical
annotations:
summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 5m
labels:
severity: warning
annotations:
summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
- alert: UpsWrongOutputVoltage
expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
for: 5m
labels:
severity: warning
annotations:
summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
- alert: UpsTimeRemainingWarning
expr: upsEstimatedMinutesRemaining < 15
for: 5m
labels:
severity: warning
annotations:
summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
- alert: UpsTimeRemainingCritical
expr: upsEstimatedMinutesRemaining < 5
for: 5m
labels:
severity: critical
annotations:
summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
{% endraw %}

View file

@ -65,3 +65,19 @@ scrape_configs:
scheme: https
static_configs:
- targets: []
- job_name: ups_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_ups_snmp.json'
metrics_path: /snmp
params:
module: [eatonups]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9116

View file

@ -6,6 +6,78 @@
# - Optimiser les règles pour les bornes Unifi,
# on pourrait indexer avec les SSID
eatonups:
walk:
- 1.3.6.1.2.1.33.1.2
- 1.3.6.1.2.1.33.1.3
- 1.3.6.1.2.1.33.1.4
- 1.3.6.1.4.1.534.1.6
get:
- 1.3.6.1.2.1.1.3.0
metrics:
- name: sysUpTime
oid: 1.3.6.1.2.1.1.3
type: gauge
help: The time (in hundredths of a second) since the network management portion
of the system was last re-initialized. - 1.3.6.1.2.1.1.3
- name: upsBatteryStatus
oid: 1.3.6.1.2.1.33.1.2.1
type: gauge
help: The indication of the capacity remaining in the UPS system's batteries -
1.3.6.1.2.1.33.1.2.1
- name: upsEstimatedMinutesRemaining
oid: 1.3.6.1.2.1.33.1.2.3
type: gauge
help: An estimate of the time to battery charge depletion under the present load
conditions if the utility power is off and remains off, or if it were to be
lost and remain off. - 1.3.6.1.2.1.33.1.2.3
- name: upsInputVoltage
oid: 1.3.6.1.2.1.33.1.3.3.1.3
type: gauge
help: The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3
indexes:
- labelname: upsInputLineIndex
type: gauge
- name: upsOutputSource
oid: 1.3.6.1.2.1.33.1.4.1
type: gauge
help: The present source of output power - 1.3.6.1.2.1.33.1.4.1
- name: upsOutputVoltage
oid: 1.3.6.1.2.1.33.1.4.4.1.2
type: gauge
help: The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2
indexes:
- labelname: upsOutputLineIndex
type: gauge
- name: upsOutputPower
oid: 1.3.6.1.2.1.33.1.4.4.1.4
type: gauge
help: The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4
indexes:
- labelname: upsOutputLineIndex
type: gauge
- name: upsOutputPercentLoad
oid: 1.3.6.1.2.1.33.1.4.4.1.5
type: gauge
help: The percentage of the UPS power capacity presently being used on this output
line, i.e., the greater of the percent load of true power capacity and the percent
load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5
indexes:
- labelname: upsOutputLineIndex
type: gauge
- name: xupsEnvRemoteTemp
oid: 1.3.6.1.4.1.534.1.6.5
type: gauge
help: The reading of an EMP's temperature sensor. - 1.3.6.1.4.1.534.1.6.5
- name: xupsEnvRemoteHumidity
oid: 1.3.6.1.4.1.534.1.6.6
type: gauge
help: The reading of an EMP's humidity sensor. - 1.3.6.1.4.1.534.1.6.6
version: 1
auth:
community: public
procurve_switch:
walk:
- 1.3.6.1.2.1.31.1.1.1.10

View file

@ -1,5 +0,0 @@
#!/bin/bash
for ip in `cat hosts|grep pacaterie.adm.auro.re`; do
ssh-copy-id $ip
done