Merge pull request 'monitoring_pdu' (#3) from monitoring_pdu into master
Some checks failed
continuous-integration/drone/push Build is failing
Some checks failed
continuous-integration/drone/push Build is failing
Reviewed-on: Aurore/ansible#3
This commit is contained in:
commit
3da5dde917
7 changed files with 212 additions and 11 deletions
14
hosts
14
hosts
|
@ -29,13 +29,15 @@ stream.adm.auro.re
|
||||||
re2o-server.adm.auro.re
|
re2o-server.adm.auro.re
|
||||||
re2o-ldap.adm.auro.re
|
re2o-ldap.adm.auro.re
|
||||||
re2o-db.adm.auro.re
|
re2o-db.adm.auro.re
|
||||||
pendragon.adm.auro.re
|
|
||||||
services-bdd-local.adm.auro.re
|
services-bdd-local.adm.auro.re
|
||||||
backup.adm.auro.re
|
backup.adm.auro.re
|
||||||
services-web.adm.auro.re
|
services-web.adm.auro.re
|
||||||
mail.adm.auro.re
|
mail.adm.auro.re
|
||||||
wikijs.adm.auro.re
|
wikijs.adm.auro.re
|
||||||
|
prometheus-aurore.adm.auro.re
|
||||||
|
|
||||||
|
[aurore_testing_vm]
|
||||||
|
pendragon.adm.auro.re
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
# OVH
|
# OVH
|
||||||
|
@ -337,6 +339,7 @@ gf-5-1.borne.auro.re
|
||||||
# Les Rives
|
# Les Rives
|
||||||
[rives_pve]
|
[rives_pve]
|
||||||
thor.adm.auro.re
|
thor.adm.auro.re
|
||||||
|
loki.adm.auro.re
|
||||||
|
|
||||||
[rives_vm]
|
[rives_vm]
|
||||||
dhcp-rives-backup.adm.auro.re
|
dhcp-rives-backup.adm.auro.re
|
||||||
|
@ -345,6 +348,7 @@ dns-rives-backup.adm.auro.re
|
||||||
radius-rives-backup.adm.auro.re
|
radius-rives-backup.adm.auro.re
|
||||||
routeur-rives-backup.adm.auro.re
|
routeur-rives-backup.adm.auro.re
|
||||||
ldap-replica-rives.adm.auro.re
|
ldap-replica-rives.adm.auro.re
|
||||||
|
prometheus-rives.adm.auro.re
|
||||||
|
|
||||||
[rives_unifi]
|
[rives_unifi]
|
||||||
r3-4-4.borne.auro.re
|
r3-4-4.borne.auro.re
|
||||||
|
@ -396,29 +400,31 @@ ovh_vm
|
||||||
[fleming:children]
|
[fleming:children]
|
||||||
fleming_pve
|
fleming_pve
|
||||||
fleming_vm
|
fleming_vm
|
||||||
#fleming_unifi
|
fleming_unifi
|
||||||
|
|
||||||
# everything at pacaterie
|
# everything at pacaterie
|
||||||
[pacaterie:children]
|
[pacaterie:children]
|
||||||
pacaterie_pve
|
pacaterie_pve
|
||||||
pacaterie_vm
|
pacaterie_vm
|
||||||
#pacaterie_unifi
|
pacaterie_unifi
|
||||||
|
|
||||||
# everything at edc
|
# everything at edc
|
||||||
[edc:children]
|
[edc:children]
|
||||||
edc_pve
|
edc_pve
|
||||||
edc_vm
|
edc_vm
|
||||||
|
edc_unifi
|
||||||
|
|
||||||
# everything at georgesand
|
# everything at georgesand
|
||||||
[gs:children]
|
[gs:children]
|
||||||
gs_pve
|
gs_pve
|
||||||
gs_vm
|
gs_vm
|
||||||
|
gs_unifi
|
||||||
|
|
||||||
# everything at Les Rives
|
# everything at Les Rives
|
||||||
[rives:children]
|
[rives:children]
|
||||||
rives_pve
|
rives_pve
|
||||||
rives_vm
|
rives_vm
|
||||||
|
rives_unifi
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
# Groups by type
|
# Groups by type
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
roles:
|
roles:
|
||||||
- prometheus
|
- prometheus
|
||||||
|
|
||||||
- hosts: prometheus-pacaterie.adm.auro.re,prometheus-pacaterie-fo.adm.auro.re
|
- hosts: prometheus-pacaterie.adm.auro.re
|
||||||
vars:
|
vars:
|
||||||
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
||||||
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
||||||
|
@ -25,6 +25,8 @@
|
||||||
{{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }}
|
{{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }}
|
||||||
prometheus_unifi_snmp_targets:
|
prometheus_unifi_snmp_targets:
|
||||||
- targets: "{{ groups['pacaterie_unifi'] | list | sort }}"
|
- targets: "{{ groups['pacaterie_unifi'] | list | sort }}"
|
||||||
|
prometheus_ups_snmp_targets:
|
||||||
|
- ups-pn-1.ups.auro.re
|
||||||
roles:
|
roles:
|
||||||
- prometheus
|
- prometheus
|
||||||
|
|
||||||
|
@ -34,6 +36,9 @@
|
||||||
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
||||||
|
|
||||||
# Prometheus targets.json
|
# Prometheus targets.json
|
||||||
|
prometheus_ups_snmp_targets:
|
||||||
|
- ups-ec-1.ups.auro.re
|
||||||
|
|
||||||
prometheus_targets:
|
prometheus_targets:
|
||||||
- targets: |
|
- targets: |
|
||||||
{{ groups['edc_pve'] + groups['edc_vm'] | list | sort }}
|
{{ groups['edc_pve'] + groups['edc_vm'] | list | sort }}
|
||||||
|
@ -53,10 +58,42 @@
|
||||||
{{ groups['gs_pve'] + groups['gs_vm'] | list | sort }}
|
{{ groups['gs_pve'] + groups['gs_vm'] | list | sort }}
|
||||||
prometheus_unifi_snmp_targets:
|
prometheus_unifi_snmp_targets:
|
||||||
- targets: "{{ groups['gs_unifi'] | list | sort }}"
|
- targets: "{{ groups['gs_unifi'] | list | sort }}"
|
||||||
|
prometheus_ups_snmp_targets:
|
||||||
|
- ups-gk-1.ups.auro.re
|
||||||
roles:
|
roles:
|
||||||
- prometheus
|
- prometheus
|
||||||
|
|
||||||
|
- hosts: prometheus-rives.adm.auro.re
|
||||||
|
vars:
|
||||||
|
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
||||||
|
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
||||||
|
|
||||||
|
# Prometheus targets.json
|
||||||
|
prometheus_ups_snmp_targets:
|
||||||
|
- ups-r3-1.ups.auro.re
|
||||||
|
|
||||||
|
prometheus_targets:
|
||||||
|
- targets: |
|
||||||
|
{{ groups['rives_pve'] + groups['rives_vm'] | list | sort }}
|
||||||
|
prometheus_unifi_snmp_targets:
|
||||||
|
- targets: "{{ groups['rives_unifi'] | list | sort }}"
|
||||||
|
roles:
|
||||||
|
- prometheus
|
||||||
|
|
||||||
|
- hosts: prometheus-aurore.adm.auro.re
|
||||||
|
vars:
|
||||||
|
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
||||||
|
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
||||||
|
|
||||||
|
# Prometheus targets.json
|
||||||
|
prometheus_targets:
|
||||||
|
- targets: |
|
||||||
|
{{ groups['aurore_pve'] + groups['aurore_vm'] + groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
|
||||||
|
roles:
|
||||||
|
- prometheus
|
||||||
|
|
||||||
|
|
||||||
# Monitor all hosts
|
# Monitor all hosts
|
||||||
- hosts: all,!unifi,!ovh
|
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
|
||||||
roles:
|
roles:
|
||||||
- prometheus_node
|
- prometheus_node
|
||||||
|
|
|
@ -55,6 +55,14 @@
|
||||||
content: "{{ prometheus_unifi_snmp_targets | to_nice_json }}"
|
content: "{{ prometheus_unifi_snmp_targets | to_nice_json }}"
|
||||||
dest: /etc/prometheus/targets_unifi_snmp.json
|
dest: /etc/prometheus/targets_unifi_snmp.json
|
||||||
mode: 0644
|
mode: 0644
|
||||||
|
when: prometheus_unifi_snmp_targets is defined
|
||||||
|
|
||||||
|
- name: Configure Prometheus UPS SNMP devices
|
||||||
|
copy:
|
||||||
|
content: "{{ [{'targets': prometheus_ups_snmp_targets }] | to_nice_json }}\n"
|
||||||
|
dest: /etc/prometheus/targets_ups_snmp.json
|
||||||
|
mode: 0644
|
||||||
|
when: prometheus_ups_snmp_targets is defined
|
||||||
|
|
||||||
- name: Activate prometheus service
|
- name: Activate prometheus service
|
||||||
systemd:
|
systemd:
|
||||||
|
|
|
@ -59,4 +59,71 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
|
summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
|
||||||
|
|
||||||
|
# Check UPS
|
||||||
|
- alert: UpsOutputSourceChanged
|
||||||
|
expr: upsOutputSource != 3
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
|
||||||
|
|
||||||
|
- alert: UpsBatteryStatusWarning
|
||||||
|
expr: upsBatteryStatus == 3
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "L'état de la batterie de {{ $labels.instance }} est faible !"
|
||||||
|
|
||||||
|
- alert: UpsBatteryStatusCritical
|
||||||
|
expr: upsBatteryStatus == 4
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !"
|
||||||
|
|
||||||
|
- alert: UpsHighLoad
|
||||||
|
expr: upsOutputPercentLoad > 70
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
|
||||||
|
|
||||||
|
- alert: UpsWrongInputVoltage
|
||||||
|
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
|
||||||
|
|
||||||
|
- alert: UpsWrongOutputVoltage
|
||||||
|
expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
|
||||||
|
|
||||||
|
- alert: UpsTimeRemainingWarning
|
||||||
|
expr: upsEstimatedMinutesRemaining < 15
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
|
||||||
|
|
||||||
|
- alert: UpsTimeRemainingCritical
|
||||||
|
expr: upsEstimatedMinutesRemaining < 5
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
|
||||||
|
|
||||||
|
|
||||||
{% endraw %}
|
{% endraw %}
|
||||||
|
|
|
@ -65,3 +65,19 @@ scrape_configs:
|
||||||
scheme: https
|
scheme: https
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: []
|
- targets: []
|
||||||
|
|
||||||
|
- job_name: ups_snmp
|
||||||
|
file_sd_configs:
|
||||||
|
- files:
|
||||||
|
- '/etc/prometheus/targets_ups_snmp.json'
|
||||||
|
metrics_path: /snmp
|
||||||
|
params:
|
||||||
|
module: [eatonups]
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__address__]
|
||||||
|
target_label: __param_target
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: instance
|
||||||
|
- target_label: __address__
|
||||||
|
replacement: 127.0.0.1:9116
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,78 @@
|
||||||
# - Optimiser les règles pour les bornes Unifi,
|
# - Optimiser les règles pour les bornes Unifi,
|
||||||
# on pourrait indexer avec les SSID
|
# on pourrait indexer avec les SSID
|
||||||
|
|
||||||
|
eatonups:
|
||||||
|
walk:
|
||||||
|
- 1.3.6.1.2.1.33.1.2
|
||||||
|
- 1.3.6.1.2.1.33.1.3
|
||||||
|
- 1.3.6.1.2.1.33.1.4
|
||||||
|
- 1.3.6.1.4.1.534.1.6
|
||||||
|
get:
|
||||||
|
- 1.3.6.1.2.1.1.3.0
|
||||||
|
metrics:
|
||||||
|
- name: sysUpTime
|
||||||
|
oid: 1.3.6.1.2.1.1.3
|
||||||
|
type: gauge
|
||||||
|
help: The time (in hundredths of a second) since the network management portion
|
||||||
|
of the system was last re-initialized. - 1.3.6.1.2.1.1.3
|
||||||
|
- name: upsBatteryStatus
|
||||||
|
oid: 1.3.6.1.2.1.33.1.2.1
|
||||||
|
type: gauge
|
||||||
|
help: The indication of the capacity remaining in the UPS system's batteries -
|
||||||
|
1.3.6.1.2.1.33.1.2.1
|
||||||
|
- name: upsEstimatedMinutesRemaining
|
||||||
|
oid: 1.3.6.1.2.1.33.1.2.3
|
||||||
|
type: gauge
|
||||||
|
help: An estimate of the time to battery charge depletion under the present load
|
||||||
|
conditions if the utility power is off and remains off, or if it were to be
|
||||||
|
lost and remain off. - 1.3.6.1.2.1.33.1.2.3
|
||||||
|
- name: upsInputVoltage
|
||||||
|
oid: 1.3.6.1.2.1.33.1.3.3.1.3
|
||||||
|
type: gauge
|
||||||
|
help: The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3
|
||||||
|
indexes:
|
||||||
|
- labelname: upsInputLineIndex
|
||||||
|
type: gauge
|
||||||
|
- name: upsOutputSource
|
||||||
|
oid: 1.3.6.1.2.1.33.1.4.1
|
||||||
|
type: gauge
|
||||||
|
help: The present source of output power - 1.3.6.1.2.1.33.1.4.1
|
||||||
|
- name: upsOutputVoltage
|
||||||
|
oid: 1.3.6.1.2.1.33.1.4.4.1.2
|
||||||
|
type: gauge
|
||||||
|
help: The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2
|
||||||
|
indexes:
|
||||||
|
- labelname: upsOutputLineIndex
|
||||||
|
type: gauge
|
||||||
|
- name: upsOutputPower
|
||||||
|
oid: 1.3.6.1.2.1.33.1.4.4.1.4
|
||||||
|
type: gauge
|
||||||
|
help: The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4
|
||||||
|
indexes:
|
||||||
|
- labelname: upsOutputLineIndex
|
||||||
|
type: gauge
|
||||||
|
- name: upsOutputPercentLoad
|
||||||
|
oid: 1.3.6.1.2.1.33.1.4.4.1.5
|
||||||
|
type: gauge
|
||||||
|
help: The percentage of the UPS power capacity presently being used on this output
|
||||||
|
line, i.e., the greater of the percent load of true power capacity and the percent
|
||||||
|
load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5
|
||||||
|
indexes:
|
||||||
|
- labelname: upsOutputLineIndex
|
||||||
|
type: gauge
|
||||||
|
- name: xupsEnvRemoteTemp
|
||||||
|
oid: 1.3.6.1.4.1.534.1.6.5
|
||||||
|
type: gauge
|
||||||
|
help: The reading of an EMP's temperature sensor. - 1.3.6.1.4.1.534.1.6.5
|
||||||
|
- name: xupsEnvRemoteHumidity
|
||||||
|
oid: 1.3.6.1.4.1.534.1.6.6
|
||||||
|
type: gauge
|
||||||
|
help: The reading of an EMP's humidity sensor. - 1.3.6.1.4.1.534.1.6.6
|
||||||
|
version: 1
|
||||||
|
auth:
|
||||||
|
community: public
|
||||||
|
|
||||||
|
|
||||||
procurve_switch:
|
procurve_switch:
|
||||||
walk:
|
walk:
|
||||||
- 1.3.6.1.2.1.31.1.1.1.10
|
- 1.3.6.1.2.1.31.1.1.1.10
|
||||||
|
|
5
test.sh
5
test.sh
|
@ -1,5 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
for ip in `cat hosts|grep pacaterie.adm.auro.re`; do
|
|
||||||
ssh-copy-id $ip
|
|
||||||
done
|
|
||||||
|
|
Loading…
Reference in a new issue