monitoring_pdu #3
7 changed files with 212 additions and 11 deletions
14
hosts
14
hosts
|
@ -29,13 +29,15 @@ stream.adm.auro.re
|
|||
re2o-server.adm.auro.re
|
||||
re2o-ldap.adm.auro.re
|
||||
re2o-db.adm.auro.re
|
||||
pendragon.adm.auro.re
|
||||
services-bdd-local.adm.auro.re
|
||||
backup.adm.auro.re
|
||||
services-web.adm.auro.re
|
||||
mail.adm.auro.re
|
||||
wikijs.adm.auro.re
|
||||
prometheus-aurore.adm.auro.re
|
||||
|
||||
[aurore_testing_vm]
|
||||
pendragon.adm.auro.re
|
||||
|
||||
###############################################################################
|
||||
# OVH
|
||||
|
@ -337,6 +339,7 @@ gf-5-1.borne.auro.re
|
|||
# Les Rives
|
||||
[rives_pve]
|
||||
thor.adm.auro.re
|
||||
loki.adm.auro.re
|
||||
|
||||
[rives_vm]
|
||||
dhcp-rives-backup.adm.auro.re
|
||||
|
@ -345,6 +348,7 @@ dns-rives-backup.adm.auro.re
|
|||
radius-rives-backup.adm.auro.re
|
||||
routeur-rives-backup.adm.auro.re
|
||||
ldap-replica-rives.adm.auro.re
|
||||
prometheus-rives.adm.auro.re
|
||||
|
||||
[rives_unifi]
|
||||
r3-4-4.borne.auro.re
|
||||
|
@ -396,29 +400,31 @@ ovh_vm
|
|||
[fleming:children]
|
||||
fleming_pve
|
||||
fleming_vm
|
||||
#fleming_unifi
|
||||
fleming_unifi
|
||||
|
||||
# everything at pacaterie
|
||||
[pacaterie:children]
|
||||
pacaterie_pve
|
||||
pacaterie_vm
|
||||
#pacaterie_unifi
|
||||
pacaterie_unifi
|
||||
|
||||
# everything at edc
|
||||
[edc:children]
|
||||
edc_pve
|
||||
edc_vm
|
||||
edc_unifi
|
||||
|
||||
# everything at georgesand
|
||||
[gs:children]
|
||||
gs_pve
|
||||
gs_vm
|
||||
gs_unifi
|
||||
|
||||
# everything at Les Rives
|
||||
[rives:children]
|
||||
rives_pve
|
||||
rives_vm
|
||||
|
||||
rives_unifi
|
||||
|
||||
###############################################################################
|
||||
# Groups by type
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
roles:
|
||||
- prometheus
|
||||
|
||||
- hosts: prometheus-pacaterie.adm.auro.re,prometheus-pacaterie-fo.adm.auro.re
|
||||
- hosts: prometheus-pacaterie.adm.auro.re
|
||||
vars:
|
||||
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
||||
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
||||
|
@ -25,6 +25,8 @@
|
|||
{{ groups['pacaterie_pve'] + groups['pacaterie_vm'] | list | sort }}
|
||||
prometheus_unifi_snmp_targets:
|
||||
- targets: "{{ groups['pacaterie_unifi'] | list | sort }}"
|
||||
prometheus_ups_snmp_targets:
|
||||
- ups-pn-1.ups.auro.re
|
||||
roles:
|
||||
- prometheus
|
||||
|
||||
|
@ -34,6 +36,9 @@
|
|||
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
||||
|
||||
# Prometheus targets.json
|
||||
prometheus_ups_snmp_targets:
|
||||
- ups-ec-1.ups.auro.re
|
||||
|
||||
prometheus_targets:
|
||||
- targets: |
|
||||
{{ groups['edc_pve'] + groups['edc_vm'] | list | sort }}
|
||||
|
@ -53,10 +58,42 @@
|
|||
{{ groups['gs_pve'] + groups['gs_vm'] | list | sort }}
|
||||
prometheus_unifi_snmp_targets:
|
||||
- targets: "{{ groups['gs_unifi'] | list | sort }}"
|
||||
prometheus_ups_snmp_targets:
|
||||
- ups-gk-1.ups.auro.re
|
||||
roles:
|
||||
- prometheus
|
||||
|
||||
- hosts: prometheus-rives.adm.auro.re
|
||||
vars:
|
||||
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
||||
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
||||
|
||||
# Prometheus targets.json
|
||||
prometheus_ups_snmp_targets:
|
||||
- ups-r3-1.ups.auro.re
|
||||
|
||||
prometheus_targets:
|
||||
- targets: |
|
||||
{{ groups['rives_pve'] + groups['rives_vm'] | list | sort }}
|
||||
prometheus_unifi_snmp_targets:
|
||||
- targets: "{{ groups['rives_unifi'] | list | sort }}"
|
||||
roles:
|
||||
- prometheus
|
||||
|
||||
- hosts: prometheus-aurore.adm.auro.re
|
||||
vars:
|
||||
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
|
||||
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
|
||||
|
||||
# Prometheus targets.json
|
||||
prometheus_targets:
|
||||
- targets: |
|
||||
{{ groups['aurore_pve'] + groups['aurore_vm'] + groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
|
||||
roles:
|
||||
- prometheus
|
||||
|
||||
|
||||
# Monitor all hosts
|
||||
- hosts: all,!unifi,!ovh
|
||||
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container
|
||||
roles:
|
||||
- prometheus_node
|
||||
|
|
|
@ -55,6 +55,14 @@
|
|||
content: "{{ prometheus_unifi_snmp_targets | to_nice_json }}"
|
||||
dest: /etc/prometheus/targets_unifi_snmp.json
|
||||
mode: 0644
|
||||
when: prometheus_unifi_snmp_targets is defined
|
||||
|
||||
- name: Configure Prometheus UPS SNMP devices
|
||||
copy:
|
||||
content: "{{ [{'targets': prometheus_ups_snmp_targets }] | to_nice_json }}\n"
|
||||
dest: /etc/prometheus/targets_ups_snmp.json
|
||||
mode: 0644
|
||||
when: prometheus_ups_snmp_targets is defined
|
||||
|
||||
- name: Activate prometheus service
|
||||
systemd:
|
||||
|
|
|
@ -59,4 +59,71 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
|
||||
|
||||
# Check UPS
|
||||
- alert: UpsOutputSourceChanged
|
||||
expr: upsOutputSource != 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "La source d'alimentation de {{ $labels.instance }} a changé !"
|
||||
|
||||
- alert: UpsBatteryStatusWarning
|
||||
expr: upsBatteryStatus == 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "L'état de la batterie de {{ $labels.instance }} est faible !"
|
||||
|
||||
- alert: UpsBatteryStatusCritical
|
||||
expr: upsBatteryStatus == 4
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "L'état de la batterie de {{ $labels.instance }} est affaibli !"
|
||||
|
||||
- alert: UpsHighLoad
|
||||
expr: upsOutputPercentLoad > 70
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "La charge de {{ $labels.instance }} est de {{ $value }}% !"
|
||||
|
||||
- alert: UpsWrongInputVoltage
|
||||
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "La tension d'entrée de {{ $labels.instance }} est de {{ $value }}V."
|
||||
|
||||
- alert: UpsWrongOutputVoltage
|
||||
expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "La tension de sortie de {{ $labels.instance }} est de {{ $value }}V."
|
||||
|
||||
- alert: UpsTimeRemainingWarning
|
||||
expr: upsEstimatedMinutesRemaining < 15
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
|
||||
|
||||
- alert: UpsTimeRemainingCritical
|
||||
expr: upsEstimatedMinutesRemaining < 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "L'autonomie restante sur {{ $labels.instance }} est de {{ $value }} min."
|
||||
|
||||
|
||||
{% endraw %}
|
||||
|
|
|
@ -65,3 +65,19 @@ scrape_configs:
|
|||
scheme: https
|
||||
static_configs:
|
||||
- targets: []
|
||||
|
||||
- job_name: ups_snmp
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- '/etc/prometheus/targets_ups_snmp.json'
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [eatonups]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: 127.0.0.1:9116
|
||||
|
||||
|
|
|
@ -6,6 +6,78 @@
|
|||
# - Optimiser les règles pour les bornes Unifi,
|
||||
# on pourrait indexer avec les SSID
|
||||
|
||||
eatonups:
|
||||
walk:
|
||||
- 1.3.6.1.2.1.33.1.2
|
||||
- 1.3.6.1.2.1.33.1.3
|
||||
- 1.3.6.1.2.1.33.1.4
|
||||
- 1.3.6.1.4.1.534.1.6
|
||||
get:
|
||||
- 1.3.6.1.2.1.1.3.0
|
||||
metrics:
|
||||
- name: sysUpTime
|
||||
oid: 1.3.6.1.2.1.1.3
|
||||
type: gauge
|
||||
help: The time (in hundredths of a second) since the network management portion
|
||||
of the system was last re-initialized. - 1.3.6.1.2.1.1.3
|
||||
- name: upsBatteryStatus
|
||||
oid: 1.3.6.1.2.1.33.1.2.1
|
||||
type: gauge
|
||||
help: The indication of the capacity remaining in the UPS system's batteries -
|
||||
1.3.6.1.2.1.33.1.2.1
|
||||
- name: upsEstimatedMinutesRemaining
|
||||
oid: 1.3.6.1.2.1.33.1.2.3
|
||||
type: gauge
|
||||
help: An estimate of the time to battery charge depletion under the present load
|
||||
conditions if the utility power is off and remains off, or if it were to be
|
||||
lost and remain off. - 1.3.6.1.2.1.33.1.2.3
|
||||
- name: upsInputVoltage
|
||||
oid: 1.3.6.1.2.1.33.1.3.3.1.3
|
||||
type: gauge
|
||||
help: The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3
|
||||
indexes:
|
||||
- labelname: upsInputLineIndex
|
||||
type: gauge
|
||||
- name: upsOutputSource
|
||||
oid: 1.3.6.1.2.1.33.1.4.1
|
||||
type: gauge
|
||||
help: The present source of output power - 1.3.6.1.2.1.33.1.4.1
|
||||
- name: upsOutputVoltage
|
||||
oid: 1.3.6.1.2.1.33.1.4.4.1.2
|
||||
type: gauge
|
||||
help: The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2
|
||||
indexes:
|
||||
- labelname: upsOutputLineIndex
|
||||
type: gauge
|
||||
- name: upsOutputPower
|
||||
oid: 1.3.6.1.2.1.33.1.4.4.1.4
|
||||
type: gauge
|
||||
help: The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4
|
||||
indexes:
|
||||
- labelname: upsOutputLineIndex
|
||||
type: gauge
|
||||
- name: upsOutputPercentLoad
|
||||
oid: 1.3.6.1.2.1.33.1.4.4.1.5
|
||||
type: gauge
|
||||
help: The percentage of the UPS power capacity presently being used on this output
|
||||
line, i.e., the greater of the percent load of true power capacity and the percent
|
||||
load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5
|
||||
indexes:
|
||||
- labelname: upsOutputLineIndex
|
||||
type: gauge
|
||||
- name: xupsEnvRemoteTemp
|
||||
oid: 1.3.6.1.4.1.534.1.6.5
|
||||
type: gauge
|
||||
help: The reading of an EMP's temperature sensor. - 1.3.6.1.4.1.534.1.6.5
|
||||
- name: xupsEnvRemoteHumidity
|
||||
oid: 1.3.6.1.4.1.534.1.6.6
|
||||
type: gauge
|
||||
help: The reading of an EMP's humidity sensor. - 1.3.6.1.4.1.534.1.6.6
|
||||
version: 1
|
||||
auth:
|
||||
community: public
|
||||
|
||||
|
||||
procurve_switch:
|
||||
walk:
|
||||
- 1.3.6.1.2.1.31.1.1.1.10
|
||||
|
|
5
test.sh
5
test.sh
|
@ -1,5 +0,0 @@
|
|||
#!/bin/bash
|
||||
for ip in `cat hosts|grep pacaterie.adm.auro.re`; do
|
||||
ssh-copy-id $ip
|
||||
done
|
||||
|
Loading…
Reference in a new issue