WIP: prometheus-ipmi-exporter #63

Draft
pz2891 wants to merge 17 commits from prometheus-ipmi-exporter into master
11 changed files with 164 additions and 11 deletions

33
hosts
View file

@ -10,6 +10,8 @@
viviane.adm.auro.re viviane.adm.auro.re
[aurore_server]
[aurore_pve] [aurore_pve]
merlin.adm.auro.re merlin.adm.auro.re
@ -41,21 +43,20 @@ litl.adm.auro.re
log.adm.auro.re log.adm.auro.re
[aurore_testing_vm] [aurore_testing_vm]
pendragon.adm.auro.re
############################################################################### ###############################################################################
# OVH # OVH
[ovh_server]
[ovh_pve] [ovh_pve]
horus.adm.auro.re horus.adm.auro.re
[ovh_container] [ovh_container]
synapse.adm.auro.re synapse.adm.auro.re
phabricator.adm.auro.re wikijs.adm.auro.re
wiki.adm.auro.re
www.adm.auro.re www.adm.auro.re
proxy-ovh.adm.auro.re proxy-ovh.adm.auro.re
matrix-services.adm.auro.re
[ovh_vm] [ovh_vm]
serge.adm.auro.re serge.adm.auro.re
@ -192,6 +193,8 @@ fl-4-2.borne.auro.re
############################################################################### ###############################################################################
# Pacaterie # Pacaterie
[pacaterie_server]
[pacaterie_pve] [pacaterie_pve]
mordred.adm.auro.re mordred.adm.auro.re
titan.adm.auro.re titan.adm.auro.re
@ -270,6 +273,7 @@ ee-2-1.borne.auro.re
ee-2-2.borne.auro.re ee-2-2.borne.auro.re
eo-0-1.borne.auro.re eo-0-1.borne.auro.re
eo-2-1.borne.auro.re eo-2-1.borne.auro.re
eo-3-1.borne.auro.re
ep-0-1.borne.auro.re ep-0-1.borne.auro.re
ep-1-1.borne.auro.re ep-1-1.borne.auro.re
ep-1-2.borne.auro.re ep-1-2.borne.auro.re
@ -279,6 +283,8 @@ ep-1-3.borne.auro.re
############################################################################### ###############################################################################
# George Sand # George Sand
[gs_server]
[gs_pve] [gs_pve]
lancelot.adm.auro.re lancelot.adm.auro.re
odin.adm.auro.re odin.adm.auro.re
@ -323,7 +329,6 @@ gd-1-2.borne.auro.re
gd-2-1.borne.auro.re gd-2-1.borne.auro.re
gd-3-1.borne.auro.re gd-3-1.borne.auro.re
gd-4-1.borne.auro.re gd-4-1.borne.auro.re
gd-4-2.borne.auro.re
gd-5-1.borne.auro.re gd-5-1.borne.auro.re
gd-5-2.borne.auro.re gd-5-2.borne.auro.re
gd-garage-1.borne.auro.re gd-garage-1.borne.auro.re
@ -340,7 +345,6 @@ gf-0-1.borne.auro.re
gf-1-1.borne.auro.re gf-1-1.borne.auro.re
gf-2-1.borne.auro.re gf-2-1.borne.auro.re
gf-3-1.borne.auro.re gf-3-1.borne.auro.re
gf-3-2.borne.auro.re
gf-4-1.borne.auro.re gf-4-1.borne.auro.re
gf-5-1.borne.auro.re gf-5-1.borne.auro.re
gg-5-1.borne.auro.re gg-5-1.borne.auro.re
@ -349,6 +353,9 @@ gh-1-2.borne.auro.re
############################################################################### ###############################################################################
# Les Rives # Les Rives
[rives_server]
[rives_pve] [rives_pve]
thor.adm.auro.re thor.adm.auro.re
loki.adm.auro.re loki.adm.auro.re
@ -367,6 +374,8 @@ radius-rives.adm.auro.re
routeur-rives.adm.auro.re routeur-rives.adm.auro.re
[rives_unifi] [rives_unifi]
r1-0-1.borne.auro.re
r1-0-2.borne.auro.re
r1-1-1.borne.auro.re r1-1-1.borne.auro.re
r1-1-2.borne.auro.re r1-1-2.borne.auro.re
r1-1-3.borne.auro.re r1-1-3.borne.auro.re
@ -383,6 +392,7 @@ r1-3-3.borne.auro.re
r1-3-4.borne.auro.re r1-3-4.borne.auro.re
r1-3-5.borne.auro.re r1-3-5.borne.auro.re
r1-3-6.borne.auro.re r1-3-6.borne.auro.re
r2-0-1.borne.auro.re
r2-1-1.borne.auro.re r2-1-1.borne.auro.re
r2-1-2.borne.auro.re r2-1-2.borne.auro.re
r2-1-3.borne.auro.re r2-1-3.borne.auro.re
@ -430,11 +440,14 @@ r3-4-8.borne.auro.re
# -aurore services # -aurore services
[aurore:children] [aurore:children]
aurore_server
aurore_pve
aurore_vm aurore_vm
# everything at ovh # everything at ovh
[ovh:children] [ovh:children]
ovh_server
ovh_pve ovh_pve
ovh_container ovh_container
ovh_vm ovh_vm
@ -448,6 +461,7 @@ fleming_unifi
# everything at pacaterie # everything at pacaterie
[pacaterie:children] [pacaterie:children]
pacaterie_server
pacaterie_pve pacaterie_pve
pacaterie_vm pacaterie_vm
pacaterie_unifi pacaterie_unifi
@ -461,12 +475,14 @@ edc_unifi
# everything at georgesand # everything at georgesand
[gs:children] [gs:children]
gs_server
gs_pve gs_pve
gs_vm gs_vm
gs_unifi gs_unifi
# everything at Les Rives # everything at Les Rives
[rives:children] [rives:children]
rives_server
rives_pve rives_pve
rives_vm rives_vm
rives_unifi rives_unifi
@ -489,6 +505,11 @@ rives_vm
# every server # every server
[server:children] [server:children]
ovh_server
aurore_server
pacaterie_server
gs_server
rives_server
fleming_server fleming_server
edc_server edc_server

View file

@ -14,6 +14,9 @@
{{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }} {{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }}
prometheus_unifi_snmp_targets: prometheus_unifi_snmp_targets:
- targets: "{{ groups['fleming_unifi'] | list | sort }}" - targets: "{{ groups['fleming_unifi'] | list | sort }}"
# IPMI scrape targets: hosts of the `fleming_pve` and `fleming_server` groups.
# The sum is parenthesised because Jinja2 filters bind tighter than `+`;
# without the parentheses only the second group goes through `list | sort`.
prometheus_ipmi_targets:
- targets: |
{{ (groups['fleming_pve'] + groups['fleming_server']) | list | sort }}
roles: roles:
- prometheus - prometheus
@ -34,6 +37,9 @@
prometheus_ups_snmp_targets: prometheus_ups_snmp_targets:
- ups-pn-1.ups.auro.re - ups-pn-1.ups.auro.re
- ups-ps-1.ups.auro.re - ups-ps-1.ups.auro.re
# IPMI scrape targets: hosts of the `pacaterie_pve` and `pacaterie_server` groups.
# The sum is parenthesised because Jinja2 filters bind tighter than `+`;
# without the parentheses only the second group goes through `list | sort`.
prometheus_ipmi_targets:
- targets: |
{{ (groups['pacaterie_pve'] + groups['pacaterie_server']) | list | sort }}
roles: roles:
- prometheus - prometheus
@ -54,6 +60,9 @@
prometheus_targets: prometheus_targets:
- targets: | - targets: |
{{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }} {{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }}
# IPMI scrape targets: hosts of the `edc_pve` and `edc_server` groups.
# The sum is parenthesised because Jinja2 filters bind tighter than `+`;
# without the parentheses only the second group goes through `list | sort`.
prometheus_ipmi_targets:
- targets: |
{{ (groups['edc_pve'] + groups['edc_server']) | list | sort }}
prometheus_unifi_snmp_targets: prometheus_unifi_snmp_targets:
- targets: "{{ groups['edc_unifi'] | list | sort }}" - targets: "{{ groups['edc_unifi'] | list | sort }}"
roles: roles:
@ -77,6 +86,9 @@
- ups-gk-1.ups.auro.re - ups-gk-1.ups.auro.re
prometheus_pdu_snmp_targets: prometheus_pdu_snmp_targets:
- pdu-ga-1.ups.auro.re - pdu-ga-1.ups.auro.re
# IPMI scrape targets: hosts of the `gs_pve` and `gs_server` groups.
# The sum is parenthesised because Jinja2 filters bind tighter than `+`;
# without the parentheses only the second group goes through `list | sort`.
prometheus_ipmi_targets:
- targets: |
{{ (groups['gs_pve'] + groups['gs_server']) | list | sort }}
roles: roles:
- prometheus - prometheus
@ -98,6 +110,9 @@
{{ groups['rives_pve'] + groups['rives_vm'] | list | sort }} {{ groups['rives_pve'] + groups['rives_vm'] | list | sort }}
prometheus_unifi_snmp_targets: prometheus_unifi_snmp_targets:
- targets: "{{ groups['rives_unifi'] | list | sort }}" - targets: "{{ groups['rives_unifi'] | list | sort }}"
# IPMI scrape targets: hosts of the `rives_pve` and `rives_server` groups.
# The sum is parenthesised because Jinja2 filters bind tighter than `+`;
# without the parentheses only the second group goes through `list | sort`.
prometheus_ipmi_targets:
- targets: |
{{ (groups['rives_pve'] + groups['rives_server']) | list | sort }}
roles: roles:
- prometheus - prometheus
@ -113,6 +128,9 @@
prometheus_targets: prometheus_targets:
- targets: | - targets: |
{{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }} {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
# IPMI scrape targets: hosts of the `aurore_pve` and `aurore_server` groups.
# The sum is parenthesised because Jinja2 filters bind tighter than `+`;
# without the parentheses only the second group goes through `list | sort`.
prometheus_ipmi_targets:
- targets: |
{{ (groups['aurore_pve'] + groups['aurore_server']) | list | sort }}
prometheus_postgres_targets: prometheus_postgres_targets:
- targets: | - targets: |
{{ groups['bdd'] + groups['radius'] | list | sort }} {{ groups['bdd'] + groups['radius'] | list | sort }}
@ -146,7 +164,7 @@
# Prometheus targets.json # Prometheus targets.json
prometheus_targets: prometheus_targets:
- targets: | - targets: |
{{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} {{ groups['ovh_pve'] + groups['ovh_vm'] + groups['ovh_container'] | list | sort }}
prometheus_postgres_targets: prometheus_postgres_targets:
- targets: - targets:
- bdd-ovh.adm.auro.re - bdd-ovh.adm.auro.re
@ -180,8 +198,13 @@
roles: roles:
- prometheus_postgres - prometheus_postgres
# IPMI exporters: apply the prometheus_ipmi role to every host matched by the
# `server` and `pve` inventory groups.
- hosts: server,pve
roles:
- prometheus_ipmi
# Monitor all hosts # Monitor all hosts
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container - hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm
roles: roles:
- prometheus_node - prometheus_node

View file

@ -113,6 +113,13 @@
mode: 0644 mode: 0644
when: prometheus_pdu_snmp_targets is defined when: prometheus_pdu_snmp_targets is defined
# Render prometheus_ipmi_targets as pretty-printed JSON into
# /etc/prometheus/targets_ipmi.json. The task is skipped for hosts where
# prometheus_ipmi_targets is not defined.
- name: Configure Prometheus ipmi targets monitoring
copy:
content: "{{ prometheus_ipmi_targets | to_nice_json }}\n"
dest: /etc/prometheus/targets_ipmi.json
mode: 0644
when: prometheus_ipmi_targets is defined
- name: Activate prometheus service - name: Activate prometheus service
systemd: systemd:
name: prometheus name: prometheus

View file

@ -112,7 +112,7 @@ groups:
rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) /
rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m])
) * 100 ) * 100
> 7 > 20
for: 0m for: 0m
labels: labels:
severity: warning severity: warning

View file

@ -140,4 +140,22 @@ scrape_configs:
- target_label: __address__ - target_label: __address__
replacement: 127.0.0.1:9116 replacement: 127.0.0.1:9116
# Scrape job for the IPMI exporters. Targets are read from the file-based
# service discovery file /etc/prometheus/targets_ipmi.json.
- job_name: ipmi
file_sd_configs:
- files:
- '/etc/prometheus/targets_ipmi.json'
metrics_path: /metrics
params:
module: [default]
relabel_configs:
# The targets file lists bare host names. The three rules below copy the
# discovered address into the `target` URL parameter, reuse it as the
# `instance` label (so the label carries no port), and finally rewrite the
# scrape address to that host followed by the exporter port :9290.
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:9290'
... ...

View file

@ -11,7 +11,7 @@ groups:
rules: rules:
- alert: InstanceDown - alert: InstanceDown
expr: up{instance!~".*.borne.auro.re$"} == 0 expr: up{instance!~".*.borne.auro.re$", job="servers"} == 0
for: 3m for: 3m
labels: labels:
severity: critical severity: critical
@ -50,7 +50,7 @@ groups:
node_memory_SwapFree_bytes node_memory_SwapFree_bytes
/ node_memory_SwapTotal_bytes / node_memory_SwapTotal_bytes
) )
) * 100 >= 20 ) * 100 >= 50
for: 3m for: 3m
labels: labels:
severity: warning severity: warning

View file

@ -39,4 +39,5 @@ scrape_configs:
- '{job="apc_pdu_snmp"}' - '{job="apc_pdu_snmp"}'
- '{job="docker"}' - '{job="docker"}'
- '{job="switch_snmp"}' - '{job="switch_snmp"}'
- '{job="ipmi"}'
... ...

View file

@ -0,0 +1,3 @@
#!/bin/sh
# Wrapper that runs, through sudo, the /usr/sbin binary that has the same name
# this script was invoked under, forwarding every argument unchanged.
#
# ${0##*/} strips the directory part of $0 without spawning `basename`, and the
# quoting keeps a path containing spaces or glob characters intact.
# `exec` replaces this shell with sudo so no intermediate process is left
# between the caller and the command; the exit status is that of sudo.
exec sudo "/usr/sbin/${0##*/}" "$@"

View file

@ -0,0 +1,9 @@
# Allow the `prometheus` user to run the IPMI command-line tools listed below
# through sudo without being asked for a password. Only these exact
# /usr/sbin paths are permitted.
# NOTE(review): ipmi-raw presumably lets the caller send arbitrary raw IPMI
# requests; confirm it is really needed by the collectors that are enabled.
prometheus ALL = NOPASSWD: /usr/sbin/ipmimonitoring,\
/usr/sbin/ipmi-sensors,\
/usr/sbin/ipmi-dcmi,\
/usr/sbin/ipmi-raw,\
/usr/sbin/bmc-info,\
/usr/sbin/ipmi-chassis,\
/usr/sbin/ipmi-sel

View file

@ -0,0 +1,6 @@
---
# Handler notified whenever the exporter's configuration, sudoers rule or
# command wrappers change.
#
# The systemd module is used explicitly: `daemon_reload` is an option of that
# module, not of the generic `service` module.
- name: Restart prometheus-ipmi-exporter
  systemd:
    name: prometheus-ipmi-exporter
    state: restarted
    daemon_reload: true

View file

@ -0,0 +1,65 @@
---
# Install the Prometheus IPMI exporter and configure it to collect from the
# local machine, running the IPMI command-line tools through sudo wrappers.

# The apt call is retried up to three times until it succeeds.
- name: Install Prometheus ipmi-exporter
  apt:
    update_cache: true
    name: prometheus-ipmi-exporter
  register: apt_result
  retries: 3
  until: apt_result is succeeded

# `>-` (folded, stripped) yields the line without a trailing newline. A literal
# `|` scalar would append one, so lineinfile would never find the file
# matching the requested line and would report a change on every run.
- name: Make Prometheus ipmi-exporter scrap on localhost only
  lineinfile:
    path: /etc/default/prometheus-ipmi-exporter
    regexp: '^ARGS='
    line: >-
      ARGS="--config.file=/etc/prometheus/ipmi_local.yml --freeipmi.path='/etc/prometheus/ipmi_conf/'"
  notify: Restart prometheus-ipmi-exporter

# The file is syntax-checked with visudo before it is moved into place, so a
# malformed rule can never end up in /etc/sudoers.d and break sudo.
- name: Define prometheus to be sudo for some command
  copy:
    src: files/prometheus
    dest: /etc/sudoers.d/prometheus
    owner: root
    group: root
    mode: u=r,g=r,o=
    validate: /usr/sbin/visudo -cf %s
  notify: Restart prometheus-ipmi-exporter

# Copies the ipmi_conf directory (containing the ipmi.sh wrapper) below
# /etc/prometheus.
- name: Override commands launchs by prometheus for ipmi
  copy:
    src: files/ipmi_conf
    dest: /etc/prometheus
    owner: prometheus
    group: prometheus
    mode: u=rx,g=r,o=
  notify: Restart prometheus-ipmi-exporter

# One symlink per IPMI command name, all pointing at the ipmi.sh wrapper.
# `follow: false` applies owner/group to the link itself. No `mode` is set
# here: with the default `follow` it would be applied to ipmi.sh and fight
# with the mode set by the copy task above, making both tasks report a change
# on every run.
- name: Link IPMI command names to the ipmi.sh wrapper
  file:
    src: '/etc/prometheus/ipmi_conf/ipmi.sh'
    dest: '/etc/prometheus/ipmi_conf/{{ item }}'
    state: link
    force: true
    follow: false
    owner: prometheus
    group: prometheus
  loop:
    - bmc-info
    - ipmi-chassis
    - ipmi-dcmi
    - ipmimonitoring
    - ipmi-raw
    - ipmi-sensors
    - ipmi-sel
  notify: Restart prometheus-ipmi-exporter

- name: Activate prometheus-ipmi-exporter service
  systemd:
    name: prometheus-ipmi-exporter
    enabled: true
    state: started

# Run the pending "Restart prometheus-ipmi-exporter" handler now, so the
# service is restarted only when one of the tasks above actually changed
# something, instead of unconditionally on every run.
- name: Apply pending prometheus-ipmi-exporter restart
  meta: flush_handlers