# Patch overview (first part of the change set):
# - hosts: adds empty per-site "*_server" groups (aurore, ovh, pacaterie, gs,
#   rives), lists them in the site parent groups and in [server:children],
#   and adds/removes individual hosts.
# - monitoring.yml: adds prometheus_ipmi_targets (the site's pve + server
#   groups) to each Prometheus play that has a matching server group, adds
#   ovh_container to the OVH prometheus_targets, adds a play applying the
#   prometheus_ipmi role to "server,pve", and no longer excludes
#   ovh_container from the prometheus_node play.
# - roles/prometheus: writes /etc/prometheus/targets_ipmi.json when
#   prometheus_ipmi_targets is defined, adds an "ipmi" scrape job that
#   rewrites the scrape address to <target>:9290, and raises the PostgreSQL
#   rollback/commit ratio alert threshold from 7 to 20.
# NOTE(review): line breaks and indentation inside the hunks appear to have
# been collapsed to single spaces in this copy; restore them from the original
# commit before trying to apply the patch.
diff --git a/hosts b/hosts index 5b24998..2185284 100644 --- a/hosts +++ b/hosts @@ -10,6 +10,8 @@ viviane.adm.auro.re +[aurore_server] + [aurore_pve] merlin.adm.auro.re @@ -41,21 +43,20 @@ litl.adm.auro.re log.adm.auro.re [aurore_testing_vm] -pendragon.adm.auro.re ############################################################################### # OVH +[ovh_server] + [ovh_pve] horus.adm.auro.re [ovh_container] synapse.adm.auro.re -phabricator.adm.auro.re -wiki.adm.auro.re +wikijs.adm.auro.re www.adm.auro.re proxy-ovh.adm.auro.re -matrix-services.adm.auro.re [ovh_vm] serge.adm.auro.re @@ -192,6 +193,8 @@ fl-4-2.borne.auro.re ############################################################################### # Pacaterie +[pacaterie_server] + [pacaterie_pve] mordred.adm.auro.re titan.adm.auro.re @@ -270,6 +273,7 @@ ee-2-1.borne.auro.re ee-2-2.borne.auro.re eo-0-1.borne.auro.re eo-2-1.borne.auro.re +eo-3-1.borne.auro.re ep-0-1.borne.auro.re ep-1-1.borne.auro.re ep-1-2.borne.auro.re @@ -279,6 +283,8 @@ ep-1-3.borne.auro.re ############################################################################### # George Sand +[gs_server] + [gs_pve] lancelot.adm.auro.re odin.adm.auro.re @@ -323,7 +329,6 @@ gd-1-2.borne.auro.re gd-2-1.borne.auro.re gd-3-1.borne.auro.re gd-4-1.borne.auro.re -gd-4-2.borne.auro.re gd-5-1.borne.auro.re gd-5-2.borne.auro.re gd-garage-1.borne.auro.re @@ -340,7 +345,6 @@ gf-0-1.borne.auro.re gf-1-1.borne.auro.re gf-2-1.borne.auro.re gf-3-1.borne.auro.re -gf-3-2.borne.auro.re gf-4-1.borne.auro.re gf-5-1.borne.auro.re gg-5-1.borne.auro.re @@ -349,6 +353,9 @@ gh-1-2.borne.auro.re ############################################################################### # Les Rives + +[rives_server] + [rives_pve] thor.adm.auro.re loki.adm.auro.re @@ -367,6 +374,8 @@ radius-rives.adm.auro.re routeur-rives.adm.auro.re [rives_unifi] +r1-0-1.borne.auro.re +r1-0-2.borne.auro.re r1-1-1.borne.auro.re r1-1-2.borne.auro.re r1-1-3.borne.auro.re @@ -383,6 +392,7 @@ r1-3-3.borne.auro.re 
r1-3-4.borne.auro.re r1-3-5.borne.auro.re r1-3-6.borne.auro.re +r2-0-1.borne.auro.re r2-1-1.borne.auro.re r2-1-2.borne.auro.re r2-1-3.borne.auro.re @@ -430,11 +440,14 @@ r3-4-8.borne.auro.re # -aurore services [aurore:children] +aurore_server +aurore_pve aurore_vm # everything at ovh [ovh:children] +ovh_server ovh_pve ovh_container ovh_vm @@ -448,6 +461,7 @@ fleming_unifi # everything at pacaterie [pacaterie:children] +pacaterie_server pacaterie_pve pacaterie_vm pacaterie_unifi @@ -461,12 +475,14 @@ edc_unifi # everything at georgesand [gs:children] +gs_server gs_pve gs_vm gs_unifi # everything at Les Rives [rives:children] +rives_server rives_pve rives_vm rives_unifi @@ -489,6 +505,11 @@ rives_vm # every server [server:children] +ovh_server +aurore_server +pacaterie_server +gs_server +rives_server fleming_server edc_server diff --git a/monitoring.yml b/monitoring.yml index bb3c92a..b058a45 100755 --- a/monitoring.yml +++ b/monitoring.yml @@ -14,6 +14,9 @@ {{ groups['fleming_pve'] + groups['fleming_vm'] | list | sort }} prometheus_unifi_snmp_targets: - targets: "{{ groups['fleming_unifi'] | list | sort }}" + prometheus_ipmi_targets: + - targets: | + {{ groups['fleming_pve'] + groups['fleming_server'] | list | sort }} roles: - prometheus @@ -34,6 +37,9 @@ prometheus_ups_snmp_targets: - ups-pn-1.ups.auro.re - ups-ps-1.ups.auro.re + prometheus_ipmi_targets: + - targets: | + {{ groups['pacaterie_pve'] + groups['pacaterie_server'] | list | sort }} roles: - prometheus @@ -54,6 +60,9 @@ prometheus_targets: - targets: | {{ groups['edc_pve'] + groups['edc_vm'] + groups['edc_server'] | list | sort }} + prometheus_ipmi_targets: + - targets: | + {{ groups['edc_pve'] + groups['edc_server'] | list | sort }} prometheus_unifi_snmp_targets: - targets: "{{ groups['edc_unifi'] | list | sort }}" roles: @@ -77,6 +86,9 @@ - ups-gk-1.ups.auro.re prometheus_pdu_snmp_targets: - pdu-ga-1.ups.auro.re + prometheus_ipmi_targets: + - targets: | + {{ groups['gs_pve'] + groups['gs_server'] | list 
| sort }} roles: - prometheus @@ -98,6 +110,9 @@ {{ groups['rives_pve'] + groups['rives_vm'] | list | sort }} prometheus_unifi_snmp_targets: - targets: "{{ groups['rives_unifi'] | list | sort }}" + prometheus_ipmi_targets: + - targets: | + {{ groups['rives_pve'] + groups['rives_server'] | list | sort }} roles: - prometheus @@ -113,6 +128,9 @@ prometheus_targets: - targets: | {{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }} + prometheus_ipmi_targets: + - targets: | + {{ groups['aurore_pve'] + groups['aurore_server'] | list | sort }} prometheus_postgres_targets: - targets: | {{ groups['bdd'] + groups['radius'] | list | sort }} @@ -146,7 +164,7 @@ # Prometheus targets.json prometheus_targets: - targets: | - {{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }} + {{ groups['ovh_pve'] + groups['ovh_vm'] + groups['ovh_container'] | list | sort }} prometheus_postgres_targets: - targets: - bdd-ovh.adm.auro.re @@ -180,8 +198,13 @@ roles: - prometheus_postgres +# IPMI Exporters +- hosts: server,pve + roles: + - prometheus_ipmi + # Monitor all hosts -- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container +- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm roles: - prometheus_node diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml index f9e48e8..92df23a 100644 --- a/roles/prometheus/tasks/main.yml +++ b/roles/prometheus/tasks/main.yml @@ -113,6 +113,13 @@ mode: 0644 when: prometheus_pdu_snmp_targets is defined +- name: Configure Prometheus ipmi targets monitoring + copy: + content: "{{ prometheus_ipmi_targets | to_nice_json }}\n" + dest: /etc/prometheus/targets_ipmi.json + mode: 0644 + when: prometheus_ipmi_targets is defined + - name: Activate prometheus service systemd: name: prometheus diff --git a/roles/prometheus/templates/postgres.rules.yml.j2 b/roles/prometheus/templates/postgres.rules.yml.j2 index 0ec4952..aa24537 100644 
--- a/roles/prometheus/templates/postgres.rules.yml.j2 +++ b/roles/prometheus/templates/postgres.rules.yml.j2 @@ -112,7 +112,7 @@ groups: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) ) * 100 - > 7 + > 20 for: 0m labels: severity: warning diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index 4400de3..2402ae6 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -140,4 +140,22 @@ scrape_configs: - target_label: __address__ replacement: 127.0.0.1:9116 + - job_name: ipmi + file_sd_configs: + - files: + - '/etc/prometheus/targets_ipmi.json' + metrics_path: /metrics + params: + module: [default] + relabel_configs: + # Do not put :9290 in instance name, rather here + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - source_labels: [__param_target] + target_label: __address__ + replacement: '$1:9290' + + ... 
diff --git a/roles/prometheus/templates/server.rules.yml.j2 b/roles/prometheus/templates/server.rules.yml.j2
index 5277fdf..1248a18 100644
--- a/roles/prometheus/templates/server.rules.yml.j2
+++ b/roles/prometheus/templates/server.rules.yml.j2
@@ -11,7 +11,7 @@ groups:
   rules:
 
   - alert: InstanceDown
-    expr: up{instance!~".*.borne.auro.re$"} == 0
+    expr: up{instance!~".*.borne.auro.re$", job="servers"} == 0
     for: 3m
     labels:
       severity: critical
@@ -50,7 +50,7 @@ groups:
           node_memory_SwapFree_bytes /
           node_memory_SwapTotal_bytes
         )
-      ) * 100 >= 20
+      ) * 100 >= 50
     for: 3m
     labels:
       severity: warning
diff --git a/roles/prometheus_federate/templates/prometheus.yml.j2 b/roles/prometheus_federate/templates/prometheus.yml.j2
index 4757b98..fc848db 100644
--- a/roles/prometheus_federate/templates/prometheus.yml.j2
+++ b/roles/prometheus_federate/templates/prometheus.yml.j2
@@ -39,4 +39,5 @@ scrape_configs:
         - '{job="apc_pdu_snmp"}'
         - '{job="docker"}'
         - '{job="switch_snmp"}'
+        - '{job="ipmi"}'
 ...
diff --git a/roles/prometheus_ipmi/files/ipmi_conf/ipmi.sh b/roles/prometheus_ipmi/files/ipmi_conf/ipmi.sh
new file mode 100755
--- /dev/null
+++ b/roles/prometheus_ipmi/files/ipmi_conf/ipmi.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+# Wrapper symlinked under each FreeIPMI tool name: re-run the binary of the
+# same name from /usr/sbin through sudo, forwarding all arguments unchanged.
+exec sudo "/usr/sbin/$(basename "$0")" "$@"
diff --git a/roles/prometheus_ipmi/files/prometheus b/roles/prometheus_ipmi/files/prometheus
new file mode 100644
index 0000000..6e69147
--- /dev/null
+++ b/roles/prometheus_ipmi/files/prometheus
@@ -0,0 +1,9 @@
+# Prometheus can be sudo for ipmi collector
+
+prometheus ALL = NOPASSWD: /usr/sbin/ipmimonitoring,\
+        /usr/sbin/ipmi-sensors,\
+        /usr/sbin/ipmi-dcmi,\
+        /usr/sbin/ipmi-raw,\
+        /usr/sbin/bmc-info,\
+        /usr/sbin/ipmi-chassis,\
+        /usr/sbin/ipmi-sel
diff --git a/roles/prometheus_ipmi/handlers/main.yml b/roles/prometheus_ipmi/handlers/main.yml
new file mode 100644
--- /dev/null
+++ b/roles/prometheus_ipmi/handlers/main.yml
@@ -0,0 +1,7 @@
+---
+# daemon_reload is an option of the systemd module, so call that module directly.
+- name: Restart prometheus-ipmi-exporter
+  systemd:
+    name: prometheus-ipmi-exporter
+    state: restarted
+    daemon_reload: true
diff --git a/roles/prometheus_ipmi/tasks/main.yml b/roles/prometheus_ipmi/tasks/main.yml
new file mode 100644
--- /dev/null
+++ b/roles/prometheus_ipmi/tasks/main.yml
@@ -0,0 +1,68 @@
+---
+# Install prometheus-ipmi-exporter and let it run the FreeIPMI tools through
+# the sudo wrappers placed in /etc/prometheus/ipmi_conf.
+- name: Install Prometheus ipmi-exporter
+  apt:
+    update_cache: true
+    name: prometheus-ipmi-exporter
+  register: apt_result
+  retries: 3
+  until: apt_result is succeeded
+
+- name: Make Prometheus ipmi-exporter scrap on localhost only
+  lineinfile:
+    path: /etc/default/prometheus-ipmi-exporter
+    regexp: '^ARGS='
+    # ">-" drops the trailing newline that a "|" scalar would append to the line.
+    line: >-
+      ARGS="--config.file=/etc/prometheus/ipmi_local.yml --freeipmi.path='/etc/prometheus/ipmi_conf/'"
+  notify: Restart prometheus-ipmi-exporter
+
+- name: Define prometheus to be sudo for some command
+  copy:
+    src: files/prometheus
+    dest: /etc/sudoers.d/prometheus
+    owner: root
+    group: root
+    mode: u=r,g=r,o=
+    # Refuse to install a sudoers file that visudo cannot parse.
+    validate: /usr/sbin/visudo -cf %s
+  notify: Restart prometheus-ipmi-exporter
+
+- name: Install the sudo wrapper script for ipmi commands
+  copy:
+    src: files/ipmi_conf
+    dest: /etc/prometheus
+    owner: prometheus
+    group: prometheus
+    mode: u=rx,g=r,o=
+  notify: Restart prometheus-ipmi-exporter
+
+- name: Link each ipmi command name to the wrapper script
+  file:
+    src: '/etc/prometheus/ipmi_conf/ipmi.sh'
+    dest: '/etc/prometheus/ipmi_conf/{{ item }}'
+    state: link
+    force: true
+    owner: prometheus
+    group: prometheus
+    # Keep this equal to the mode of the copy task above: with the file
+    # module's default follow=true it is applied to ipmi.sh itself, so a
+    # different value makes both tasks report "changed" on every run.
+    mode: u=rx,g=r,o=
+  loop:
+    - bmc-info
+    - ipmi-chassis
+    - ipmi-dcmi
+    - ipmimonitoring
+    - ipmi-raw
+    - ipmi-sensors
+    - ipmi-sel
+  notify: Restart prometheus-ipmi-exporter
+
+# Restarts are left to the notified handler so that unchanged runs stay idle.
+- name: Activate prometheus-ipmi-exporter service
+  systemd:
+    name: prometheus-ipmi-exporter
+    enabled: true
+    state: started