Merge crans version

Alexandre Iooss 2019-11-01 14:16:32 +01:00
parent 6c1d904791
commit 5b3ac2a21a
11 changed files with 460 additions and 78 deletions

.gitignore

@@ -1 +1,2 @@
*.retry
__pycache__

@@ -86,16 +86,12 @@ We will use `ProxyJump` instead.
In the SSH configuration:
```
# Use a key to log on all Crans servers
# and use a proxy server
# Use a proxy jump server to log in to all Aurore inventory hosts
Host 10.128.0.* *.adm.auro.re
IdentityFile ~/.ssh/id_rsa
ProxyJump auro.re
```
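
With this configuration, a connection to any machine on the `adm` network hops through the bastion transparently, for example (host taken from the inventory):
```bash
# The ProxyJump through auro.re happens automatically
ssh mordred.adm.auro.re
```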
Note that since Ansible 2.5, persistent connections are created
to the servers and then destroyed at the end of the run.
Your SSH key must be configured on the server you are deploying to.
```bash
ssh-copy-id proxy.adm.auro.re
```
@@ -103,6 +99,8 @@ ssh-copy-id proxy.adm.auro.re
### Running Ansible
You need `python3-netaddr` on your machine.
To test the `base.yml` playbook:
```bash
ansible-playbook --ask-vault-pass base.yml --check
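# For example, restrict the dry run to a single machine (hypothetical choice of host) with --limit:
ansible-playbook --ask-vault-pass base.yml --check --limit mordred.adm.auro.re
```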

hosts

@@ -2,10 +2,9 @@
# How to name your server?
# > We name servers according to location, then type.
# > So all containers at OVH are in ovh-container.
# > Then we regroup everything in global geographic and type groups.
[ovh_pve]
[ovh_physical]
horus.adm.auro.re
[ovh_container]
@@ -27,49 +26,52 @@ vpn-ovh.adm.auro.re
docker-ovh.adm.auro.re
switchs-manager.adm.auro.re
[ovh_testing_vm]
re2o-test.adm.auro.re
[fleming_pve]
[fleming_physical]
freya.adm.auro.re
#odin.adm.auro.re
marki.adm.auro.re
[fleming_vm]
ldap-replica-fleming1.adm.auro.re
#ldap-replica-fleming1.adm.auro.re
#ldap-replica-fleming2.adm.auro.re
dhcp-fleming.adm.auro.re
dns-fleming.adm.auro.re
prometheus-fleming.adm.auro.re
radius-fleming.adm.auro.re
unifi-fleming.adm.auro.re
#dhcp-fleming.adm.auro.re
#dns-fleming.adm.auro.re
#prometheus-fleming.adm.auro.re
#radius-fleming.adm.auro.re
#unifi-fleming.adm.auro.re
[pacaterie_pve]
[pacaterie_physical]
mordred.adm.auro.re
titan.adm.auro.re
[pacaterie_vm]
ldap-replica-pacaterie.adm.auro.re
dhcp-pacaterie.adm.auro.re
dns-pacaterie.adm.auro.re
prometheus-pacaterie.adm.auro.re
radius-pacaterie.adm.auro.re
unifi-pacaterie.adm.auro.re
#ldap-replica-pacaterie.adm.auro.re
#dhcp-pacaterie.adm.auro.re
#dns-pacaterie.adm.auro.re
#prometheus-pacaterie.adm.auro.re
#radius-pacaterie.adm.auro.re
#unifi-pacaterie.adm.auro.re
[edc_pve]
leodagan.adm.auro.re
[edc_physical]
chapalux.adm.auro.re
[georgesand_pve]
merlin.adm.auro.re
[edc_vm]
[georgesand_physical]
perceval.adm.auro.re
[georgesand_vm]
#####################
# Geographic groups #
#####################
# everything at ovh
[ovh:children]
ovh_pve
ovh_physical
ovh_container
ovh_vm
# everything at ovh_testing
[ovh_testing:children]
ovh_testing_vm
# everything at fleming
[fleming:children]
fleming_pve
@@ -77,16 +79,22 @@ fleming_vm
# everything at pacaterie
[pacaterie:children]
pacaterie_pve
pacaterie_physical
pacaterie_vm
# everything at edc
[edc:children]
edc_pve
edc_physical
edc_vm
# everything at georgesand
[georgesand:children]
georgesand_pve
georgesand_physical
georgesand_vm
#####################
# Type groups       #
#####################
# every LXC container
[container:children]
@@ -97,11 +105,23 @@ ovh_container
ovh_vm
fleming_vm
pacaterie_vm
edc_vm
georgesand_vm
# every PVE
[pve:children]
ovh_pve
fleming_pve
pacaterie_pve
edc_pve
georgesand_pve
# every physical
[physical:children]
ovh_physical
fleming_physical
pacaterie_physical
edc_physical
georgesand_physical
# every server (except access points)
[server:children]
container
physical
vm
[all:vars]
# Force remote to use Python 3
ansible_python_interpreter=/usr/bin/python3
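
As a quick sketch of how these groups can be used, assuming this file is passed as the inventory:

```bash
# Ping every physical machine via the type group defined above
ansible -i hosts physical -m ping
# Run a playbook on the fleming site only
ansible-playbook -i hosts --limit fleming --ask-vault-pass base.yml
```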

@@ -4,10 +4,9 @@
vars:
# Prometheus targets.json
prometheus_targets:
- labels: {job: node}
targets: "{{ groups['fleming'] | map('replace', '.re', '.re:9100') | list | sort }}"
- labels: {job: prometheus}
targets: ['localhost:9090']
- targets: "{{ groups['server'] | list | sort }}"
prometheus_unifi_snmp_targets:
- targets: []
roles:
- prometheus
- prometheus-alertmanager
@@ -16,10 +15,9 @@
vars:
# Prometheus targets.json
prometheus_targets:
- labels: {job: node}
targets: "{{ groups['pacaterie'] | map('replace', '.re', '.re:9100') | list | sort }}"
- labels: {job: prometheus}
targets: ['localhost:9090']
- targets: "{{ groups['server'] | list | sort }}"
prometheus_unifi_snmp_targets:
- targets: []
roles:
- prometheus
- prometheus-alertmanager
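
For illustration, `to_nice_json` would render the `prometheus_targets` list above into `/etc/prometheus/targets.json` roughly as follows (hypothetical host list):

```json
[
    {
        "targets": [
            "mordred.adm.auro.re",
            "titan.adm.auro.re"
        ]
    }
]
```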

@@ -1,12 +1,16 @@
# {{ ansible_managed }}
# See https://prometheus.io/docs/alerting/configuration/ for documentation.
global:
# The smarthost and SMTP sender used for mail notifications.
smtp_smarthost: 'proxy.auro.re:25'
smtp_from: 'prometheus@auro.re'
smtp_smarthost: 'localhost:25'
smtp_from: 'alertmanager@example.org'
#smtp_auth_username: 'alertmanager'
#smtp_auth_password: 'password'
smtp_require_tls: false
# The auth token for Hipchat.
hipchat_auth_token: '1234556789'
# Alternative host for Hipchat.
hipchat_api_url: 'https://hipchat.foobar.org/'
# The directory from which notification templates are read.
templates:
@@ -17,26 +21,25 @@ route:
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group.
#group_by: ['alertname', 'cluster', 'service']
group_by: [] # do not group for text chat
group_by: ['instance'] # group per instance
# When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' to send the initial notification.
# This way ensures that you get multiple alerts for the same group that start
# firing shortly after another are batched together on the first
# notification.
group_wait: 1m
group_wait: 30s
# When the first notification was sent, wait 'group_interval' to send a batch
# of new alerts that started firing for that group.
group_interval: 1m
group_interval: 5m
# If an alert has successfully been sent, wait 'repeat_interval' to
# resend them.
repeat_interval: 12h
# A default receiver
receiver: team-monitoring-mails
receiver: webhook
# Inhibition rules allow to mute a set of alerts given that another alert is
@@ -53,7 +56,7 @@ inhibit_rules:
receivers:
- name: 'team-monitoring-mails'
email_configs:
- to: 'monitoring.aurore@lists.crans.org'
- name: 'webhook'
webhook_configs:
- url: 'http://PUT-THE-DISCORD-WEBHOOK-URL-HERE-TODO/'
send_resolved: true
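
A sketch for validating this template once rendered, assuming `amtool` from the prometheus-alertmanager package is installed (the path below may differ):

```bash
amtool check-config /etc/prometheus/alertmanager.yml
```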

@@ -3,9 +3,25 @@
apt:
update_cache: true
name: prometheus-node-exporter
install_recommends: false # Do not install smartmontools
register: apt_result
retries: 3
until: apt_result is succeeded
when:
- ansible_lsb.codename == 'buster'
# Prometheus 2 node is in stretch-backports
- name: Install Prometheus node-exporter (stretch-backports)
apt:
update_cache: true
name: prometheus-node-exporter
install_recommends: false
default_release: stretch-backports
register: apt_result
retries: 3
until: apt_result is succeeded
when:
- ansible_lsb.codename == 'stretch'
- name: Activate prometheus-node-exporter service
systemd:
@@ -13,12 +29,11 @@
enabled: true
state: started
# Doesn't work on Debian Stretch
# Doesn't work on Debian Stretch with the old prometheus package
- name: Make Prometheus node-exporter listen on adm only
when:
- ansible_lsb.codename == 'buster'
lineinfile:
path: /etc/default/prometheus-node-exporter
regexp: '^ARGS='
line: "ARGS=\"--web.listen-address={{ ansible_hostname }}.adm.auro.re:9100\""
line: |
ARGS="--web.listen-address={{ ansible_hostname }}.adm.auro.re:9100"
notify: Restart prometheus-node-exporter
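
Once the handler has restarted the exporter, a quick way to verify that it answers on the adm address might be (hypothetical host):

```bash
curl -s http://mordred.adm.auro.re:9100/metrics | head
```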

@@ -31,12 +31,27 @@
line: "ARGS=\"--web.listen-address=127.0.0.1:9116\""
notify: Restart prometheus-snmp-exporter
# This file stores SNMP OIDs
- name: Configure Prometheus snmp-exporter
template:
src: "prometheus/snmp.yml.j2"
dest: "/etc/prometheus/snmp.yml"
mode: 0600
owner: prometheus
notify: Restart prometheus-snmp-exporter
# We don't need to restart Prometheus when updating nodes
- name: Configure Prometheus nodes
copy:
content: "{{ prometheus_targets | to_nice_json }}"
dest: /etc/prometheus/targets.json
# We don't need to restart Prometheus when updating nodes
- name: Configure Prometheus Ubiquity Unifi SNMP devices
copy:
content: "{{ prometheus_unifi_snmp_targets | to_nice_json }}"
dest: /etc/prometheus/targets_unifi_snmp.json
- name: Activate prometheus service
systemd:
name: prometheus
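
To spot-check the exporter after these tasks, one can query its probe endpoint on the loopback address configured above; the `target` below is a hypothetical access point:

```bash
curl -s 'http://127.0.0.1:9116/snmp?module=ubiquiti_unifi&target=10.128.1.10'
```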

@@ -17,7 +17,7 @@ groups:
# Alert for out of memory
- alert: OutOfMemory
expr: ((node_memory_MemFree_bytes or node_memory_MemFree) + (node_memory_Cached_bytes or node_memory_Cached) + (node_memory_Buffers_bytes or node_memory_Buffers)) / (node_memory_MemTotal_bytes or node_memory_MemTotal) * 100 < 10
expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 10
for: 5m
labels:
severity: warning
@@ -26,7 +26,7 @@ groups:
# Alert for out of disk space
- alert: OutOfDiskSpace
expr: (node_filesystem_free_bytes{fstype="ext4"} or node_filesystem_free{fstype="ext4"}) / (node_filesystem_size_bytes{fstype="ext4"} or node_filesystem_size{fstype="ext4"}) * 100 < 10
expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
for: 5m
labels:
severity: warning
@@ -44,8 +44,8 @@ groups:
# Alert for high CPU usage
- alert: CpuUsage
expr: ((100 - avg by (instance) (irate(node_cpu{mode="idle"}[5m])) * 100) or (100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 75
for: 5m
expr: (100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 75
for: 10m
labels:
severity: warning
annotations:
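
A way to validate edits to this rules file, assuming `promtool` from the prometheus package is available:

```bash
promtool check rules /etc/prometheus/alert.rules.yml
```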

@@ -11,6 +11,7 @@ global:
monitor: 'example'
# Alertmanager configuration
# Use prometheus alertmanager installed on the same machine
alerting:
alertmanagers:
- static_configs:
@@ -18,15 +19,49 @@ alerting:
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "alert.rules.yml"
- "django.rules.yml"
- "alert.rules.yml" # Monitoring alerts, this is the file you may be searching!
- "django.rules.yml" # Custom rules specific for Django project monitoring
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
- job_name: dummy
# This reload dynamically the list of targets
# You don't need to restart Prometheus when updating targets.json
# The .json in file_sd_configs is dynamically reloaded
- job_name: prometheus
static_configs:
- targets:
- localhost:9090
- job_name: servers
file_sd_configs:
- files:
- '/etc/prometheus/targets.json'
relabel_configs:
# Do not put :9100 in the instance name; it is appended here instead
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:9100'
- job_name: unifi_snmp
file_sd_configs:
- files:
- '/etc/prometheus/targets_unifi_snmp.json'
metrics_path: /snmp
params:
module: [ubiquiti_unifi]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9116
- job_name: django
scheme: https
static_configs:
- targets: []
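
Since the relabeling above rewrites `__address__`, checking the whole configuration after a change is cheap insurance, for example:

```bash
promtool check config /etc/prometheus/prometheus.yml
```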

@@ -0,0 +1,297 @@
# {{ ansible_managed }}
# TODO list:
# - Get the switch monitoring defined here working
#   * Configure all the switches with an SNMPv3 account
#   * Put the switch inventory into Ansible
# - Optimize the rules for the Unifi APs;
#   we could index by SSID
procurve_switch:
walk:
- 1.3.6.1.2.1.31.1.1.1.10
- 1.3.6.1.2.1.31.1.1.1.6
get:
- 1.3.6.1.2.1.1.3.0
- 1.3.6.1.2.1.1.5.0
- 1.3.6.1.2.1.1.6.0
metrics:
- name: sysUpTime
oid: 1.3.6.1.2.1.1.3
type: gauge
help: The time (in hundredths of a second) since the network management portion
of the system was last re-initialized. - 1.3.6.1.2.1.1.3
- name: sysName
oid: 1.3.6.1.2.1.1.5
type: DisplayString
help: An administratively-assigned name for this managed node - 1.3.6.1.2.1.1.5
- name: sysLocation
oid: 1.3.6.1.2.1.1.6
type: DisplayString
help: The physical location of this node (e.g., 'telephone closet, 3rd floor')
- 1.3.6.1.2.1.1.6
- name: ifHCOutOctets
oid: 1.3.6.1.2.1.31.1.1.1.10
type: counter
help: The total number of octets transmitted out of the interface, including framing
characters - 1.3.6.1.2.1.31.1.1.1.10
indexes:
- labelname: ifIndex
type: gauge
- name: ifHCInOctets
oid: 1.3.6.1.2.1.31.1.1.1.6
type: counter
help: The total number of octets received on the interface, including framing
characters - 1.3.6.1.2.1.31.1.1.1.6
indexes:
- labelname: ifIndex
type: gauge
version: 3
auth:
username: prometheus
ubiquiti_unifi:
walk:
- 1.3.6.1.4.1.41112.1.6
get:
- 1.3.6.1.2.1.1.5.0
- 1.3.6.1.2.1.1.6.0
metrics:
# To build a WifiMap some day, the location can be entered in the APs' configuration
# - name: sysLocation
# oid: 1.3.6.1.2.1.1.6
# type: DisplayString
# help: The physical location of this node (e.g., 'telephone closet, 3rd floor')
# - 1.3.6.1.2.1.1.6
- name: unifiVapIndex
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.1
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.1'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapChannel
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.4
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.4'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapEssId
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.6
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.6'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapName
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.7
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.7'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapNumStations
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRadio
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.9'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxBytes
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.10
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.10'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxCrypts
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.11
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.11'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxDropped
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.12
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.12'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxErrors
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.13
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.13'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxFrags
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.14
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.14'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxPackets
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.15
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.15'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxBytes
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.16
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.16'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxDropped
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.17
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.17'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxErrors
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.18
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.18'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxPackets
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.19
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.19'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxRetries
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.20
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.20'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxPower
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.21
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.21'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapUp
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.22
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.22'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapUsage
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.23
type: DisplayString
help: guest or regular user - 1.3.6.1.4.1.41112.1.6.1.2.1.23
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiIfIndex
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.1
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.1'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfName
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.5
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.5'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxBytes
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.6
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.6'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxDropped
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.7
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.7'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxError
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.8
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.8'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxMulticast
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.9
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.9'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxPackets
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.10
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.10'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfTxBytes
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.12
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.12'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfTxDropped
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.13
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.13'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfTxError
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.14
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.14'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfTxPackets
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.15
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.15'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiApSystemModel
oid: 1.3.6.1.4.1.41112.1.6.3.3
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.3.3'
- name: unifiApSystemUptime
oid: 1.3.6.1.4.1.41112.1.6.3.5
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.3.5'
version: 3
auth:
security_level: authPriv
username: snmp_prometheus
password: {{ snmp_unifi_password }}
auth_protocol: SHA
priv_protocol: AES
priv_password: {{ snmp_unifi_password }}
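
The `{{ snmp_unifi_password }}` variable is expected to come from the vault unlocked with `--ask-vault-pass` earlier; a sketch of producing such a value (hypothetical secret):

```bash
ansible-vault encrypt_string 'changeme' --name 'snmp_unifi_password'
```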