Merge branch 'master' into mailserver
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone/pr Build is failing

otthorn 2021-02-20 14:00:43 +01:00
commit b06ac0647c
42 changed files with 1014 additions and 59 deletions

View file

@ -1,7 +1,10 @@
skip_list:
- '301'
- no-changed-when
- load-failure
- document-start
warn_list:
- '305' # Use shell only when shell functionality is required
- '503' # Tasks that run when changed should likely be handlers
- experimental # all rules tagged as experimental
exclude_paths:
- group_vars/all/vault.yml

View file

@ -4,16 +4,9 @@ type: docker
name: check
steps:
- name: yamllint
image: python:3.9-alpine
- name: ansible and yaml linting
pull: never
image: aurore-ansible-lint-image
commands:
- pip install yamllint==1.25.0
- yamllint -c .yamllint.yml .
- name: ansible-lint
image: python:3.9-alpine
commands:
- apk add --no-cache gcc libc-dev libffi-dev openssl-dev
- pip install ansible-lint==4.3.7
- ansible-lint *.yml
- ansible-lint
...

View file

@ -1,19 +0,0 @@
---
image: python:3.9-alpine
stages:
- lint
yamllint:
stage: lint
script:
- pip install yamllint==1.25.0
- yamllint -c .yamllint.yml .
ansible-lint:
stage: lint
script:
- apk add gcc libc-dev libffi-dev openssl-dev
- pip install ansible-lint==4.3.7
- ansible-lint *.yml
...

View file

@ -6,6 +6,5 @@ rules:
max: 120
level: warning
document-start:
ignore: |
/groups_var/all/vault.yml
ignore: group_vars/all/vault.yml
...

View file

@ -1,3 +1,5 @@
[![Linter Status](https://drone.auro.re/api/badges/Aurore/ansible/status.svg)](https://drone.auro.re/Aurore/ansible)
# Recettes Ansible d'Aurore
Ensemble des recettes de déploiement Ansible pour les serveurs d'Aurore.

View file

@ -0,0 +1,8 @@
---
# Deploy a correctly configured postfix on non-mailhost servers
- hosts: all,!unifi
vars:
local_network: 10.128.0.0/16
relay_host: proxy.adm.auro.re
roles:
- postfix_non_mailhost

View file

@ -0,0 +1,7 @@
FROM python:3.9-alpine
LABEL description="Aurore's docker image for ansible-lint"
RUN apk add --no-cache gcc musl-dev python3-dev libffi-dev openssl-dev cargo
RUN pip install "yamllint>=1.26.0,<2.0"
RUN pip install "ansible-lint==5.0.0"
RUN pip install "ansible>=2.10,<2.11"

View file

@ -0,0 +1,18 @@
# Ansible-lint image
In order to build this image when a new version comes out, you need to:
1. ssh into the `drone.adm.auro.re` server
2. git pull this repo to the latest version
3. optionally, make the changes if they have not been done yet
4. `sudo docker build -t aurore-ansible-lint-image docker-ansible-lint/`
5. ???
6. enjoy
You can verify that the image was correctly built by running:
```
# list the images present
sudo docker image ls
# run your image with an interactive shell
sudo docker run -it --rm aurore-ansible-lint-image /bin/sh
```
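Once built, the Drone step that consumes this local image looks roughly like the following (a sketch pieced together from the .drone.yml change earlier in this diff; `pull: never` stops Drone from trying to fetch the image from a registry):
```
steps:
  - name: ansible and yaml linting
    pull: never                       # use the image built locally on drone.adm.auro.re
    image: aurore-ansible-lint-image
    commands:
      - yamllint -c .yamllint.yml .
      - ansible-lint
```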

hosts (+4)
View file

@ -36,6 +36,7 @@ mail.auro.re
wikijs.adm.auro.re
prometheus-aurore.adm.auro.re
portail.adm.auro.re
jitsi-aurore.adm.auro.re
[aurore_testing_vm]
pendragon.adm.auro.re
@ -62,6 +63,8 @@ vpn-ovh.adm.auro.re
docker-ovh.adm.auro.re
switchs-manager.adm.auro.re
ldap-replica-ovh.adm.auro.re
prometheus-ovh.adm.auro.re
prometheus-federate.adm.auro.re
[ovh_testing_vm]
#re2o-test.adm.auro.re
@ -266,6 +269,7 @@ ep-1-3.borne.auro.re
ep-1-2.borne.auro.re
ep-0-1.borne.auro.re
eo-2-1.borne.auro.re
ee-2-1.borne.auro.re
###############################################################################
# George Sand

View file

@ -1,6 +1,6 @@
#!/usr/bin/env ansible-playbook
---
- hosts: prometheus-fleming.adm.auro.re,prometheus-fleming-fo.adm.auro.re
- hosts: prometheus-fleming.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
@ -88,10 +88,43 @@
# Prometheus targets.json
prometheus_targets:
- targets: |
{{ groups['aurore_pve'] + groups['aurore_vm'] + groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
{{ groups['aurore_pve'] + groups['aurore_vm'] | list | sort }}
roles:
- prometheus
- hosts: prometheus-ovh.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# Prometheus targets.json
prometheus_targets:
- targets: |
{{ groups['ovh_pve'] + groups['ovh_vm'] | list | sort }}
prometheus_docker_targets:
- docker-ovh.adm.auro.re:8087
roles:
- prometheus
- hosts: prometheus-federate.adm.auro.re
vars:
prometheus_alertmanager: docker-ovh.adm.auro.re:9093
snmp_unifi_password: "{{ vault_snmp_unifi_password }}"
# Prometheus targets.json
prometheus_targets:
- prometheus-edc.adm.auro.re
- prometheus-gs.adm.auro.re
- prometheus-fleming.adm.auro.re
- prometheus-pacaterie.adm.auro.re
- prometheus-rives.adm.auro.re
- prometheus-aurore.adm.auro.re
- prometheus-ovh.adm.auro.re
- prometheus-federate.adm.auro.re
roles:
- prometheus_federate
# Monitor all hosts
- hosts: all,!edc_unifi,!fleming_unifi,!pacaterie_unifi,!gs_unifi,!rives_unifi,!aurore_testing_vm,!ovh_container

View file

@ -43,7 +43,7 @@
# username: service-user
# password: "{{ vault_serviceuser_passwd }}"
# roles:
# - re2o-service
# - re2o_service
# Deploy Unifi Controller
@ -62,4 +62,4 @@
# username: service-user
# password: "{{ vault_serviceuser_passwd }}"
# roles:
# - re2o-service
# - re2o_service

View file

@ -1,6 +1,6 @@
---
- name: force run dhcp re2o-service
shell: /var/local/re2o-services/dhcp/main.py --force
command: /var/local/re2o-services/dhcp/main.py --force
become_user: re2o-services
- name: restart dhcpd

View file

@ -1,7 +1,7 @@
---
- name: Install dhcp (re2o-service)
import_role:
name: re2o-service
name: re2o_service
vars:
service_repo: https://gitlab.federez.net/re2o/dhcp.git
service_name: dhcp
@ -18,7 +18,7 @@
owner: re2o-services
group: nogroup
recurse: true
mode: 755
mode: 0755
- name: Install isc-dhcp-server
apt:

View file

@ -0,0 +1,10 @@
---
- name: restart postfix
service:
name: postfix
state: restarted
- name: reload postfix
service:
name: postfix
state: reloaded

View file

@ -0,0 +1,17 @@
---
- name: Install postfix
apt:
name: postfix
update_cache: true
register: result
retries: 3
until: result is succeeded
- name: Configure postfix
template:
src: main.cf.j2
dest: /etc/postfix/main.cf
mode: 0644
owner: root
group: root
notify: restart postfix

View file

@ -0,0 +1,32 @@
# {{ ansible_managed }}
# See /usr/share/postfix/main.cf.dist for a commented, more complete version
# Template based on /usr/share/postfix/main.cf.debian
smtpd_banner = $myhostname ESMTP $mail_name (Debian/GNU)
biff = no
# appending .domain is the MUA's job.
append_dot_mydomain = no
# Uncomment the next line to generate "delayed mail" warnings
#delay_warning_time = 4h
readme_directory = no
# See http://www.postfix.org/COMPATIBILITY_README.html -- default to 2 on
# fresh installs.
compatibility_level = 2
# Send mail as user@{{ ansible_fqdn }}
myhostname = {{ ansible_fqdn }}
myorigin = $myhostname
mydomain = $myhostname
# Specify the trusted networks
mynetworks = 127.0.0.0/8 {{ local_network }}
# This host does not relay mail from untrusted networks
relay_domains =
# This is needed if no direct Internet access is available
relayhost = {{ relay_host }}
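
With the variables from the postfix_non_mailhost playbook earlier in this diff (local_network: 10.128.0.0/16, relay_host: proxy.adm.auro.re), the network and relay lines of this template render as:
```
mynetworks = 127.0.0.0/8 10.128.0.0/16
relay_domains =
relayhost = proxy.adm.auro.re
```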

View file

@ -64,6 +64,13 @@
mode: 0644
when: prometheus_ups_snmp_targets is defined
- name: Configure Prometheus docker monitoring
copy:
content: "{{ [{'targets': prometheus_docker_targets }] | to_nice_json }}\n"
dest: /etc/prometheus/targets_docker.json
mode: 0644
when: prometheus_docker_targets is defined
- name: Activate prometheus service
systemd:
name: prometheus
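
For reference, with prometheus_docker_targets set as in the prometheus.yml playbook above (docker-ovh.adm.auro.re:8087), the docker-monitoring task writes roughly the following to /etc/prometheus/targets_docker.json:
```
[
    {
        "targets": [
            "docker-ovh.adm.auro.re:8087"
        ]
    }
]
```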

View file

@ -22,7 +22,7 @@ groups:
labels:
severity: warning
annotations:
summary: "Mémoire libre de {{ $labels.instance }} à {{ $value | printf "%.2f" }}%."
summary: "Mémoire libre de {{ $labels.instance }} à {{ humanize $value }}%."
# Alert for out of disk space
- alert: OutOfDiskSpace
@ -31,7 +31,7 @@ groups:
labels:
severity: warning
annotations:
summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ $value | printf "%.2f" }}%."
summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.instance }} à {{ humanize $value }}%."
# Alert for out of inode space on disk
- alert: OutOfInodes
@ -49,7 +49,7 @@ groups:
labels:
severity: warning
annotations:
summary: "CPU sur {{ $labels.instance }} à {{ $value | printf "%.2f" }}%."
summary: "CPU sur {{ $labels.instance }} à {{ humanize $value }}%."
# Check systemd unit (> buster)
- alert: SystemdServiceFailed
@ -59,11 +59,20 @@ groups:
severity: warning
annotations:
summary: "{{ $labels.name }} a échoué sur {{ $labels.instance }}"
# Check load of instance
- alert: LoadUsage
expr: node_load1 > 5
for: 2m
labels:
severity: warning
annotations:
summary: "La charge de {{ $labels.instance }} est à {{ $value }} !"
# Check UPS
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 5m
for: 1m
labels:
severity: warning
annotations:
@ -71,7 +80,7 @@ groups:
- alert: UpsBatteryStatusWarning
expr: upsBatteryStatus == 3
for: 5m
for: 2m
labels:
severity: warning
annotations:
@ -79,7 +88,7 @@ groups:
- alert: UpsBatteryStatusCritical
expr: upsBatteryStatus == 4
for: 5m
for: 10m
labels:
severity: warning
annotations:
@ -95,7 +104,7 @@ groups:
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 5m
for: 10m
labels:
severity: warning
annotations:
@ -103,7 +112,7 @@ groups:
- alert: UpsWrongOutputVoltage
expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
for: 5m
for: 10m
labels:
severity: warning
annotations:
@ -111,7 +120,7 @@ groups:
- alert: UpsTimeRemainingWarning
expr: upsEstimatedMinutesRemaining < 15
for: 5m
for: 1m
labels:
severity: warning
annotations:
@ -119,7 +128,7 @@ groups:
- alert: UpsTimeRemainingCritical
expr: upsEstimatedMinutesRemaining < 5
for: 5m
for: 1m
labels:
severity: critical
annotations:

View file

@ -81,3 +81,7 @@ scrape_configs:
- target_label: __address__
replacement: 127.0.0.1:9116
- job_name: docker
file_sd_configs:
- files:
- '/etc/prometheus/targets_docker.json'

View file

@ -162,13 +162,31 @@ ubiquiti_unifi:
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapNumStations
- name: unifi_vap_num_stations
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8'
indexes:
- labelname: unifiVapIndex
- labelname: unifi_vap_index
type: gauge
lookups:
- labels: [unifi_vap_index]
labelname: unifi_vap_essid
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.6
type: DisplayString
- labels: [unifi_vap_index]
labelname: unifi_vap_radio
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9
type: DisplayString
- labels: []
labelname: unifi_vap_index
# - name: unifiVapNumStations
# oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8
# type: gauge
# help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8'
# indexes:
# - labelname: unifiVapIndex
# type: gauge
- name: unifiVapRadio
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9
type: DisplayString
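
Assuming these lookups behave as in a stock snmp_exporter configuration, the ESSID and radio OIDs are resolved into labels and the bare numeric index label is dropped, so the exporter should expose series roughly like this (SSID and values hypothetical):
```
unifi_vap_num_stations{unifi_vap_essid="Aurore-WiFi", unifi_vap_radio="ng"} 12
unifi_vap_num_stations{unifi_vap_essid="Aurore-WiFi", unifi_vap_radio="na"} 7
```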

View file

@ -0,0 +1,5 @@
---
- name: Restart Prometheus
service:
name: prometheus
state: restarted

View file

@ -0,0 +1,46 @@
---
- name: Install Prometheus
apt:
update_cache: true
name:
- prometheus
register: apt_result
retries: 3
until: apt_result is succeeded
- name: Configure Prometheus
template:
src: prometheus/prometheus.yml.j2
dest: /etc/prometheus/prometheus.yml
mode: 0644
notify: Restart Prometheus
- name: Configure Prometheus alert rules
template:
src: "prometheus/{{ item }}.j2"
dest: "/etc/prometheus/{{ item }}"
mode: 0644
notify: Restart Prometheus
loop:
- alert.rules.yml
- django.rules.yml
# We don't need to restart Prometheus when updating nodes
- name: Configure Prometheus Federate devices
copy:
content: "{{ [{'targets': prometheus_targets }] | to_nice_json }}"
dest: /etc/prometheus/targets.json
mode: 0644
when: prometheus_targets is defined
- name: Activate prometheus service
systemd:
name: prometheus
enabled: true
state: started
- name: Indicate role in motd
template:
src: update-motd.d/05-service.j2
dest: /etc/update-motd.d/05-prometheus
mode: 0755
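
As an illustration, on prometheus-federate.adm.auro.re the prometheus_targets list from the playbook earlier in this diff would be rendered by the "Configure Prometheus Federate devices" task into /etc/prometheus/targets.json roughly as:
```
[
    {
        "targets": [
            "prometheus-edc.adm.auro.re",
            "prometheus-gs.adm.auro.re",
            "prometheus-fleming.adm.auro.re",
            "prometheus-pacaterie.adm.auro.re",
            "prometheus-rives.adm.auro.re",
            "prometheus-aurore.adm.auro.re",
            "prometheus-ovh.adm.auro.re",
            "prometheus-federate.adm.auro.re"
        ]
    }
]
```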

View file

@ -0,0 +1,138 @@
# {{ ansible_managed }}
{# As this is also Jinja2 it will conflict without a raw block #}
{# Rules can change depending on the Prometheus Node exporter version #}
{% raw %}
groups:
- name: alert.rules
rules:
# Alert for any instance that is unreachable for >3 minutes.
- alert: InstanceDown
expr: up == 0
for: 3m
labels:
severity: critical
annotations:
summary: "Federate : {{ $labels.exported_instance }} est invisible depuis plus de 3 minutes !"
# Alert for out of memory
- alert: OutOfMemory
expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Federate : Mémoire libre de {{ $labels.exported_instance }} à {{ humanize $value }}%."
# Alert for out of disk space
- alert: OutOfDiskSpace
expr: node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"} * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Espace libre de {{ $labels.mountpoint }} sur {{ $labels.exported_instance }} à {{ humanize $value }}%."
# Alert for out of inode space on disk
- alert: OutOfInodes
expr: node_filesystem_files_free{fstype="ext4"} / node_filesystem_files{fstype="ext4"} * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Federate : Presque plus d'inodes disponibles ({{ $value }}% restant) dans {{ $labels.mountpoint }} sur {{ $labels.exported_instance }}."
# Alert for high CPU usage
- alert: CpuUsage
expr: (100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 75
for: 10m
labels:
severity: warning
annotations:
summary: "Federate : CPU sur {{ $labels.exported_instance }} à {{ humanize $value }}%."
# Check systemd unit (> buster)
- alert: SystemdServiceFailed
expr: node_systemd_unit_state{state="failed"} == 1
for: 10m
labels:
severity: warning
annotations:
summary: "Federate : {{ $labels.name }} a échoué sur {{ $labels.exported_instance }}"
# Check load of instance
- alert: LoadUsage
expr: node_load1 > 5
for: 2m
labels:
severity: warning
annotations:
summary: "Federate : la charge de {{ $labels.exported_instance }} est à {{ $value }} !"
# Check UPS
- alert: UpsOutputSourceChanged
expr: upsOutputSource != 3
for: 1m
labels:
severity: warning
annotations:
summary: "Federate : La source d'alimentation de {{ $labels.exported_instance }} a changé !"
- alert: UpsBatteryStatusWarning
expr: upsBatteryStatus == 3
for: 2m
labels:
severity: warning
annotations:
summary: "Federate : L'état de la batterie de {{ $labels.exported_instance }} est faible !"
- alert: UpsBatteryStatusCritical
expr: upsBatteryStatus == 4
for: 10m
labels:
severity: warning
annotations:
summary: "L'état de la batterie de {{ $labels.exported_instance }} est affaibli !"
- alert: UpsHighLoad
expr: upsOutputPercentLoad > 70
for: 5m
labels:
severity: critical
annotations:
summary: "Federate : La charge de {{ $labels.exported_instance }} est de {{ $value }}% !"
- alert: UpsWrongInputVoltage
expr: (upsInputVoltage < 210) or (upsInputVoltage > 250)
for: 10m
labels:
severity: warning
annotations:
summary: "Federate : La tension d'entrée de {{ $labels.exported_instance }} est de {{ $value }}V."
- alert: UpsWrongOutputVoltage
expr: (upsOutputVoltage < 220) or (upsOutputVoltage > 240)
for: 10m
labels:
severity: warning
annotations:
summary: "Federate : La tension de sortie de {{ $labels.exported_instance }} est de {{ $value }}V."
- alert: UpsTimeRemainingWarning
expr: upsEstimatedMinutesRemaining < 15
for: 1m
labels:
severity: warning
annotations:
summary: "Federate : L'autonomie restante sur {{ $labels.exported_instance }} est de {{ $value }} min."
- alert: UpsTimeRemainingCritical
expr: upsEstimatedMinutesRemaining < 5
for: 1m
labels:
severity: critical
annotations:
summary: "Federate : L'autonomie restante sur {{ $labels.exported_instance }} est de {{ $value }} min."
{% endraw %}

View file

@ -0,0 +1,106 @@
# {{ ansible_managed }}
{# As this is also Jinja2 it will conflict without a raw block #}
{% raw %}
groups:
- name: django.rules
rules:
- record: job:django_http_requests_before_middlewares_total:sum_rate30s
expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job)
- record: job:django_http_requests_unknown_latency_total:sum_rate30s
expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job)
- record: job:django_http_ajax_requests_total:sum_rate30s
expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job)
- record: job:django_http_responses_before_middlewares_total:sum_rate30s
expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job)
- record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s
expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s]))
BY (job)
- record: job:django_http_requests_body_total_bytes:sum_rate30s
expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job)
- record: job:django_http_responses_streaming_total:sum_rate30s
expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job)
- record: job:django_http_responses_body_total_bytes:sum_rate30s
expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job)
- record: job:django_http_requests_total:sum_rate30s
expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job)
- record: job:django_http_requests_total_by_method:sum_rate30s
expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method)
- record: job:django_http_requests_total_by_transport:sum_rate30s
expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport)
- record: job:django_http_requests_total_by_view:sum_rate30s
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job,
view)
- record: job:django_http_requests_total_by_view_transport_method:sum_rate30s
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job,
view, transport, method)
- record: job:django_http_responses_total_by_templatename:sum_rate30s
expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename)
- record: job:django_http_responses_total_by_status:sum_rate30s
expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status)
- record: job:django_http_responses_total_by_charset:sum_rate30s
expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset)
- record: job:django_http_exceptions_total_by_type:sum_rate30s
expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type)
- record: job:django_http_exceptions_total_by_view:sum_rate30s
expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view)
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "50"
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "95"
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "99"
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "99.9"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "50"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "95"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "99"
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
BY (job, le))
labels:
quantile: "99.9"
- record: job:django_model_inserts_total:sum_rate1m
expr: sum(rate(django_model_inserts_total[1m])) BY (job, model)
- record: job:django_model_updates_total:sum_rate1m
expr: sum(rate(django_model_updates_total[1m])) BY (job, model)
- record: job:django_model_deletes_total:sum_rate1m
expr: sum(rate(django_model_deletes_total[1m])) BY (job, model)
- record: job:django_db_new_connections_total:sum_rate30s
expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor)
- record: job:django_db_new_connection_errors_total:sum_rate30s
expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor)
- record: job:django_db_execute_total:sum_rate30s
expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor)
- record: job:django_db_execute_many_total:sum_rate30s
expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor)
- record: job:django_db_errors_total:sum_rate30s
expr: sum(rate(django_db_errors_total[30s])) BY (alias, vendor, type)
- record: job:django_migrations_applied_total:max
expr: max(django_migrations_applied_total) BY (job, connection)
- record: job:django_migrations_unapplied_total:max
expr: max(django_migrations_unapplied_total) BY (job, connection)
{% endraw %}

View file

@ -0,0 +1,56 @@
# {{ ansible_managed }}
global:
# scrape_interval is set to the global default (60s)
# evaluation_interval is set to the global default (60s)
# scrape_timeout is set to the global default (10s).
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
external_labels:
monitor: 'example'
# Alertmanager configuration
# Use prometheus alertmanager installed on the same machine
alerting:
alertmanagers:
- static_configs:
- targets: ['{{ prometheus_alertmanager }}']
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "alert.rules.yml" # Monitoring alerts, this is the file you may be searching!
- "django.rules.yml" # Custom rules specific for Django project monitoring
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The .json in file_sd_configs is dynamically reloaded
- job_name: federate
scrape_interval: 15s
metrics_path: '/federate'
file_sd_configs:
- files:
- '/etc/prometheus/targets.json'
relabel_configs:
# Do not put the port in the instance name in targets.json; it is appended via relabeling here
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
target_label: __address__
replacement: '$1:9090'
params:
'match[]':
- '{job="servers"}'
- '{job="prometheus"}'
- '{job="unifi_snmp"}'
- '{job="django"}'
- '{job="ups_snmp"}'
- '{job="django"}'
- '{job="docker"}'
- '{job="switch"}'

View file

@ -0,0 +1,387 @@
# {{ ansible_managed }}
# TODOlist :
# - Faire fonctionner le monitoring des switchs défini ici
# * Configurer tous les switchs avec un compte SNMPv3
# * Mettre l'inventaire des switchs dans Ansible
# - Optimiser les règles pour les bornes Unifi,
# on pourrait indexer avec les SSID
eatonups:
walk:
- 1.3.6.1.2.1.33.1.2
- 1.3.6.1.2.1.33.1.3
- 1.3.6.1.2.1.33.1.4
- 1.3.6.1.4.1.534.1.6
get:
- 1.3.6.1.2.1.1.3.0
metrics:
- name: sysUpTime
oid: 1.3.6.1.2.1.1.3
type: gauge
help: The time (in hundredths of a second) since the network management portion
of the system was last re-initialized. - 1.3.6.1.2.1.1.3
- name: upsBatteryStatus
oid: 1.3.6.1.2.1.33.1.2.1
type: gauge
help: The indication of the capacity remaining in the UPS system's batteries -
1.3.6.1.2.1.33.1.2.1
- name: upsEstimatedMinutesRemaining
oid: 1.3.6.1.2.1.33.1.2.3
type: gauge
help: An estimate of the time to battery charge depletion under the present load
conditions if the utility power is off and remains off, or if it were to be
lost and remain off. - 1.3.6.1.2.1.33.1.2.3
- name: upsInputVoltage
oid: 1.3.6.1.2.1.33.1.3.3.1.3
type: gauge
help: The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3
indexes:
- labelname: upsInputLineIndex
type: gauge
- name: upsOutputSource
oid: 1.3.6.1.2.1.33.1.4.1
type: gauge
help: The present source of output power - 1.3.6.1.2.1.33.1.4.1
- name: upsOutputVoltage
oid: 1.3.6.1.2.1.33.1.4.4.1.2
type: gauge
help: The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2
indexes:
- labelname: upsOutputLineIndex
type: gauge
- name: upsOutputPower
oid: 1.3.6.1.2.1.33.1.4.4.1.4
type: gauge
help: The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4
indexes:
- labelname: upsOutputLineIndex
type: gauge
- name: upsOutputPercentLoad
oid: 1.3.6.1.2.1.33.1.4.4.1.5
type: gauge
help: The percentage of the UPS power capacity presently being used on this output
line, i.e., the greater of the percent load of true power capacity and the percent
load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5
indexes:
- labelname: upsOutputLineIndex
type: gauge
- name: xupsEnvRemoteTemp
oid: 1.3.6.1.4.1.534.1.6.5
type: gauge
help: The reading of an EMP's temperature sensor. - 1.3.6.1.4.1.534.1.6.5
- name: xupsEnvRemoteHumidity
oid: 1.3.6.1.4.1.534.1.6.6
type: gauge
help: The reading of an EMP's humidity sensor. - 1.3.6.1.4.1.534.1.6.6
version: 1
auth:
community: public
procurve_switch:
walk:
- 1.3.6.1.2.1.31.1.1.1.10
- 1.3.6.1.2.1.31.1.1.1.6
get:
- 1.3.6.1.2.1.1.3.0
- 1.3.6.1.2.1.1.5.0
- 1.3.6.1.2.1.1.6.0
metrics:
- name: sysUpTime
oid: 1.3.6.1.2.1.1.3
type: gauge
help: The time (in hundredths of a second) since the network management portion
of the system was last re-initialized. - 1.3.6.1.2.1.1.3
- name: sysName
oid: 1.3.6.1.2.1.1.5
type: DisplayString
help: An administratively-assigned name for this managed node - 1.3.6.1.2.1.1.5
- name: sysLocation
oid: 1.3.6.1.2.1.1.6
type: DisplayString
help: The physical location of this node (e.g., 'telephone closet, 3rd floor')
- 1.3.6.1.2.1.1.6
- name: ifHCOutOctets
oid: 1.3.6.1.2.1.31.1.1.1.10
type: counter
help: The total number of octets transmitted out of the interface, including framing
characters - 1.3.6.1.2.1.31.1.1.1.10
indexes:
- labelname: ifIndex
type: gauge
- name: ifHCInOctets
oid: 1.3.6.1.2.1.31.1.1.1.6
type: counter
help: The total number of octets received on the interface, including framing
characters - 1.3.6.1.2.1.31.1.1.1.6
indexes:
- labelname: ifIndex
type: gauge
version: 3
auth:
username: prometheus
ubiquiti_unifi:
walk:
- 1.3.6.1.4.1.41112.1.6
get:
- 1.3.6.1.2.1.1.5.0
- 1.3.6.1.2.1.1.6.0
metrics:
# Pour faire une WifiMap un jour, on peut entrer la location dans la conf des bornes
# - name: sysLocation
# oid: 1.3.6.1.2.1.1.6
# type: DisplayString
# help: The physical location of this node (e.g., 'telephone closet, 3rd floor')
# - 1.3.6.1.2.1.1.6
- name: unifiVapIndex
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.1
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.1'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapChannel
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.4
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.4'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapEssId
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.6
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.6'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapName
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.7
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.7'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifi_vap_num_stations
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8'
indexes:
- labelname: unifi_vap_index
type: gauge
lookups:
- labels: [unifi_vap_index]
labelname: unifi_vap_essid
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.6
type: DisplayString
- labels: [unifi_vap_index]
labelname: unifi_vap_radio
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9
type: DisplayString
- labels: []
labelname: unifi_vap_index
# - name: unifiVapNumStations
# oid: 1.3.6.1.4.1.41112.1.6.1.2.1.8
# type: gauge
# help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.8'
# indexes:
# - labelname: unifiVapIndex
# type: gauge
- name: unifiVapRadio
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.9
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.9'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxBytes
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.10
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.10'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxCrypts
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.11
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.11'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxDropped
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.12
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.12'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxErrors
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.13
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.13'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxFrags
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.14
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.14'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapRxPackets
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.15
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.15'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxBytes
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.16
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.16'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxDropped
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.17
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.17'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxErrors
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.18
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.18'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxPackets
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.19
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.19'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxRetries
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.20
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.20'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapTxPower
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.21
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.21'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapUp
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.22
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.1.2.1.22'
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiVapUsage
oid: 1.3.6.1.4.1.41112.1.6.1.2.1.23
type: DisplayString
help: guest or regular user - 1.3.6.1.4.1.41112.1.6.1.2.1.23
indexes:
- labelname: unifiVapIndex
type: gauge
- name: unifiIfIndex
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.1
type: gauge
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.1'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfName
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.5
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.5'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxBytes
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.6
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.6'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxDropped
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.7
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.7'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxError
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.8
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.8'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxMulticast
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.9
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.9'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfRxPackets
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.10
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.10'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfTxBytes
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.12
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.12'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfTxDropped
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.13
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.13'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfTxError
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.14
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.14'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiIfTxPackets
oid: 1.3.6.1.4.1.41112.1.6.2.1.1.15
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.2.1.1.15'
indexes:
- labelname: unifiIfIndex
type: gauge
- name: unifiApSystemModel
oid: 1.3.6.1.4.1.41112.1.6.3.3
type: DisplayString
help: ' - 1.3.6.1.4.1.41112.1.6.3.3'
- name: unifiApSystemUptime
oid: 1.3.6.1.4.1.41112.1.6.3.5
type: counter
help: ' - 1.3.6.1.4.1.41112.1.6.3.5'
version: 3
auth:
security_level: authPriv
username: snmp_prometheus
password: {{ snmp_unifi_password }}
auth_protocol: SHA
priv_protocol: AES
priv_password: {{ snmp_unifi_password }}

View file

@ -0,0 +1,4 @@
#!/bin/sh
# {{ ansible_managed }}
echo "> prometheus a été déployé sur cette machine."
echo " Voir /etc/prometheus/"

View file

@ -106,12 +106,11 @@
- name: Install radius requirements (except freeradius-python3)
shell:
cmd: "{{ item }}"
cmd: "cat apt_requirements_radius.txt | grep -v freeradius-python3 | xargs apt-get -y install"
chdir: /var/www/re2o/
loop:
- "cat apt_requirements_radius.txt | grep -v freeradius-python3 | xargs apt-get -y install"
- "pip3 install -r pip_requirements.txt"
- name: Install PyPI requirements for radius
command: "pip3 install -r /var/www/re2o/pip_requirements.txt"
# End of hideousness (hopefully).

View file

@ -40,7 +40,7 @@
- name: Install aurore-firewall (re2o-service)
import_role:
name: re2o-service
name: re2o_service
vars:
service_repo: https://gitea.auro.re/Aurore/aurore-firewall.git
service_name: aurore-firewall

utils/README.md Normal file (+4)
View file

@ -0,0 +1,4 @@
# Utils
A collection of Ansible playbooks that are useful as small scripts or
miscellaneous utilities, but are not used in production.

utils/re2o_mail_server.yml Executable file (+13)
View file

@ -0,0 +1,13 @@
---
# Deploy Re2o mail service
- hosts: mail.auro.re
vars:
service_repo: https://gitea.auro.re/aurore/re2o-mail-server.git
service_name: mail-server
service_version: aurore
service_config:
hostname: re2o-test.adm.auro.re # use test instance for now, should be changed for prod!
username: service-user
password: "{{ vault_serviceuser_passwd }}"
roles:
- re2o-service

utils/reboot_needed_check.yml Executable file (+31)
View file

@ -0,0 +1,31 @@
#!/usr/bin/env ansible-playbook
---
# Check if a reboot is required by the installation of some packages (i.e. a new kernel)
- hosts: localhost
tasks:
- name: Make sure the local file exists but is empty # weird hack, I know
copy:
dest: /tmp/ansible_dump_reboot_needed.txt
content: ""
force: true
mode: 0644
- hosts: all,!unifi,!escalope.adm.auro.re,!loki.adm.auro.re,!viviane.adm.auro.re,!vpn-ovh.adm.auro.re
tasks:
# Register the output of the file /var/run/reboot-required.pkgs
- name: Register if a reboot is required
shell: if [ -e /var/run/reboot-required.pkgs ]; then cat /var/run/reboot-required.pkgs; fi
register: result
- name: DEBUG
debug:
msg: "{{ ansible_facts['nodename'] }} : {{ result.stdout }}"
when: result.stdout is defined
# Add info line by line
- name: Dump all info into the local file
delegate_to: localhost
lineinfile:
path: /tmp/ansible_dump_reboot_needed.txt
line: "{{ ansible_facts['nodename'] }} : {{ result.stdout }}"
when: result.stdout is defined
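
After a run, /tmp/ansible_dump_reboot_needed.txt holds one line per reachable host with the contents of /var/run/reboot-required.pkgs (empty when no reboot is pending); hypothetically:
```
pendragon : linux-image-4.19.0-14-amd64
portail :
```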

utils/version_check.yml Executable file (+21)
View file

@ -0,0 +1,21 @@
#!/usr/bin/env ansible-playbook
---
# Check for the distribution
- hosts: localhost
tasks:
- name: Make sure the local file exists but is empty # weird hack, I know
copy:
dest: /tmp/ansible_dump_dist_version.txt
content: ""
force: true
mode: 0644
- hosts: all,!unifi
tasks:
# Add info line by line
- name: Dump all info into the local file
delegate_to: localhost
lineinfile:
path: /tmp/ansible_dump_dist_version.txt
line: "[{{ ansible_facts['nodename'] }}] {{ ansible_fqdn }} : {{
ansible_distribution }} {{ ansible_distribution_version }}"
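
The resulting /tmp/ansible_dump_dist_version.txt then contains one line per host in the form "[nodename] fqdn : distribution version", for example (hostnames and versions hypothetical):
```
[pendragon] pendragon.adm.auro.re : Debian 10.8
[portail] portail.adm.auro.re : Debian 9.13
```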