diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index 3fe3db8..075da46 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -42,6 +42,7 @@
     mode: u=r,g=r,o=
   loop:
     - server.rules.yml
+    - docker.rules.yml
     - django.rules.yml
     - ups.rules.yml
     - postgres.rules.yml
diff --git a/roles/prometheus/templates/docker.rules.yml.j2 b/roles/prometheus/templates/docker.rules.yml.j2
new file mode 100644
index 0000000..8ccc565
--- /dev/null
+++ b/roles/prometheus/templates/docker.rules.yml.j2
@@ -0,0 +1,60 @@
+---
+{{ ansible_managed | comment }}
+
+{#
+  raw(string) wraps its argument in literal double curly braces so that the
+  rendered rules file contains a Prometheus template expression instead of
+  Jinja trying to evaluate it at deploy time.
+#}
+{% macro raw(string) -%}
+{{ "{{" }} {{ string }} {{ "}}" }}
+{%- endmacro %}
+
+groups:
+
+  - name: docker.rules
+    rules:
+
+      # Fires when a container reports a running state of 0.
+      # NOTE(review): assumes the exporter reports 0 for a stopped container
+      # and 1 for a running one; confirm against the exporter documentation.
+      - alert: ContainerDown
+        expr: docker_container_running_state == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Container Docker mort
+            (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }})
+
+      # Fires when one container restarted more than twice in five minutes.
+      # Grouping by instance and name keeps the labels used in the summary;
+      # a bare sum() would drop every label.
+      - alert: ContainerFailed
+        expr: sum by (instance, name) (increase(docker_container_restart_count[5m])) > 2
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Container Docker redémarre souvent
+            (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }})
+
+      # Fires when the used/capacity CPU ratio of a container exceeds 30 %.
+      # NOTE(review): both metrics end in _total and may be cumulative
+      # counters; if so, wrap them in rate() over a window. TODO confirm.
+      - alert: ContainerHighCpuUsage
+        expr: >-
+          (docker_container_cpu_used_total / docker_container_cpu_capacity_total)
+          * 100 > 30
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: >-
+            Container Docker utilise beaucoup de CPU
+            (instance {{ raw('$labels.instance') }}, container {{ raw('$labels.name') }},
+            value {{ raw('$value | printf "%.1f"') }})
+
+...