2023-11-01 05:57:21 +01:00
|
|
|
---
|
|
|
|
# Alertmanager endpoints, one "host:port" entry per list item.
prometheus__alertmanager_targets:
  - docker-ovh.adm.auro.re:9093
|
|
|
|
|
|
|
|
# TSDB retention period, kept as a string.
# NOTE(review): presumably forwarded to Prometheus' retention setting by
# the role; confirm against the role's templates.
prometheus__tsdb_retention_time: "90d"
|
|
|
|
|
|
|
|
# Scrape jobs, keyed by job name. Each job lists its targets (an inventory
# group expression) and an address, which is either a mapping carrying the
# exporter port or a literal "host:port" string.
# NOTE(review): the nesting of `port` under `address` and of `module` under
# `params` is reconstructed from the key order; confirm against the role.
prometheus__scraping:
  node:
    targets: "{{ groups.vm_network + groups.pve_network }}"
    address:
      port: 9100
  prometheus:
    targets: "{{ groups.prom }}"
    address:
      port: 9090
  kresd:
    targets: "{{ groups.dns }}"
    address:
      port: 8453
  bird:
    targets: "{{ groups.router }}"
    address:
      port: 9324
  # Scraped through a fixed local address with a /snmp path and a module
  # parameter, unlike the other jobs which only set a port.
  quanta:
    targets: "{{ groups.quanta }}"
    address: "127.0.0.1:9116"
    path: /snmp
    params:
      module:
        - quanta
  snmp:
    targets: "{{ groups.prom }}"
    address:
      port: 9116
|
|
|
|
|
|
|
|
# Alert rules about Prometheus itself.
prometheus__alert_rules_prometheus:
  # Fires immediately (for: 0m) when the failed-compaction counter grew
  # during the last minute.
  - alert: PrometheusTsdbCompactionFailed
    expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
    for: 0m
    labels:
      severity: critical
|
|
|
|
|
|
|
|
# Host-level alert rules (node exporter and smartmon metrics).
# Summaries are Jinja expressions using the `interp` / `interp_float`
# filters on a quoted '$labels.x' / '$value' string.
# NOTE(review): those filters are not defined in this file; presumably they
# emit Prometheus template placeholders. Confirm in the role's filter
# plugins.
# Multi-line expressions use folded scalars (>-): line breaks fold to one
# space, giving the same string as a quoted scalar split across lines.
prometheus__alert_rules_node:
  - alert: MachineDown
    expr: "up == 0"
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "Collecteur {{ '$labels.job' | interp }}"

  # Less than 10% of total memory is free, cached or in buffers.
  - alert: OutOfMemory
    expr: >-
      ( node_memory_MemFree_bytes
      + node_memory_Cached_bytes
      + node_memory_Buffers_bytes )
      / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Mémoire libre à {{ '$value' | interp_float }}%"

  # At least 50% of swap is in use.
  - alert: HostSwapIsFillingUp
    expr: >-
      ( 1 - ( node_memory_SwapFree_bytes
      / node_memory_SwapTotal_bytes ) )
      * 100 >= 50
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "Swap {{ '$value' | interp_float }}%"

  - alert: HostPhysicalComponentTooHot
    expr: "node_hwmon_temp_celsius > 79"
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: >-
        {{ '$value' | interp_float }}°C :
        {{ '$labels.chip' | interp }},
        {{ '$labels.sensor' | interp }}

  - alert: HostNodeOvertemperatureAlarm
    expr: "node_hwmon_temp_crit_alarm_celsius == 1"
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: >-
        {{ '$labels.chip' | interp }},
        {{ '$labels.sensor' | interp }}

  - alert: HostRaidArrayGotInactive
    expr: 'node_md_state{state="inactive"} > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "{{ '$labels.device' | interp }}"

  - alert: HostRaidDiskFailure
    expr: 'node_md_disks{state="failed"} > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      # Fixed: this annotation was keyed `severity`, so the alert carried
      # no summary, unlike every other annotated rule in this list.
      summary: "{{ '$labels.md_device' | interp }}"

  - alert: HostOomKillDetected
    expr: "increase(node_vmstat_oom_kill[1m]) > 0"
    for: 0m
    labels:
      severity: warning
    annotations:
      # NOTE(review): $value is the increase of the OOM-kill counter, not a
      # process id; the "PID" prefix looks misleading. Wording left as is.
      summary: "PID {{ '$value' | interp }}"

  - alert: HostEdacCorrectableErrorsDetected
    expr: "increase(node_edac_correctable_errors_total[1m]) > 0"
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "{{ '$value' | interp }} erreurs corrigées"

  - alert: HostEdacUncorrectableErrorsDetected
    expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0"
    for: 0m
    labels:
      severity: warning
    annotations:
      # Fixed: the text was copied from the correctable-errors rule and
      # reported these errors as corrected.
      summary: "{{ '$value' | interp }} erreurs non corrigées"

  # Less than 10% free space, restricted to filesystems that are not
  # read-only.
  - alert: OutOfDiskSpace
    expr: >-
      ( node_filesystem_free_bytes
      / node_filesystem_size_bytes * 100 < 10 )
      and on (instance, device, mountpoint)
      node_filesystem_readonly == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: >-
        {{ '$labels.mountpoint' | interp }} :
        {{ '$value' | interp_float }}% libre

  - alert: HostConntrackLimit
    expr: >-
      ( node_nf_conntrack_entries
      / node_nf_conntrack_entries_limit ) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ '$value' | interp_float }}% complet"

  # Offset beyond +/-0.05 s whose derivative does not point back to zero.
  - alert: HostClockSkew
    expr: >-
      (node_timex_offset_seconds > 0.05
      and deriv(node_timex_offset_seconds[5m]) >= 0)
      or (node_timex_offset_seconds < -0.05
      and deriv(node_timex_offset_seconds[5m]) <= 0)
    for: 2m
    labels:
      severity: warning

  - alert: HostClockNotSynchronising
    expr: >-
      min_over_time(node_timex_sync_status[1m]) == 0
      and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning

  - alert: HostRequiresReboot
    expr: "node_reboot_required > 0"
    for: 5m
    labels:
      severity: warning

  - alert: OutOfInodes
    expr: >-
      node_filesystem_files_free
      / node_filesystem_files * 100 < 10
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: >-
        {{ '$labels.mountpoint' | interp }} :
        {{ '$value' | interp_float }}% libre

  # Non-idle CPU percentage, averaged per instance, above 75%.
  - alert: CpuUsage
    expr: >-
      ( 100 - avg by (instance)
      ( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
      * 100 ) > 75
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "{{ '$value' | interp_float }}%"

  - alert: SystemdServiceFailed
    expr: 'node_systemd_unit_state{state="failed"} == 1'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "{{ '$labels.name' | interp }}"

  - alert: LoadUsage
    expr: "node_load1 > 5"
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "{{ '$value' | interp_float }}"

  - alert: UnhealthyDisk
    expr: "smartmon_device_smart_healthy < 1"
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "{{ '$labels.disk' | interp }}"

  # Steal time percentage, averaged per instance, above 10%.
  - alert: HostCpuStealNoisyNeighbor
    expr: >-
      avg by(instance)
      (rate(node_cpu_seconds_total{mode="steal"}[5m]))
      * 100 > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      # Fixed: the summary referenced $labels.disk (copied from
      # UnhealthyDisk), but `avg by(instance)` keeps only the instance
      # label. Report the percentage instead, as CpuUsage does.
      summary: "{{ '$value' | interp_float }}%"
|
|
|
|
|
|
|
|
# Alert rules for keepalived (VRRP) metrics.
prometheus__alert_rules_keepalived:
  - alert: KeepalivedVrrpFault
    expr: 'keepalived_vrrp_state{state="fault"} > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "{{ '$labels.instance' | interp }}"

  # More than one change of the master-state series within one minute.
  - alert: KeepalivedMasterChange
    # Fixed: the matcher used the label `keepalived_vvrp_state` (typo
    # "vvrp"), which is inconsistent with the `state` label used by
    # KeepalivedVrrpFault.
    # NOTE(review): confirm the label name against the exporter's output.
    expr: 'changes(keepalived_vrrp_state{state="master"}[1m]) > 1'
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "{{ '$labels.instance' | interp }}"
|
|
|
|
|
|
|
|
# Alert rules for BIRD routing-daemon metrics.
prometheus__alert_rules_bird:
  # Fires immediately when a protocol reports bird_protocol_up == 0; the
  # summary carries the protocol name and state labels.
  - alert: BirdProtocolDown
    expr: "bird_protocol_up == 0"
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: >-
        {{ '$labels.name' | interp }} :
        {{ '$labels.state' | interp }}
|
|
|
|
|
2023-11-01 07:11:30 +01:00
|
|
|
# Alert rules for the "quanta" scrape job.
prometheus__alert_rules_quanta:
  # Fires immediately when snAgGblQueueOverflow equals 1.
  - alert: QuantaQueueOverflow
    expr: "snAgGblQueueOverflow == 1"
    for: 0m
    labels:
      severity: critical
|
|
|
|
|
2023-11-01 05:57:21 +01:00
|
|
|
# Rule groups handed to the role, keyed by group name. Each value points
# at one of the prometheus__alert_rules_* lists.
prometheus__alert_rules:
  prometheus: "{{ prometheus__alert_rules_prometheus }}"
  node: "{{ prometheus__alert_rules_node }}"
  keepalived: "{{ prometheus__alert_rules_keepalived }}"
  quanta: "{{ prometheus__alert_rules_quanta }}"
  # The bird group is commented out, so it is not part of this mapping.
  # bird: "{{ prometheus__alert_rules_bird }}"
|
|
|
|
...
|