# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# ansible/group_vars/prom/prometheus.yml
#
# 324 lines
# 8.9 KiB
# YAML

---
# Alertmanager target list, one host:port entry per item.
prometheus__alertmanager_targets:
  - 'docker-ovh.adm.auro.re:9093'

# TSDB retention time.
prometheus__tsdb_retention_time: '90d'
# Scrape job definitions: node, prometheus, kresd, bird, quanta and snmp.
# Every job names its `targets` through an inventory-group expression; most
# jobs then give an `address` / `port` pair, while quanta gives a fixed
# `address` together with a `path` and request `params`.
# NOTE(review): the indentation of this section is missing in this view, so it
# cannot be told from here whether `port` is a child of `address` or a sibling
# of it. Confirm against the consuming role template before re-indenting.
prometheus__scraping:
# Targets: union of the vm_network and pve_network groups, port 9100.
node:
targets: "{{ groups.vm_network + groups.pve_network }}"
address:
port: 9100
# Targets: the prom group, port 9090.
prometheus:
targets: "{{ groups.prom }}"
address:
port: 9090
# Targets: the dns group, port 8453.
kresd:
targets: "{{ groups.dns }}"
address:
port: 8453
# Targets: the router group, port 9324.
bird:
targets: "{{ groups.router }}"
address:
port: 9324
# Targets: the quanta group, queried at 127.0.0.1:9116 on /snmp with
# module=quanta (presumably a local SNMP exporter - TODO confirm).
quanta:
targets: "{{ groups.quanta }}"
address: 127.0.0.1:9116
path: /snmp
params:
module:
- quanta
# Targets: the prom group, port 9116 (same port as the quanta job's address).
snmp:
targets: "{{ groups.prom }}"
address:
port: 9116
# Alert rules evaluated on Prometheus' own TSDB metrics.
prometheus__alert_rules_prometheus:
  # Fires without delay once the failed-compaction counter has increased
  # within the last minute.
  - alert: PrometheusTsdbCompactionFailed
    expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
    for: 0m
    labels:
      severity: critical
# Alert rules that are not tied to one particular exporter.
prometheus__alert_rules_common:
  # A series has reported `up == 0` for three minutes.
  # `!unsafe` keeps Ansible from templating the Prometheus `{{ ... }}` markup.
  - alert: CollectorDown
    expr: "up == 0"
    for: 3m
    labels:
      severity: critical
    annotations:
      Job: !unsafe '{{ $labels.job }}'
# Alert rules built on node_* (and smartmon_*) host metrics.
# Annotation values are tagged `!unsafe` so that Ansible leaves the Prometheus
# `{{ ... }}` template markup untouched. Multi-line expressions use folded
# scalars (`>-`), which collapse to a single space-separated line.
prometheus__alert_rules_node:
  # Free + cached + buffered memory below 10 % of total for 5 minutes.
  - alert: OutOfMemory
    expr: >-
      ( node_memory_MemFree_bytes
      + node_memory_Cached_bytes
      + node_memory_Buffers_bytes )
      / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      FreeMemory: !unsafe '{{ printf "%.0f" $value }} %'

  # At least half of the swap space is in use.
  - alert: HostSwapIsFillingUp
    expr: >-
      ( 1 - ( node_memory_SwapFree_bytes
      / node_memory_SwapTotal_bytes ) )
      * 100 >= 50
    for: 3m
    labels:
      severity: critical
    annotations:
      UsedSwap: !unsafe '{{ printf "%.0f" $value }} %'

  # A hwmon sensor reads above 79 degrees Celsius.
  - alert: HostPhysicalComponentTooHot
    expr: 'node_hwmon_temp_celsius > 79'
    for: 3m
    labels:
      severity: critical
    annotations:
      Temperature: !unsafe '{{ printf "%.0f" $value }} °C'
      Chip: !unsafe '{{ $labels.chip }}'
      Sensor: !unsafe '{{ $labels.sensor }}'

  # A hwmon critical-temperature alarm flag is set.
  - alert: HostNodeOvertemperatureAlarm
    expr: 'node_hwmon_temp_crit_alarm_celsius == 1'
    for: 0m
    labels:
      severity: critical
    annotations:
      Chip: !unsafe '{{ $labels.chip }}'
      Sensor: !unsafe '{{ $labels.sensor }}'

  # An md array is reported in the "inactive" state.
  - alert: HostRaidArrayGotInactive
    expr: 'node_md_state{state="inactive"} > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      Device: !unsafe '{{ $labels.device }}'

  # An md array reports at least one failed disk.
  # The annotation used to be named `severity` and read `$labels.md_device`;
  # it is now aligned with HostRaidArrayGotInactive (`Device`, `device`).
  # NOTE(review): node_exporter's mdadm collector is expected to label
  # node_md_disks with `device`; confirm for the deployed exporter version.
  - alert: HostRaidDiskFailure
    expr: 'node_md_disks{state="failed"} > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      Device: !unsafe '{{ $labels.device }}'

  # The OOM-kill counter increased during the last minute.
  # `$value` is the result of increase(), i.e. a number of kills and not a
  # process id, hence the annotation name.
  - alert: HostOomKillDetected
    expr: 'increase(node_vmstat_oom_kill[1m]) > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      OomKills: !unsafe '{{ $value }}'

  # EDAC reported new correctable memory errors during the last minute.
  - alert: HostEdacCorrectableErrorsDetected
    expr: 'increase(node_edac_correctable_errors_total[1m]) > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      CorrectedErrors: !unsafe '{{ $value }}'

  # EDAC reported new uncorrectable memory errors during the last minute.
  - alert: HostEdacUncorrectableErrorsDetected
    expr: 'increase(node_edac_uncorrectable_errors_total[1m]) > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      DetectedErrors: !unsafe '{{ $value }}'

  # Less than 10 % free space on a filesystem that is not read-only.
  - alert: OutOfDiskSpace
    expr: >-
      ( node_filesystem_free_bytes
      / node_filesystem_size_bytes * 100 < 10 )
      and on (instance, device, mountpoint)
      node_filesystem_readonly == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      Mountpoint: !unsafe '{{ $labels.mountpoint }}'
      FreeSpace: !unsafe '{{ printf "%.0f" $value }} %'

  # The conntrack table is more than 80 % full.
  - alert: HostConntrackLimit
    expr: >-
      ( node_nf_conntrack_entries
      / node_nf_conntrack_entries_limit ) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      Filled: !unsafe '{{ printf "%.0f" $value }} %'

  # Clock offset beyond +/- 0.05 s and not moving back towards zero.
  - alert: HostClockSkew
    expr: >-
      (node_timex_offset_seconds > 0.05
      and deriv(node_timex_offset_seconds[5m]) >= 0)
      or (node_timex_offset_seconds < -0.05
      and deriv(node_timex_offset_seconds[5m]) <= 0)
    for: 2m
    labels:
      severity: warning

  # Sync status was 0 during the last minute and max error is >= 16 s.
  - alert: HostClockNotSynchronising
    expr: >-
      min_over_time(node_timex_sync_status[1m]) == 0
      and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning

  # The host reports node_reboot_required > 0.
  - alert: HostRequiresReboot
    expr: 'node_reboot_required > 0'
    for: 5m
    labels:
      severity: warning

  # Less than 10 % of the inodes of a filesystem are free.
  - alert: OutOfInodes
    expr: >-
      node_filesystem_files_free
      / node_filesystem_files * 100 < 10
    for: 3m
    labels:
      severity: warning
    annotations:
      Mountpoint: !unsafe '{{ $labels.mountpoint }}'
      FreeInodes: !unsafe '{{ printf "%.0f" $value }} %'

  # Per-instance non-idle CPU share above 75 % for 10 minutes.
  - alert: CpuUsage
    expr: >-
      ( 100 - avg by (instance)
      ( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
      * 100 ) > 75
    for: 10m
    labels:
      severity: warning
    annotations:
      Usage: !unsafe '{{ printf "%.0f" $value }} %'

  # A systemd unit has been in the "failed" state for 10 minutes.
  - alert: SystemdServiceFailed
    expr: 'node_systemd_unit_state{state="failed"} == 1'
    for: 10m
    labels:
      severity: warning
    annotations:
      Service: !unsafe '{{ $labels.name }}'

  # One-minute load average above 5.
  - alert: LoadUsage
    expr: 'node_load1 > 5'
    for: 2m
    labels:
      severity: warning
    annotations:
      Load1: !unsafe '{{ printf "%.0f" $value }}'

  # SMART reports a device as not healthy.
  - alert: UnhealthyDisk
    expr: 'smartmon_device_smart_healthy < 1'
    for: 10m
    labels:
      severity: critical
    annotations:
      Disk: !unsafe '{{ $labels.disk }}'

  # Per-instance CPU steal share above 10 %.
  # `avg by(instance)` drops every label except `instance`, so the former
  # `Disk: {{ $labels.disk }}` annotation always rendered empty; the measured
  # steal percentage is reported instead.
  - alert: HostCpuStealNoisyNeighbor
    expr: >-
      avg by(instance)
      (rate(node_cpu_seconds_total{mode="steal"}[5m]))
      * 100 > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      Steal: !unsafe '{{ printf "%.0f" $value }} %'
# Alert rules on the keepalived_vrrp_state metric.
prometheus__alert_rules_keepalived:
  # A VRRP instance is reported in the "fault" state.
  - alert: KeepalivedVrrpFault
    expr: 'keepalived_vrrp_state{state="fault"} > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      Instance: !unsafe '{{ $labels.instance }}'

  # The "master" series changed value more than once within one minute.
  # The label matcher previously read `keepalived_vvrp_state` (misspelled
  # "vrrp"), which cannot match a label named after the metric.
  # NOTE(review): KeepalivedVrrpFault above selects on `state` instead;
  # confirm which label the exporter actually exposes and align both rules.
  - alert: KeepalivedMasterChange
    expr: >-
      changes(
      keepalived_vrrp_state
      {keepalived_vrrp_state="master"}[1m]) > 1
    for: 0m
    labels:
      severity: warning
    annotations:
      Instance: !unsafe '{{ $labels.instance }}'
# Alert rules on BIRD protocol state.
# The `bird` entry of prometheus__alert_rules is commented out in this file,
# so this list is defined but not referenced there.
prometheus__alert_rules_bird:
  # A protocol reports bird_protocol_up == 0.
  - alert: BirdProtocolDown
    expr: 'bird_protocol_up == 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      Protocol: !unsafe '{{ $labels.name }}'
      State: !unsafe '{{ $labels.state }}'
# Alert rules on the sn* metrics of the quanta scrape job.
# Several alerts come as a pair under the same name: a lower threshold with
# severity `warning` and a higher one with severity `critical`.
prometheus__alert_rules_quanta:
  # The queue-overflow flag is set.
  - alert: QuantaQueueOverflow
    expr: "snAgGblQueueOverflow == 1"
    for: 0m
    labels:
      severity: critical

  # One-minute average CPU utilisation above 50.
  - alert: QuantaCpuUsage
    expr: "snAgGblCpuUtil1MinAvg > 50"
    for: 5m
    labels:
      severity: warning
    annotations:
      Usage: !unsafe '{{ $value }} %'

  # One-minute average CPU utilisation above 80.
  - alert: QuantaCpuUsage
    expr: "snAgGblCpuUtil1MinAvg > 80"
    for: 5m
    labels:
      severity: critical
    annotations:
      Usage: !unsafe '{{ $value }} %'

  # Dynamic memory usage above 50 %.
  - alert: QuantaMemoryUsage
    expr: "100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50"
    for: 5m
    labels:
      severity: warning
    annotations:
      UsedMemory: !unsafe '{{ $value }} %'

  # Dynamic memory usage above 80 %.
  # Severity was `alert`, a value no other rule in this file uses; the upper
  # tier of every other warning/critical pair here is `critical`.
  - alert: QuantaMemoryUsage
    expr: "100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80"
    for: 5m
    labels:
      severity: critical
    annotations:
      UsedMemory: !unsafe '{{ $value }} %'

  # The "normal" state series of a fan is 0.
  # The description label was spelled `shChasFanDescription`; every other
  # identifier in this block carries the `sn` prefix.
  # NOTE(review): confirm the exact label name exposed by the exporter.
  - alert: QuantaFanHealth
    expr: 'snChasFanOperStatus{snChasFanOperStatus="normal"} == 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      Description: !unsafe '{{ $labels.snChasFanDescription }}'
      Status: !unsafe '{{ $labels.snChasFanOperStatus }}'

  # Half of the raw sensor value above 45.
  - alert: QuantaTemp
    expr: "(snAgentTempValue / 2) > 45"
    for: 0m
    labels:
      severity: warning
    annotations:
      Temperature: !unsafe '{{ $value }} °C'
      Description: !unsafe '{{ $labels.snAgentTempSensorDescr }}'

  # Half of the raw sensor value above 60.
  - alert: QuantaTemp
    expr: "(snAgentTempValue / 2) > 60"
    for: 0m
    labels:
      severity: critical
    annotations:
      Temperature: !unsafe '{{ $value }} °C'
      Description: !unsafe '{{ $labels.snAgentTempSensorDescr }}'

  # Fewer than two "normal" power-supply state series for an instance.
  - alert: QuantaPowerRedundancyFailure
    expr: >-
      count by (instance)
      (snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"})
      < 2
    for: 0m
    labels:
      severity: warning
# Alert rules on switch interface metrics.
prometheus__alert_rules_switch:
  # ifPromiscuousMode changed value within the last five minutes.
  # The annotation prints the interface name, followed by "- <alias>" when the
  # ifAlias label is non-empty.
  - alert: SwitchPromiscuousChange
    expr: 'changes(ifPromiscuousMode[5m]) > 0'
    for: 0m
    labels:
      severity: warning
    annotations:
      Interface: !unsafe '{{ $labels.ifName }} {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}'
# Collects the per-topic rule lists defined in this file under one mapping.
# The bird list is left out: its entry is commented out.
prometheus__alert_rules:
  common: '{{ prometheus__alert_rules_common }}'
  switch: '{{ prometheus__alert_rules_switch }}'
  prometheus: '{{ prometheus__alert_rules_prometheus }}'
  node: '{{ prometheus__alert_rules_node }}'
  keepalived: '{{ prometheus__alert_rules_keepalived }}'
  quanta: '{{ prometheus__alert_rules_quanta }}'
  # bird: "{{ prometheus__alert_rules_bird }}"
...