misc: various monitoring changes

This commit is contained in:
jeltz 2023-11-02 00:25:35 +01:00
parent 9e483d5285
commit fc7f59b231
Signed by: jeltz
GPG key ID: 800882B66C0C3326
4 changed files with 87 additions and 48 deletions

View file

@ -5,19 +5,9 @@ class FilterModule:
def filters(self):
return {
"prometheus__convert_jobs": convert_jobs,
"interp": interp,
"interp_float": interp_float,
}
def interp(string):
return AnsibleUnicode(f"{{{{ {string} }}}}")
def interp_float(string):
return AnsibleUnicode(f'{{{{ printf "%.0f" {string} }}}}')
def convert_jobs(config):
for name, job in config.items():

View file

@ -98,6 +98,10 @@ firewall__zones:
- 10.206.1.5
- 2a09:6840:206::1:6
- 10.206.1.6
docker-ovh.adm:
addrs:
- 2a09:6840:128::150
- 10.128.0.150
firewall__input:
- src: back
@ -134,6 +138,13 @@ firewall__forward:
- src: monit
dst: sw
verdict: accept
# Alertmanager: let the monit zone reach docker-ovh.adm on TCP port 9093
- src: monit
dst: docker-ovh.adm
protocols:
tcp:
dport: 9093
verdict: accept
- src: adm-legacy
dst: bmc
verdict: accept
@ -204,4 +215,9 @@ firewall__nat:
protocols: null
snat:
addr: 45.66.111.200/32
#- src: monit
# dst: adm-legacy
# protocols: null
# snat:
# addr: 10.203.1.3/32
...

View file

@ -47,7 +47,7 @@ prometheus__alert_rules_common:
labels:
severity: critical
annotations:
summary: "Collecteur {{ '$labels.job' | interp }}"
Job: !unsafe "{{ $labels.job }}"
prometheus__alert_rules_node:
- alert: OutOfMemory
@ -59,7 +59,7 @@ prometheus__alert_rules_node:
labels:
severity: warning
annotations:
summary: "Mémoire libre à {{ '$value' | interp_float }}%"
FreeMemory: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostSwapIsFillingUp
expr: "( 1 - ( node_memory_SwapFree_bytes
/ node_memory_SwapTotal_bytes ) )
@ -68,59 +68,59 @@ prometheus__alert_rules_node:
labels:
severity: critical
annotations:
summary: "Swap {{ '$value' | interp_float }}%"
UsedSwap: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostPhysicalComponentTooHot
expr: "node_hwmon_temp_celsius > 79"
for: 3m
labels:
severity: critical
annotations:
summary: "{{ '$value' | interp_float }}°C :
{{ '$labels.chip' | interp }},
{{ '$labels.sensor' | interp }}"
Temperature: !unsafe '{{ printf "%.0f" $value }} °C'
Chip: !unsafe "{{ $labels.chip }}"
Sensor: !unsafe "{{ $labels.sensor }}"
- alert: HostNodeOvertemperatureAlarm
expr: "node_hwmon_temp_crit_alarm_celsius == 1"
for: 0m
labels:
severity: critical
annotations:
summary: "{{ '$labels.chip' | interp }},
{{ '$labels.sensor' | interp }}"
Chip: !unsafe "{{ $labels.chip }}"
Sensor: !unsafe "{{ $labels.sensor }}"
- alert: HostRaidArrayGotInactive
expr: 'node_md_state{state="inactive"} > 0'
for: 0m
labels:
severity: critical
annotations:
summary: "{{ '$labels.device' | interp }}"
Device: !unsafe "{{ $labels.device }}"
- alert: HostRaidDiskFailure
expr: 'node_md_disks{state="failed"} > 0'
for: 0m
labels:
severity: critical
annotations:
severity: "{{ '$labels.md_device' | interp }}"
# The annotation carries the md_device label, so name it after its content
# instead of reusing "severity", which is already the routing label above.
Device: !unsafe "{{ $labels.md_device }}"
- alert: HostOomKillDetected
expr: "increase(node_vmstat_oom_kill[1m]) > 0"
for: 0m
labels:
severity: warning
annotations:
summary: "PID {{ '$value' | interp }}"
# $value is the result of the increase() expression above (growth of
# node_vmstat_oom_kill over 1m), not a process ID, so do not label it "PID".
OomKills: !unsafe "{{ $value }}"
- alert: HostEdacCorrectableErrorsDetected
expr: "increase(node_edac_correctable_errors_total[1m]) > 0"
for: 0m
labels:
severity: warning
annotations:
summary: "{{ '$value' | interp }} erreurs corrigées"
CorrectedErrors: !unsafe "{{ $value }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0"
for: 0m
labels:
severity: warning
annotations:
summary: "{{ '$value' | interp }} erreurs corrigées"
DetectedErrors: !unsafe "{{ $value }}"
- alert: OutOfDiskSpace
expr: "( node_filesystem_free_bytes
/ node_filesystem_size_bytes * 100 < 10 )
@ -130,8 +130,8 @@ prometheus__alert_rules_node:
labels:
severity: critical
annotations:
summary: "{{ '$labels.mountpoint' | interp }} :
{{ '$value' | interp_float }}% libre"
Mountpoint: !unsafe '{{ $labels.mountpoint }}'
FreeSpace: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostConntrackLimit
expr: "( node_nf_conntrack_entries
/ node_nf_conntrack_entries_limit ) * 100 > 80"
@ -139,7 +139,7 @@ prometheus__alert_rules_node:
labels:
severity: warning
annotations:
summary: "{{ '$value' | interp_float }}% complet"
Filled: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostClockSkew
expr: "(node_timex_offset_seconds > 0.05
and deriv(node_timex_offset_seconds[5m]) >= 0)
@ -166,8 +166,8 @@ prometheus__alert_rules_node:
labels:
severity: warning
annotations:
summary: "{{ '$labels.mountpoint' | interp }} :
{{ '$value' | interp_float }}% libre"
Mountpoint: !unsafe "{{ $labels.mountpoint }}"
FreeInodes: !unsafe '{{ printf "%.0f" $value }} %'
- alert: CpuUsage
expr: '( 100 - avg by (instance)
( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
@ -176,28 +176,28 @@ prometheus__alert_rules_node:
labels:
severity: warning
annotations:
summary: "{{ '$value' | interp_float }}%"
Usage: !unsafe '{{ printf "%.0f" $value }} %'
- alert: SystemdServiceFailed
expr: 'node_systemd_unit_state{state="failed"} == 1'
for: 10m
labels:
severity: warning
annotations:
summary: "{{ '$labels.name' | interp }}"
Service: !unsafe "{{ $labels.name }}"
- alert: LoadUsage
expr: "node_load1 > 5"
for: 2m
labels:
severity: warning
annotations:
summary: "{{ '$value' | interp_float }}"
Load1: !unsafe '{{ printf "%.0f" $value }}'
- alert: UnhealthyDisk
expr: "smartmon_device_smart_healthy < 1"
for: 10m
labels:
severity: critical
annotations:
summary: "{{ '$labels.disk' | interp }}"
Disk: !unsafe "{{ $labels.disk }}"
- alert: HostCpuStealNoisyNeighbor
expr: 'avg by(instance)
(rate(node_cpu_seconds_total{mode="steal"}[5m]))
@ -206,7 +206,7 @@ prometheus__alert_rules_node:
labels:
severity: warning
annotations:
summary: "{{ '$labels.disk' | interp }}"
# The rule expression aggregates with "avg by(instance)", so "instance" is
# the only label left on the result; $labels.disk would render as empty.
Instance: !unsafe "{{ $labels.instance }}"
prometheus__alert_rules_keepalived:
- alert: KeepalivedVrrpFault
@ -215,7 +215,7 @@ prometheus__alert_rules_keepalived:
labels:
severity: critical
annotations:
summary: "{{ '$labels.instance' | interp }}"
Instance: !unsafe "{{ $labels.instance }}"
- alert: KeepalivedMasterChange
expr: 'changes(
keepalived_vrrp_state
@ -224,7 +224,7 @@ prometheus__alert_rules_keepalived:
labels:
severity: warning
annotations:
summary: "{{ '$labels.instance' | interp }}"
Instance: !unsafe "{{ $labels.instance }}"
prometheus__alert_rules_bird:
- alert: BirdProtocolDown
@ -233,8 +233,8 @@ prometheus__alert_rules_bird:
labels:
severity: critical
annotations:
summary: "{{ '$labels.name' | interp }} :
{{ '$labels.state' | interp }}"
Protocol: !unsafe "{{ $labels.name }}"
State: !unsafe "{{ $labels.state }}"
prometheus__alert_rules_quanta:
- alert: QuantaQueueOverflow
@ -248,60 +248,73 @@ prometheus__alert_rules_quanta:
labels:
severity: warning
annotations:
summary: "Utilisation forte du processus ({{ '$value' | interp }}%)"
Usage: !unsafe "{{ $value }} %"
- alert: QuantaCpuUsage
expr: 'snAgGblCpuUtil1MinAvg > 80'
for: 5m
labels:
severity: critical
annotations:
summary: "Utilisation intense du processus ({{ '$value' | interp }}%)"
Usage: !unsafe "{{ $value }} %"
- alert: QuantaMemoryUsage
expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50'
for: 5m
labels:
severity: warning
annotations:
summary: "Utilisation forte de la mémoire ({{ '$value' | interp }}%)"
UsedMemory: !unsafe "{{ $value }} %"
- alert: QuantaMemoryUsage
expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80'
for: 5m
labels:
# Use the same severity vocabulary as every other rule (warning/critical);
# this is the high-threshold variant, matching QuantaCpuUsage > 80.
severity: critical
annotations:
summary: "Utilisation intense de la mémoire ({{ '$value' | interp }}%)"
UsedMemory: !unsafe "{{ $value }} %"
- alert: QuantaFanHealth
expr: 'snChasFanOperStatus{snChasFanOperStatus="normal"} == 0'
for: 0m
labels:
severity: critical
annotations:
summary: "Le ventilateur {{ '$labels.snChasFanDescription' | interp }} est
en mode {{ '$labels.snChasFanOperStatus' | interp }}"
# Label name is snChasFanDescription ("sn" prefix, as in the previous
# summary text); a misspelled label name renders as an empty string.
Description: !unsafe "{{ $labels.snChasFanDescription }}"
Status: !unsafe "{{ $labels.snChasFanOperStatus }}"
- alert: QuantaTemp
expr: '(snAgentTempValue / 2) > 45'
for: 0m
labels:
severity: warning
annotations:
summary: "La température de {{ '$labels.snAgentTempSensorDescr' | interp }}
est élevée ({{ '$value' | interp }}°C)"
Temperature: !unsafe "{{ $value }} °C"
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
- alert: QuantaTemp
expr: '(snAgentTempValue / 2) > 60'
for: 0m
labels:
severity: critical
annotations:
summary: "La température de {{ '$labels.snAgentTempSensorDescr' | interp }}
est très élevée ({{ '$value' | interp }}°C)"
Temperature: !unsafe "{{ $value }} °C"
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
- alert: QuantaPowerRedundancyFailure
expr: 'count by (instance) (snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"}) < 2'
expr: 'count by (instance)
(snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"})
< 2'
for: 0m
labels:
severity: warning
# Alert rules exposed under the "switch" key of prometheus__alert_rules.
prometheus__alert_rules_switch:
# Fires immediately (for: 0m) when ifPromiscuousMode changed at least once
# within the last 5 minutes.
- alert: SwitchPromiscuousChange
expr: "changes(ifPromiscuousMode[5m]) > 0"
for: 0m
labels:
severity: warning
annotations:
# Renders ifName, followed by "- <ifAlias>" only when the ifAlias label is
# non-empty. !unsafe keeps Ansible from templating the braces itself.
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
prometheus__alert_rules:
common: "{{ prometheus__alert_rules_common }}"
switch: "{{ prometheus__alert_rules_switch }}"
prometheus: "{{ prometheus__alert_rules_prometheus }}"
node: "{{ prometheus__alert_rules_node }}"
keepalived: "{{ prometheus__alert_rules_keepalived }}"

View file

@ -21,6 +21,11 @@ prometheus_snmp__modules:
- snAgentCpu
- snSwInfo
- snSwIfInfoTable
- dot3StatsTable
- dot3HCStatsTable
- dot3Errors
- dot3Tests
- dot3CollTable
lookups:
- source_indexes:
- ifIndex
@ -44,6 +49,15 @@ prometheus_snmp__modules:
- source_indexes:
- snSwIfInfoPortNum
lookup: snSwIfDescr
- source_indexes:
- dot3StatsIndex
lookup: ifAlias
- source_indexes:
- dot3StatsIndex
lookup: ifDescr
- source_indexes:
- dot3StatsIndex
lookup: ifName
overrides:
ifIndex:
ignore: true
@ -79,4 +93,10 @@ prometheus_snmp__modules:
ignore: true
snSwIfInfoMediaType:
type: EnumAsInfo
dot3StatsIndex:
ignore: true
dot3StatsEtherChipSet:
ignore: true
dot3StatsDuplexStatus:
type: EnumAsStateSet
...