misc: various monitoring changes

This commit is contained in:
jeltz 2023-11-02 00:25:35 +01:00
parent 9e483d5285
commit fc7f59b231
Signed by: jeltz
GPG key ID: 800882B66C0C3326
4 changed files with 87 additions and 48 deletions

View file

@ -5,19 +5,9 @@ class FilterModule:
def filters(self): def filters(self):
return { return {
"prometheus__convert_jobs": convert_jobs, "prometheus__convert_jobs": convert_jobs,
"interp": interp,
"interp_float": interp_float,
} }
def interp(string):
return AnsibleUnicode(f"{{{{ {string} }}}}")
def interp_float(string):
return AnsibleUnicode(f'{{{{ printf "%.0f" {string} }}}}')
def convert_jobs(config): def convert_jobs(config):
for name, job in config.items(): for name, job in config.items():

View file

@ -98,6 +98,10 @@ firewall__zones:
- 10.206.1.5 - 10.206.1.5
- 2a09:6840:206::1:6 - 2a09:6840:206::1:6
- 10.206.1.6 - 10.206.1.6
docker-ovh.adm:
addrs:
- 2a09:6840:128::150
- 10.128.0.150
firewall__input: firewall__input:
- src: back - src: back
@ -134,6 +138,13 @@ firewall__forward:
- src: monit - src: monit
dst: sw dst: sw
verdict: accept verdict: accept
# Alertmanager
- src: monit
dst: docker-ovh.adm
protocols:
tcp:
dport: 9093
verdict: accept
- src: adm-legacy - src: adm-legacy
dst: bmc dst: bmc
verdict: accept verdict: accept
@ -204,4 +215,9 @@ firewall__nat:
protocols: null protocols: null
snat: snat:
addr: 45.66.111.200/32 addr: 45.66.111.200/32
#- src: monit
# dst: adm-legacy
# protocols: null
# snat:
# addr: 10.203.1.3/32
... ...

View file

@ -47,7 +47,7 @@ prometheus__alert_rules_common:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Collecteur {{ '$labels.job' | interp }}" Job: !unsafe "{{ $labels.job }}"
prometheus__alert_rules_node: prometheus__alert_rules_node:
- alert: OutOfMemory - alert: OutOfMemory
@ -59,7 +59,7 @@ prometheus__alert_rules_node:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Mémoire libre à {{ '$value' | interp_float }}%" FreeMemory: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostSwapIsFillingUp - alert: HostSwapIsFillingUp
expr: "( 1 - ( node_memory_SwapFree_bytes expr: "( 1 - ( node_memory_SwapFree_bytes
/ node_memory_SwapTotal_bytes ) ) / node_memory_SwapTotal_bytes ) )
@ -68,59 +68,59 @@ prometheus__alert_rules_node:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Swap {{ '$value' | interp_float }}%" UsedSwap: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostPhysicalComponentTooHot - alert: HostPhysicalComponentTooHot
expr: "node_hwmon_temp_celsius > 79" expr: "node_hwmon_temp_celsius > 79"
for: 3m for: 3m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "{{ '$value' | interp_float }}°C : Temperature: !unsafe '{{ printf "%.0f" $value }} °C'
{{ '$labels.chip' | interp }}, Chip: !unsafe "{{ $labels.chip }}"
{{ '$labels.sensor' | interp }}" Sensor: !unsafe "{{ $labels.sensor }}"
- alert: HostNodeOvertemperatureAlarm - alert: HostNodeOvertemperatureAlarm
expr: "node_hwmon_temp_crit_alarm_celsius == 1" expr: "node_hwmon_temp_crit_alarm_celsius == 1"
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "{{ '$labels.chip' | interp }}, Chip: !unsafe "{{ $labels.chip }}"
{{ '$labels.sensor' | interp }}" Sensor: !unsafe "{{ $labels.sensor }}"
- alert: HostRaidArrayGotInactive - alert: HostRaidArrayGotInactive
expr: 'node_md_state{state="inactive"} > 0' expr: 'node_md_state{state="inactive"} > 0'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "{{ '$labels.device' | interp }}" Device: !unsafe "{{ $labels.device }}"
- alert: HostRaidDiskFailure - alert: HostRaidDiskFailure
expr: 'node_md_disks{state="failed"} > 0' expr: 'node_md_disks{state="failed"} > 0'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
severity: "{{ '$labels.md_device' | interp }}" severity: !unsafe "{{ $labels.md_device }}"
- alert: HostOomKillDetected - alert: HostOomKillDetected
expr: "increase(node_vmstat_oom_kill[1m]) > 0" expr: "increase(node_vmstat_oom_kill[1m]) > 0"
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "PID {{ '$value' | interp }}" PID: !unsafe "{{ $value }}"
- alert: HostEdacCorrectableErrorsDetected - alert: HostEdacCorrectableErrorsDetected
expr: "increase(node_edac_correctable_errors_total[1m]) > 0" expr: "increase(node_edac_correctable_errors_total[1m]) > 0"
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ '$value' | interp }} erreurs corrigées" CorrectedErrors: !unsafe "{{ $value }}"
- alert: HostEdacUncorrectableErrorsDetected - alert: HostEdacUncorrectableErrorsDetected
expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0" expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0"
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ '$value' | interp }} erreurs corrigées" DetectedErrors: !unsafe "{{ $value }}"
- alert: OutOfDiskSpace - alert: OutOfDiskSpace
expr: "( node_filesystem_free_bytes expr: "( node_filesystem_free_bytes
/ node_filesystem_size_bytes * 100 < 10 ) / node_filesystem_size_bytes * 100 < 10 )
@ -130,8 +130,8 @@ prometheus__alert_rules_node:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "{{ '$labels.mountpoint' | interp }} : Mountpoint: !unsafe '{{ $labels.mountpoint }}'
{{ '$value' | interp_float }}% libre" FreeSpace: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostConntrackLimit - alert: HostConntrackLimit
expr: "( node_nf_conntrack_entries expr: "( node_nf_conntrack_entries
/ node_nf_conntrack_entries_limit ) * 100 > 80" / node_nf_conntrack_entries_limit ) * 100 > 80"
@ -139,7 +139,7 @@ prometheus__alert_rules_node:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ '$value' | interp_float }}% complet" Filled: !unsafe '{{ printf "%.0f" $value }} %'
- alert: HostClockSkew - alert: HostClockSkew
expr: "(node_timex_offset_seconds > 0.05 expr: "(node_timex_offset_seconds > 0.05
and deriv(node_timex_offset_seconds[5m]) >= 0) and deriv(node_timex_offset_seconds[5m]) >= 0)
@ -166,8 +166,8 @@ prometheus__alert_rules_node:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ '$labels.mountpoint' | interp }} : Mountpoint: !unsafe "{{ $labels.mountpoint }}"
{{ '$value' | interp_float }}% libre" FreeInodes: !unsafe '{{ printf "%.0f" $value }} %'
- alert: CpuUsage - alert: CpuUsage
expr: '( 100 - avg by (instance) expr: '( 100 - avg by (instance)
( irate(node_cpu_seconds_total{mode="idle"}[5m]) ) ( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
@ -176,28 +176,28 @@ prometheus__alert_rules_node:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ '$value' | interp_float }}%" Usage: !unsafe '{{ printf "%.0f" $value }} %'
- alert: SystemdServiceFailed - alert: SystemdServiceFailed
expr: 'node_systemd_unit_state{state="failed"} == 1' expr: 'node_systemd_unit_state{state="failed"} == 1'
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ '$labels.name' | interp }}" Service: !unsafe "{{ $labels.name }}"
- alert: LoadUsage - alert: LoadUsage
expr: "node_load1 > 5" expr: "node_load1 > 5"
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ '$value' | interp_float }}" Load1: !unsafe '{{ printf "%.0f" $value }}'
- alert: UnhealthyDisk - alert: UnhealthyDisk
expr: "smartmon_device_smart_healthy < 1" expr: "smartmon_device_smart_healthy < 1"
for: 10m for: 10m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "{{ '$labels.disk' | interp }}" Disk: !unsafe "{{ $labels.disk }}"
- alert: HostCpuStealNoisyNeighbor - alert: HostCpuStealNoisyNeighbor
expr: 'avg by(instance) expr: 'avg by(instance)
(rate(node_cpu_seconds_total{mode="steal"}[5m])) (rate(node_cpu_seconds_total{mode="steal"}[5m]))
@ -206,7 +206,7 @@ prometheus__alert_rules_node:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ '$labels.disk' | interp }}" Disk: !unsafe "{{ $labels.disk }}"
prometheus__alert_rules_keepalived: prometheus__alert_rules_keepalived:
- alert: KeepalivedVrrpFault - alert: KeepalivedVrrpFault
@ -215,7 +215,7 @@ prometheus__alert_rules_keepalived:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "{{ '$labels.instance' | interp }}" Instance: !unsafe "{{ $labels.instance }}"
- alert: KeepalivedMasterChange - alert: KeepalivedMasterChange
expr: 'changes( expr: 'changes(
keepalived_vrrp_state keepalived_vrrp_state
@ -224,7 +224,7 @@ prometheus__alert_rules_keepalived:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "{{ '$labels.instance' | interp }}" Instance: !unsafe "{{ $labels.instance }}"
prometheus__alert_rules_bird: prometheus__alert_rules_bird:
- alert: BirdProtocolDown - alert: BirdProtocolDown
@ -233,8 +233,8 @@ prometheus__alert_rules_bird:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "{{ '$labels.name' | interp }} : Protocol: !unsafe "{{ $labels.name }}"
{{ '$labels.state' | interp }}" State: !unsafe "{{ $labels.state }}"
prometheus__alert_rules_quanta: prometheus__alert_rules_quanta:
- alert: QuantaQueueOverflow - alert: QuantaQueueOverflow
@ -248,60 +248,73 @@ prometheus__alert_rules_quanta:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Utilisation forte du processus ({{ '$value' | interp }}%)" Usage: !unsafe "{{ $value }} %"
- alert: QuantaCpuUsage - alert: QuantaCpuUsage
expr: 'snAgGblCpuUtil1MinAvg > 80' expr: 'snAgGblCpuUtil1MinAvg > 80'
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Utilisation intense du processus ({{ '$value' | interp }}%)" Usage: !unsafe "{{ $value }} %"
- alert: QuantaMemoryUsage - alert: QuantaMemoryUsage
expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50' expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50'
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Utilisation forte de la mémoire ({{ '$value' | interp }}%)" UsedMemory: !unsafe "{{ $value }} %"
- alert: QuantaMemoryUsage - alert: QuantaMemoryUsage
expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80' expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80'
for: 5m for: 5m
labels: labels:
severity: alert severity: alert
annotations: annotations:
summary: "Utilisation intense de la mémoire ({{ '$value' | interp }}%)" UsedMemory: !unsafe "{{ $value }} %"
- alert: QuantaFanHealth - alert: QuantaFanHealth
expr: 'snChasFanOperStatus{snChasFanOperStatus="normal"} == 0' expr: 'snChasFanOperStatus{snChasFanOperStatus="normal"} == 0'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Le ventilateur {{ '$labels.snChasFanDescription' | interp }} est Description: !unsafe "{{ $labels.snChasFanDescription }}"
en mode {{ '$labels.snChasFanOperStatus' | interp }}" Status: !unsafe "{{ $labels.snChasFanOperStatus }}"
- alert: QuantaTemp - alert: QuantaTemp
expr: '(snAgentTempValue / 2) > 45' expr: '(snAgentTempValue / 2) > 45'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "La température de {{ '$labels.snAgentTempSensorDescr' | interp }} Temperature: !unsafe "{{ $value }} °C"
est élevée ({{ '$value' | interp }}°C)" Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
- alert: QuantaTemp - alert: QuantaTemp
expr: '(snAgentTempValue / 2) > 60' expr: '(snAgentTempValue / 2) > 60'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "La température de {{ '$labels.snAgentTempSensorDescr' | interp }} Temperature: !unsafe "{{ $value }} °C"
est très élevée ({{ '$value' | interp }}°C)" Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
- alert: QuantaPowerRedundancyFailure - alert: QuantaPowerRedundancyFailure
expr: 'count by (instance) (snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"}) < 2' expr: 'count by (instance)
(snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"})
< 2'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
prometheus__alert_rules_switch:
- alert: SwitchPromiscuousChange
expr: "changes(ifPromiscuousMode[5m]) > 0"
for: 0m
labels:
severity: warning
annotations:
Interface: !unsafe "{{ $labels.ifName }}
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
prometheus__alert_rules: prometheus__alert_rules:
common: "{{ prometheus__alert_rules_common }}" common: "{{ prometheus__alert_rules_common }}"
switch: "{{ prometheus__alert_rules_switch }}"
prometheus: "{{ prometheus__alert_rules_prometheus }}" prometheus: "{{ prometheus__alert_rules_prometheus }}"
node: "{{ prometheus__alert_rules_node }}" node: "{{ prometheus__alert_rules_node }}"
keepalived: "{{ prometheus__alert_rules_keepalived }}" keepalived: "{{ prometheus__alert_rules_keepalived }}"

View file

@ -21,6 +21,11 @@ prometheus_snmp__modules:
- snAgentCpu - snAgentCpu
- snSwInfo - snSwInfo
- snSwIfInfoTable - snSwIfInfoTable
- dot3StatsTable
- dot3HCStatsTable
- dot3Errors
- dot3Tests
- dot3CollTable
lookups: lookups:
- source_indexes: - source_indexes:
- ifIndex - ifIndex
@ -44,6 +49,15 @@ prometheus_snmp__modules:
- source_indexes: - source_indexes:
- snSwIfInfoPortNum - snSwIfInfoPortNum
lookup: snSwIfDescr lookup: snSwIfDescr
- source_indexes:
- dot3StatsIndex
lookup: ifAlias
- source_indexes:
- dot3StatsIndex
lookup: ifDescr
- source_indexes:
- dot3StatsIndex
lookup: ifName
overrides: overrides:
ifIndex: ifIndex:
ignore: true ignore: true
@ -79,4 +93,10 @@ prometheus_snmp__modules:
ignore: true ignore: true
snSwIfInfoMediaType: snSwIfInfoMediaType:
type: EnumAsInfo type: EnumAsInfo
dot3StatsIndex:
ignore: true
dot3StatsEtherChipSet:
ignore: true
dot3StatsDuplexStatus:
type: EnumAsStateSet
... ...