misc: various monitoring changes
This commit is contained in:
parent
9e483d5285
commit
fc7f59b231
4 changed files with 87 additions and 48 deletions
|
@ -5,19 +5,9 @@ class FilterModule:
|
||||||
def filters(self):
|
def filters(self):
|
||||||
return {
|
return {
|
||||||
"prometheus__convert_jobs": convert_jobs,
|
"prometheus__convert_jobs": convert_jobs,
|
||||||
"interp": interp,
|
|
||||||
"interp_float": interp_float,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def interp(string):
|
|
||||||
return AnsibleUnicode(f"{{{{ {string} }}}}")
|
|
||||||
|
|
||||||
|
|
||||||
def interp_float(string):
|
|
||||||
return AnsibleUnicode(f'{{{{ printf "%.0f" {string} }}}}')
|
|
||||||
|
|
||||||
|
|
||||||
def convert_jobs(config):
|
def convert_jobs(config):
|
||||||
|
|
||||||
for name, job in config.items():
|
for name, job in config.items():
|
||||||
|
|
|
@ -98,6 +98,10 @@ firewall__zones:
|
||||||
- 10.206.1.5
|
- 10.206.1.5
|
||||||
- 2a09:6840:206::1:6
|
- 2a09:6840:206::1:6
|
||||||
- 10.206.1.6
|
- 10.206.1.6
|
||||||
|
docker-ovh.adm:
|
||||||
|
addrs:
|
||||||
|
- 2a09:6840:128::150
|
||||||
|
- 10.128.0.150
|
||||||
|
|
||||||
firewall__input:
|
firewall__input:
|
||||||
- src: back
|
- src: back
|
||||||
|
@ -134,6 +138,13 @@ firewall__forward:
|
||||||
- src: monit
|
- src: monit
|
||||||
dst: sw
|
dst: sw
|
||||||
verdict: accept
|
verdict: accept
|
||||||
|
# Alertmanager
|
||||||
|
- src: monit
|
||||||
|
dst: docker-ovh.adm
|
||||||
|
protocols:
|
||||||
|
tcp:
|
||||||
|
dport: 9093
|
||||||
|
verdict: accept
|
||||||
- src: adm-legacy
|
- src: adm-legacy
|
||||||
dst: bmc
|
dst: bmc
|
||||||
verdict: accept
|
verdict: accept
|
||||||
|
@ -204,4 +215,9 @@ firewall__nat:
|
||||||
protocols: null
|
protocols: null
|
||||||
snat:
|
snat:
|
||||||
addr: 45.66.111.200/32
|
addr: 45.66.111.200/32
|
||||||
|
#- src: monit
|
||||||
|
# dst: adm-legacy
|
||||||
|
# protocols: null
|
||||||
|
# snat:
|
||||||
|
# addr: 10.203.1.3/32
|
||||||
...
|
...
|
||||||
|
|
|
@ -47,7 +47,7 @@ prometheus__alert_rules_common:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Collecteur {{ '$labels.job' | interp }}"
|
Job: !unsafe "{{ $labels.job }}"
|
||||||
|
|
||||||
prometheus__alert_rules_node:
|
prometheus__alert_rules_node:
|
||||||
- alert: OutOfMemory
|
- alert: OutOfMemory
|
||||||
|
@ -59,7 +59,7 @@ prometheus__alert_rules_node:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Mémoire libre à {{ '$value' | interp_float }}%"
|
FreeMemory: !unsafe '{{ printf "%.0f" $value }} %'
|
||||||
- alert: HostSwapIsFillingUp
|
- alert: HostSwapIsFillingUp
|
||||||
expr: "( 1 - ( node_memory_SwapFree_bytes
|
expr: "( 1 - ( node_memory_SwapFree_bytes
|
||||||
/ node_memory_SwapTotal_bytes ) )
|
/ node_memory_SwapTotal_bytes ) )
|
||||||
|
@ -68,59 +68,59 @@ prometheus__alert_rules_node:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Swap {{ '$value' | interp_float }}%"
|
UsedSwap: !unsafe '{{ printf "%.0f" $value }} %'
|
||||||
- alert: HostPhysicalComponentTooHot
|
- alert: HostPhysicalComponentTooHot
|
||||||
expr: "node_hwmon_temp_celsius > 79"
|
expr: "node_hwmon_temp_celsius > 79"
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$value' | interp_float }}°C :
|
Temperature: !unsafe '{{ printf "%.0f" $value }} °C'
|
||||||
{{ '$labels.chip' | interp }},
|
Chip: !unsafe "{{ $labels.chip }}"
|
||||||
{{ '$labels.sensor' | interp }}"
|
Sensor: !unsafe "{{ $labels.sensor }}"
|
||||||
- alert: HostNodeOvertemperatureAlarm
|
- alert: HostNodeOvertemperatureAlarm
|
||||||
expr: "node_hwmon_temp_crit_alarm_celsius == 1"
|
expr: "node_hwmon_temp_crit_alarm_celsius == 1"
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.chip' | interp }},
|
Chip: !unsafe "{{ $labels.chip }}"
|
||||||
{{ '$labels.sensor' | interp }}"
|
Sensor: !unsafe "{{ $labels.sensor }}"
|
||||||
- alert: HostRaidArrayGotInactive
|
- alert: HostRaidArrayGotInactive
|
||||||
expr: 'node_md_state{state="inactive"} > 0'
|
expr: 'node_md_state{state="inactive"} > 0'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.device' | interp }}"
|
Device: !unsafe "{{ $labels.device }}"
|
||||||
- alert: HostRaidDiskFailure
|
- alert: HostRaidDiskFailure
|
||||||
expr: 'node_md_disks{state="failed"} > 0'
|
expr: 'node_md_disks{state="failed"} > 0'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
severity: "{{ '$labels.md_device' | interp }}"
|
Device: !unsafe "{{ $labels.md_device }}"
|
||||||
- alert: HostOomKillDetected
|
- alert: HostOomKillDetected
|
||||||
expr: "increase(node_vmstat_oom_kill[1m]) > 0"
|
expr: "increase(node_vmstat_oom_kill[1m]) > 0"
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "PID {{ '$value' | interp }}"
|
PID: !unsafe "{{ $value }}"
|
||||||
- alert: HostEdacCorrectableErrorsDetected
|
- alert: HostEdacCorrectableErrorsDetected
|
||||||
expr: "increase(node_edac_correctable_errors_total[1m]) > 0"
|
expr: "increase(node_edac_correctable_errors_total[1m]) > 0"
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$value' | interp }} erreurs corrigées"
|
CorrectedErrors: !unsafe "{{ $value }}"
|
||||||
- alert: HostEdacUncorrectableErrorsDetected
|
- alert: HostEdacUncorrectableErrorsDetected
|
||||||
expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0"
|
expr: "increase(node_edac_uncorrectable_errors_total[1m]) > 0"
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$value' | interp }} erreurs corrigées"
|
DetectedErrors: !unsafe "{{ $value }}"
|
||||||
- alert: OutOfDiskSpace
|
- alert: OutOfDiskSpace
|
||||||
expr: "( node_filesystem_free_bytes
|
expr: "( node_filesystem_free_bytes
|
||||||
/ node_filesystem_size_bytes * 100 < 10 )
|
/ node_filesystem_size_bytes * 100 < 10 )
|
||||||
|
@ -130,8 +130,8 @@ prometheus__alert_rules_node:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.mountpoint' | interp }} :
|
Mountpoint: !unsafe '{{ $labels.mountpoint }}'
|
||||||
{{ '$value' | interp_float }}% libre"
|
FreeSpace: !unsafe '{{ printf "%.0f" $value }} %'
|
||||||
- alert: HostConntrackLimit
|
- alert: HostConntrackLimit
|
||||||
expr: "( node_nf_conntrack_entries
|
expr: "( node_nf_conntrack_entries
|
||||||
/ node_nf_conntrack_entries_limit ) * 100 > 80"
|
/ node_nf_conntrack_entries_limit ) * 100 > 80"
|
||||||
|
@ -139,7 +139,7 @@ prometheus__alert_rules_node:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$value' | interp_float }}% complet"
|
Filled: !unsafe '{{ printf "%.0f" $value }} %'
|
||||||
- alert: HostClockSkew
|
- alert: HostClockSkew
|
||||||
expr: "(node_timex_offset_seconds > 0.05
|
expr: "(node_timex_offset_seconds > 0.05
|
||||||
and deriv(node_timex_offset_seconds[5m]) >= 0)
|
and deriv(node_timex_offset_seconds[5m]) >= 0)
|
||||||
|
@ -166,8 +166,8 @@ prometheus__alert_rules_node:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.mountpoint' | interp }} :
|
Mountpoint: !unsafe "{{ $labels.mountpoint }}"
|
||||||
{{ '$value' | interp_float }}% libre"
|
FreeInodes: !unsafe '{{ printf "%.0f" $value }} %'
|
||||||
- alert: CpuUsage
|
- alert: CpuUsage
|
||||||
expr: '( 100 - avg by (instance)
|
expr: '( 100 - avg by (instance)
|
||||||
( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
|
( irate(node_cpu_seconds_total{mode="idle"}[5m]) )
|
||||||
|
@ -176,28 +176,28 @@ prometheus__alert_rules_node:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$value' | interp_float }}%"
|
Usage: !unsafe '{{ printf "%.0f" $value }} %'
|
||||||
- alert: SystemdServiceFailed
|
- alert: SystemdServiceFailed
|
||||||
expr: 'node_systemd_unit_state{state="failed"} == 1'
|
expr: 'node_systemd_unit_state{state="failed"} == 1'
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.name' | interp }}"
|
Service: !unsafe "{{ $labels.name }}"
|
||||||
- alert: LoadUsage
|
- alert: LoadUsage
|
||||||
expr: "node_load1 > 5"
|
expr: "node_load1 > 5"
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$value' | interp_float }}"
|
Load1: !unsafe '{{ printf "%.0f" $value }}'
|
||||||
- alert: UnhealthyDisk
|
- alert: UnhealthyDisk
|
||||||
expr: "smartmon_device_smart_healthy < 1"
|
expr: "smartmon_device_smart_healthy < 1"
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.disk' | interp }}"
|
Disk: !unsafe "{{ $labels.disk }}"
|
||||||
- alert: HostCpuStealNoisyNeighbor
|
- alert: HostCpuStealNoisyNeighbor
|
||||||
expr: 'avg by(instance)
|
expr: 'avg by(instance)
|
||||||
(rate(node_cpu_seconds_total{mode="steal"}[5m]))
|
(rate(node_cpu_seconds_total{mode="steal"}[5m]))
|
||||||
|
@ -206,7 +206,7 @@ prometheus__alert_rules_node:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.disk' | interp }}"
|
Instance: !unsafe "{{ $labels.instance }}"
|
||||||
|
|
||||||
prometheus__alert_rules_keepalived:
|
prometheus__alert_rules_keepalived:
|
||||||
- alert: KeepalivedVrrpFault
|
- alert: KeepalivedVrrpFault
|
||||||
|
@ -215,7 +215,7 @@ prometheus__alert_rules_keepalived:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.instance' | interp }}"
|
Instance: !unsafe "{{ $labels.instance }}"
|
||||||
- alert: KeepalivedMasterChange
|
- alert: KeepalivedMasterChange
|
||||||
expr: 'changes(
|
expr: 'changes(
|
||||||
keepalived_vrrp_state
|
keepalived_vrrp_state
|
||||||
|
@ -224,7 +224,7 @@ prometheus__alert_rules_keepalived:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.instance' | interp }}"
|
Instance: !unsafe "{{ $labels.instance }}"
|
||||||
|
|
||||||
prometheus__alert_rules_bird:
|
prometheus__alert_rules_bird:
|
||||||
- alert: BirdProtocolDown
|
- alert: BirdProtocolDown
|
||||||
|
@ -233,8 +233,8 @@ prometheus__alert_rules_bird:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ '$labels.name' | interp }} :
|
Protocol: !unsafe "{{ $labels.name }}"
|
||||||
{{ '$labels.state' | interp }}"
|
State: !unsafe "{{ $labels.state }}"
|
||||||
|
|
||||||
prometheus__alert_rules_quanta:
|
prometheus__alert_rules_quanta:
|
||||||
- alert: QuantaQueueOverflow
|
- alert: QuantaQueueOverflow
|
||||||
|
@ -248,60 +248,73 @@ prometheus__alert_rules_quanta:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Utilisation forte du processus ({{ '$value' | interp }}%)"
|
Usage: !unsafe "{{ $value }} %"
|
||||||
- alert: QuantaCpuUsage
|
- alert: QuantaCpuUsage
|
||||||
expr: 'snAgGblCpuUtil1MinAvg > 80'
|
expr: 'snAgGblCpuUtil1MinAvg > 80'
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Utilisation intense du processus ({{ '$value' | interp }}%)"
|
Usage: !unsafe "{{ $value }} %"
|
||||||
- alert: QuantaMemoryUsage
|
- alert: QuantaMemoryUsage
|
||||||
expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50'
|
expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50'
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Utilisation forte de la mémoire ({{ '$value' | interp }}%)"
|
UsedMemory: !unsafe "{{ $value }} %"
|
||||||
- alert: QuantaMemoryUsage
|
- alert: QuantaMemoryUsage
|
||||||
expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80'
|
expr: '100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80'
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: alert
|
severity: alert
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Utilisation intense de la mémoire ({{ '$value' | interp }}%)"
|
UsedMemory: !unsafe "{{ $value }} %"
|
||||||
- alert: QuantaFanHealth
|
- alert: QuantaFanHealth
|
||||||
expr: 'snChasFanOperStatus{snChasFanOperStatus="normal"} == 0'
|
expr: 'snChasFanOperStatus{snChasFanOperStatus="normal"} == 0'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Le ventilateur {{ '$labels.snChasFanDescription' | interp }} est
|
Description: !unsafe "{{ $labels.snChasFanDescription }}"
|
||||||
en mode {{ '$labels.snChasFanOperStatus' | interp }}"
|
Status: !unsafe "{{ $labels.snChasFanOperStatus }}"
|
||||||
- alert: QuantaTemp
|
- alert: QuantaTemp
|
||||||
expr: '(snAgentTempValue / 2) > 45'
|
expr: '(snAgentTempValue / 2) > 45'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "La température de {{ '$labels.snAgentTempSensorDescr' | interp }}
|
Temperature: !unsafe "{{ $value }} °C"
|
||||||
est élevée ({{ '$value' | interp }}°C)"
|
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
|
||||||
- alert: QuantaTemp
|
- alert: QuantaTemp
|
||||||
expr: '(snAgentTempValue / 2) > 60'
|
expr: '(snAgentTempValue / 2) > 60'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "La température de {{ '$labels.snAgentTempSensorDescr' | interp }}
|
Temperature: !unsafe "{{ $value }} °C"
|
||||||
est très élevée ({{ '$value' | interp }}°C)"
|
Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"
|
||||||
- alert: QuantaPowerRedundancyFailure
|
- alert: QuantaPowerRedundancyFailure
|
||||||
expr: 'count by (instance) (snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"}) < 2'
|
expr: 'count by (instance)
|
||||||
|
(snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"})
|
||||||
|
< 2'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
||||||
|
prometheus__alert_rules_switch:
|
||||||
|
- alert: SwitchPromiscuousChange
|
||||||
|
expr: "changes(ifPromiscuousMode[5m]) > 0"
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
Interface: !unsafe "{{ $labels.ifName }}
|
||||||
|
{{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"
|
||||||
|
|
||||||
prometheus__alert_rules:
|
prometheus__alert_rules:
|
||||||
common: "{{ prometheus__alert_rules_common }}"
|
common: "{{ prometheus__alert_rules_common }}"
|
||||||
|
switch: "{{ prometheus__alert_rules_switch }}"
|
||||||
prometheus: "{{ prometheus__alert_rules_prometheus }}"
|
prometheus: "{{ prometheus__alert_rules_prometheus }}"
|
||||||
node: "{{ prometheus__alert_rules_node }}"
|
node: "{{ prometheus__alert_rules_node }}"
|
||||||
keepalived: "{{ prometheus__alert_rules_keepalived }}"
|
keepalived: "{{ prometheus__alert_rules_keepalived }}"
|
||||||
|
|
|
@ -21,6 +21,11 @@ prometheus_snmp__modules:
|
||||||
- snAgentCpu
|
- snAgentCpu
|
||||||
- snSwInfo
|
- snSwInfo
|
||||||
- snSwIfInfoTable
|
- snSwIfInfoTable
|
||||||
|
- dot3StatsTable
|
||||||
|
- dot3HCStatsTable
|
||||||
|
- dot3Errors
|
||||||
|
- dot3Tests
|
||||||
|
- dot3CollTable
|
||||||
lookups:
|
lookups:
|
||||||
- source_indexes:
|
- source_indexes:
|
||||||
- ifIndex
|
- ifIndex
|
||||||
|
@ -44,6 +49,15 @@ prometheus_snmp__modules:
|
||||||
- source_indexes:
|
- source_indexes:
|
||||||
- snSwIfInfoPortNum
|
- snSwIfInfoPortNum
|
||||||
lookup: snSwIfDescr
|
lookup: snSwIfDescr
|
||||||
|
- source_indexes:
|
||||||
|
- dot3StatsIndex
|
||||||
|
lookup: ifAlias
|
||||||
|
- source_indexes:
|
||||||
|
- dot3StatsIndex
|
||||||
|
lookup: ifDescr
|
||||||
|
- source_indexes:
|
||||||
|
- dot3StatsIndex
|
||||||
|
lookup: ifName
|
||||||
overrides:
|
overrides:
|
||||||
ifIndex:
|
ifIndex:
|
||||||
ignore: true
|
ignore: true
|
||||||
|
@ -79,4 +93,10 @@ prometheus_snmp__modules:
|
||||||
ignore: true
|
ignore: true
|
||||||
snSwIfInfoMediaType:
|
snSwIfInfoMediaType:
|
||||||
type: EnumAsInfo
|
type: EnumAsInfo
|
||||||
|
dot3StatsIndex:
|
||||||
|
ignore: true
|
||||||
|
dot3StatsEtherChipSet:
|
||||||
|
ignore: true
|
||||||
|
dot3StatsDuplexStatus:
|
||||||
|
type: EnumAsStateSet
|
||||||
...
|
...
|
||||||
|
|
Loading…
Reference in a new issue