---
# Prometheus server settings, scrape jobs and alerting/recording rules.
#
# Annotation values carry the `!unsafe` tag so that Ansible leaves the
# `{{ ... }}` Go-template expressions untouched for Prometheus to render.

# Alertmanager endpoints (host:port).
prometheus__alertmanager_targets:
  - "docker-ovh.adm.auro.re:9093"

prometheus__tsdb_retention_time: 90d

# Scrape jobs. Each entry lists its targets (an inventory group) and either
# a port to reach on every target or a fixed address, plus optional
# `path` / `params`.
prometheus__scraping:
  node:
    targets: "{{ groups.vm_network + groups.pve_network }}"
    address:
      port: 9100

  prometheus:
    targets: "{{ groups.prom }}"
    address:
      port: 9090

  kresd:
    targets: "{{ groups.dns }}"
    address:
      port: 8453

  bird:
    targets: "{{ groups.router }}"
    address:
      port: 9324

  # Scraped through a fixed local address with `module=quanta` on `/snmp`
  # (presumably a local SNMP exporter; confirm against the role template).
  quanta:
    targets: "{{ groups.quanta }}"
    address: "127.0.0.1:9116"
    path: /snmp
    params:
      module:
        - quanta

  snmp:
    targets: "{{ groups.prom }}"
    address:
      port: 9116

# Rules about the Prometheus server itself.
prometheus__alert_rules_prometheus:
  - alert: PrometheusTsdbCompactionFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical

# Rules that apply to every scrape job.
prometheus__alert_rules_common:
  # Fires when a target has reported `up == 0` for 3 minutes.
  - alert: CollectorDown
    expr: >
      up == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      Job: !unsafe "{{ $labels.job }}"

# Host-level rules based on `node_*` and `smartmon_*` metrics.
prometheus__alert_rules_node:
  # Less than 10 % of memory is free, cached or in buffers.
  - alert: OutOfMemory
    expr:
      100 * (
        node_memory_MemFree_bytes
        + node_memory_Cached_bytes
        + node_memory_Buffers_bytes
      ) / node_memory_MemTotal_bytes < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      FreeMemory: !unsafe '{{ printf "%.0f" $value }} %'

  # At least half of the swap is in use.
  - alert: HostSwapIsFillingUp
    expr:
      100 * (
        1 - (
          node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes
        )
      ) >= 50
    for: 3m
    labels:
      severity: critical
    annotations:
      UsedSwap: !unsafe '{{ printf "%.0f" $value }} %'

  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 79
    for: 3m
    labels:
      severity: critical
    annotations:
      Temperature: !unsafe '{{ printf "%.0f" $value }} °C'
      Chip: !unsafe "{{ $labels.chip }}"
      Sensor: !unsafe "{{ $labels.sensor }}"

  - alert: HostNodeOvertemperatureAlarm
    expr: node_hwmon_temp_crit_alarm_celsius == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      Chip: !unsafe "{{ $labels.chip }}"
      Sensor: !unsafe "{{ $labels.sensor }}"

  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Device: !unsafe "{{ $labels.device }}"

  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      # Names the affected array through the `device` label, as
      # HostRaidArrayGotInactive does.
      Device: !unsafe "{{ $labels.device }}"

  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      # `$value` is the increase of the OOM-kill counter over the last
      # minute, not a process id.
      OomKills: !unsafe "{{ $value }}"

  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      CorrectedErrors: !unsafe "{{ $value }}"

  - alert: HostEdacUncorrectableErrorsDetected
    expr: increase(node_edac_uncorrectable_errors_total[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      DetectedErrors: !unsafe "{{ $value }}"

  # Less than 10 % free space, restricted to filesystems whose
  # `node_filesystem_readonly` value is 0.
  - alert: OutOfDiskSpace
    expr:
      (
        100 * node_filesystem_free_bytes / node_filesystem_size_bytes < 10
      )
      and on (instance, device, mountpoint)
      (
        node_filesystem_readonly
      ) == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      Mountpoint: !unsafe '{{ $labels.mountpoint }}'
      FreeSpace: !unsafe '{{ printf "%.0f" $value }} %'

  - alert: HostConntrackLimit
    expr:
      100 * (
        node_nf_conntrack_entries / node_nf_conntrack_entries_limit
      ) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      Filled: !unsafe '{{ printf "%.0f" $value }} %'

  # Offset beyond +/- 50 ms whose derivative shows it is not shrinking.
  - alert: HostClockSkew
    expr:
      (
        node_timex_offset_seconds > 0.05
        and deriv(node_timex_offset_seconds[5m]) >= 0
      )
      or
      (
        node_timex_offset_seconds < -0.05
        and deriv(node_timex_offset_seconds[5m]) <= 0
      )
    for: 2m
    labels:
      severity: warning

  - alert: HostClockNotSynchronising
    expr:
      min_over_time(node_timex_sync_status[1m]) == 0
      and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning

  - alert: HostRequiresReboot
    expr: node_reboot_required > 0
    for: 5m
    labels:
      severity: warning

  - alert: OutOfInodes
    expr: 100 * node_filesystem_files_free / node_filesystem_files < 10
    for: 3m
    labels:
      severity: warning
    annotations:
      Mountpoint: !unsafe "{{ $labels.mountpoint }}"
      FreeInodes: !unsafe '{{ printf "%.0f" $value }} %'

  # Non-idle CPU share, averaged per instance, above 75 %.
  - alert: CpuUsage
    expr:
      100 * (
        1 - avg by (instance) (
          irate(node_cpu_seconds_total{mode="idle"}[5m])
        )
      ) > 75
    for: 10m
    labels:
      severity: warning
    annotations:
      Usage: !unsafe '{{ printf "%.0f" $value }} %'

  - alert: SystemdServiceFailed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 10m
    labels:
      severity: warning
    annotations:
      Service: !unsafe "{{ $labels.name }}"

  - alert: LoadUsage
    expr: node_load1 > 5
    for: 2m
    labels:
      severity: warning
    annotations:
      Load1: !unsafe '{{ printf "%.0f" $value }}'

  - alert: UnhealthyDisk
    expr: smartmon_device_smart_healthy < 1
    for: 10m
    labels:
      severity: critical
    annotations:
      Disk: !unsafe "{{ $labels.disk }}"

  # CPU steal share, averaged per instance, above 10 %.
  - alert: HostCpuStealNoisyNeighbor
    expr:
      100 * avg by (instance) (
        rate(node_cpu_seconds_total{mode="steal"}[5m])
      ) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      # The expression aggregates by `instance` only, so the measured
      # percentage is the only extra information available here.
      Steal: !unsafe '{{ printf "%.0f" $value }} %'

prometheus__alert_rules_keepalived:
  - alert: KeepalivedVrrpFault
    expr: keepalived_vrrp_state{state="fault"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Instance: !unsafe "{{ $labels.instance }}"

  # More than one change of the "master" series within a minute.
  # NOTE(review): the selector uses the `state` label, like
  # KeepalivedVrrpFault; confirm the label name against the exporter.
  - alert: KeepalivedMasterChange
    expr:
      changes(
        keepalived_vrrp_state{
          state="master"
        }[1m]
      ) > 1
    for: 0m
    labels:
      severity: warning
    annotations:
      Instance: !unsafe "{{ $labels.instance }}"

prometheus__alert_rules_bird:
  # BGP protocol status with an extra `group` label holding the part of
  # `instance` that precedes the first `-<digits>` suffix.
  - record: bird:protocol_up:bgp
    expr:
      label_replace(
        bird_protocol_up{proto="BGP"},
        "group", "$1", "instance", "^([^0-9\\.]+)-[0-9]+.*"
      )

  # Sessions that are deliberately not redundant within a group.
  - record: bird:protocol_up:bgp:non_redundant
    expr:
      bird:protocol_up:bgp{
        group="edge",
        name=~"^(oti|crans|legacy|edge)[46]$"
      }

  # Sessions that are redundant: every BGP session not listed above.
  - record: bird:protocol_up:bgp:redundant
    expr:
      bird:protocol_up:bgp
      unless bird:protocol_up:bgp:non_redundant

  # Fewer than two established sessions for a (group, name) pair.
  # The `or (... * 0)` branch supplies a 0 sample for pairs that only have
  # non-established sessions, so they are compared as well.
  - alert: BirdBGPRedundancyDegraded
    expr:
      (
        count by (group, name) (
          bird:protocol_up:bgp:redundant{state="Established"}
        )
        or
        (
          count by (group, name) (
            bird:protocol_up:bgp:redundant{state!="Established"}
          ) * 0
        )
      ) < 2
    for: 0m
    labels:
      severity: warning
    annotations:
      Session: !unsafe "{{ $labels.name }}"
      Count: !unsafe "{{ $value }}"
      Group: !unsafe "{{ $labels.group }}"

  # No established session at all for a (group, name) pair; same `or` trick
  # as above to obtain a 0 sample.
  - alert: BirdBGPDown
    expr:
      (
        count by (group, name) (
          bird:protocol_up:bgp{state="Established"}
        )
        or
        (
          count by (group, name) (
            bird:protocol_up:bgp{state!="Established"}
          ) * 0
        )
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Session: !unsafe "{{ $labels.name }}"
      Group: !unsafe "{{ $labels.group }}"

  # An established redundant session exports no prefix.
  - alert: BirdBGPNoExportedPrefixRedundant
    expr:
      bird_protocol_prefix_export_count{
        export_filter!="REJECT",
      }
      * on (instance, name)
      (
        bird:protocol_up:bgp:redundant{state="Established"}
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Session: !unsafe "{{ $labels.name }}"

  # The established non-redundant sessions of a group export no prefix
  # in total.
  - alert: BirdBGPNoExportedPrefixNonRedundant
    expr:
      sum by (group) (
        bird_protocol_prefix_export_count{
          export_filter!="REJECT",
        }
        * on (instance, name) group_left (group)
        (
          bird:protocol_up:bgp:non_redundant{state="Established"}
        )
      ) == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      # `sum by (group)` keeps only the `group` label, so that is what
      # identifies the alert.
      Group: !unsafe "{{ $labels.group }}"

  - alert: BirdOSPFNeighboursChange
    expr:
      changes(bird_ospf_neighbor_count[5m]) > 0
      or changes(bird_ospfv3_neighbor_count[5m]) > 0
    for: 0m
    labels:
      severity: warning

  - alert: BirdOSPFDown
    expr: bird_ospf_running == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Instance: !unsafe "{{ $labels.name }}"

# Rules for the Quanta switches. Several alert names appear twice with a
# warning and a critical threshold.
prometheus__alert_rules_quanta:
  - alert: QuantaQueueOverflow
    expr: snAgGblQueueOverflow == 1
    for: 0m
    labels:
      severity: critical

  - alert: QuantaCpuUsage
    expr: snAgGblCpuUtil1MinAvg > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      Usage: !unsafe "{{ $value }} %"

  - alert: QuantaCpuUsage
    expr: snAgGblCpuUtil1MinAvg > 80
    for: 5m
    labels:
      severity: critical
    annotations:
      Usage: !unsafe "{{ $value }} %"

  - alert: QuantaMemoryUsage
    expr: 100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      UsedMemory: !unsafe "{{ $value }} %"

  - alert: QuantaMemoryUsage
    expr: 100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80
    for: 5m
    labels:
      # Same warning/critical pairing as QuantaCpuUsage.
      severity: critical
    annotations:
      UsedMemory: !unsafe "{{ $value }} %"

  - alert: QuantaFanHealth
    expr: snChasFanOperStatus{snChasFanOperStatus="normal"} == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      Description: !unsafe "{{ $labels.snChasFanDescription }}"
      Status: !unsafe "{{ $labels.snChasFanOperStatus }}"

  # The 0.5 factor presumably converts half-degree units to °C; confirm
  # against the MIB.
  - alert: QuantaTemp
    expr: 0.5 * snAgentTempValue > 45
    for: 0m
    labels:
      severity: warning
    annotations:
      Temperature: !unsafe "{{ $value }} °C"
      Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"

  - alert: QuantaTemp
    expr: 0.5 * snAgentTempValue > 60
    for: 0m
    labels:
      severity: critical
    annotations:
      Temperature: !unsafe "{{ $value }} °C"
      Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}"

  # Fewer than two power supplies in the "normal" state on an instance.
  - alert: QuantaPowerRedundancyFailure
    expr:
      count by (instance) (
        snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"}
      ) < 2
    for: 0m
    labels:
      severity: warning

# Generic switch rules based on interface (`if*`) metrics.
prometheus__alert_rules_switch:
  - alert: SwitchPromiscuousChange
    expr: changes(ifPromiscuousMode[5m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      Interface: !unsafe "{{ $labels.ifName }} {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"

  - alert: SwitchInterfaceUpChange
    expr: changes(ifOperStatus{ifOperStatus="up"}[5m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      Interface: !unsafe "{{ $labels.ifName }} {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"

  # Inbound error rate above 0.01 % of received packets.
  - alert: SwitchInErrors
    expr:
      100 * irate(ifInErrors[5m]) / (
        irate(ifInUcastPkts[5m]) + irate(ifInNUcastPkts[5m])
      ) > 0.01
    for: 0m
    labels:
      severity: warning
    annotations:
      ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
      Interface: !unsafe "{{ $labels.ifName }} {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"

  # Outbound error rate above 0.01 % of sent packets.
  - alert: SwitchOutErrors
    expr:
      100 * irate(ifOutErrors[5m]) / (
        irate(ifOutUcastPkts[5m]) + irate(ifOutNUcastPkts[5m])
      ) > 0.01
    for: 0m
    labels:
      severity: warning
    annotations:
      ErrorRate: !unsafe '{{ printf "%.2f" $value }} %'
      Interface: !unsafe "{{ $labels.ifName }} {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}"

# Rule groups, keyed by group name; each value resolves to the matching
# `prometheus__alert_rules_<name>` variable.
prometheus__alert_rules:
  common: '{{ prometheus__alert_rules_common }}'
  switch: '{{ prometheus__alert_rules_switch }}'
  prometheus: '{{ prometheus__alert_rules_prometheus }}'
  node: '{{ prometheus__alert_rules_node }}'
  keepalived: '{{ prometheus__alert_rules_keepalived }}'
  quanta: '{{ prometheus__alert_rules_quanta }}'
  bird: '{{ prometheus__alert_rules_bird }}'
...