From 16a0d95936abd6c0e919ab81ed20487853b3bd04 Mon Sep 17 00:00:00 2001 From: Jeltz Date: Thu, 2 Nov 2023 20:27:45 +0100 Subject: [PATCH] prometheus: refactoring of the config --- group_vars/prom/prometheus.yml | 526 ---------------------- group_vars/prom/prometheus/bird.yml | 139 ++++++ group_vars/prom/prometheus/common.yml | 11 + group_vars/prom/prometheus/keepalived.yml | 23 + group_vars/prom/prometheus/kresd.yml | 6 + group_vars/prom/prometheus/main.yml | 23 + group_vars/prom/prometheus/node.yml | 199 ++++++++ group_vars/prom/prometheus/prometheus.yml | 14 + group_vars/prom/prometheus/quanta.yml | 84 ++++ group_vars/prom/prometheus/snmp.yml | 6 + group_vars/prom/prometheus/switch.yml | 47 ++ roles/bird/templates/.bird.conf.j2.swp | Bin 20480 -> 0 bytes roles/prometheus/defaults/main.yml | 2 +- roles/prometheus/tasks/main.yml | 4 +- roles/prometheus/templates/rules.yml.j2 | 2 +- 15 files changed, 556 insertions(+), 530 deletions(-) delete mode 100644 group_vars/prom/prometheus.yml create mode 100644 group_vars/prom/prometheus/bird.yml create mode 100644 group_vars/prom/prometheus/common.yml create mode 100644 group_vars/prom/prometheus/keepalived.yml create mode 100644 group_vars/prom/prometheus/kresd.yml create mode 100644 group_vars/prom/prometheus/main.yml create mode 100644 group_vars/prom/prometheus/node.yml create mode 100644 group_vars/prom/prometheus/prometheus.yml create mode 100644 group_vars/prom/prometheus/quanta.yml create mode 100644 group_vars/prom/prometheus/snmp.yml create mode 100644 group_vars/prom/prometheus/switch.yml delete mode 100644 roles/bird/templates/.bird.conf.j2.swp diff --git a/group_vars/prom/prometheus.yml b/group_vars/prom/prometheus.yml deleted file mode 100644 index 0326f78..0000000 --- a/group_vars/prom/prometheus.yml +++ /dev/null @@ -1,526 +0,0 @@ ---- -prometheus__alertmanager_targets: - - docker-ovh.adm.auro.re:9093 - -prometheus__tsdb_retention_time: 90d - -prometheus__scraping: - node: - targets: "{{ groups.vm_network + 
groups.pve_network }}" - address: - port: 9100 - prometheus: - targets: "{{ groups.prom }}" - address: - port: 9090 - kresd: - targets: "{{ groups.dns }}" - address: - port: 8453 - bird: - targets: "{{ groups.router }}" - address: - port: 9324 - quanta: - targets: "{{ groups.quanta }}" - address: 127.0.0.1:9116 - path: /snmp - params: - module: - - quanta - snmp: - targets: "{{ groups.prom }}" - address: - port: 9116 - -prometheus__alert_rules_prometheus: - - alert: PrometheusTsdbCompactionFailed - expr: - increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - -prometheus__alert_rules_common: - - alert: CollectorDown - expr: > - up == 0 - for: 3m - labels: - severity: critical - annotations: - Job: !unsafe "{{ $labels.job }}" - -prometheus__alert_rules_node: - - alert: OutOfMemory - expr: - 100 * ( - node_memory_MemFree_bytes - + node_memory_Cached_bytes - + node_memory_Buffers_bytes - ) / node_memory_MemTotal_bytes < 10 - for: 5m - labels: - severity: warning - annotations: - FreeMemory: !unsafe '{{ printf "%.0f" $value }} %' - - alert: HostSwapIsFillingUp - expr: - 100 * ( - 1 - ( - node_memory_SwapFree_bytes - / node_memory_SwapTotal_bytes - ) - ) >= 50 - for: 3m - labels: - severity: critical - annotations: - UsedSwap: !unsafe '{{ printf "%.0f" $value }} %' - - alert: HostPhysicalComponentTooHot - expr: - node_hwmon_temp_celsius > 79 - for: 3m - labels: - severity: critical - annotations: - Temperature: !unsafe '{{ printf "%.0f" $value }} °C' - Chip: !unsafe "{{ $labels.chip }}" - Sensor: !unsafe "{{ $labels.sensor }}" - - alert: HostNodeOvertemperatureAlarm - expr: - node_hwmon_temp_crit_alarm_celsius == 1 - for: 0m - labels: - severity: critical - annotations: - Chip: !unsafe "{{ $labels.chip }}" - Sensor: !unsafe "{{ $labels.sensor }}" - - alert: HostRaidArrayGotInactive - expr: - node_md_state{state="inactive"} > 0 - for: 0m - labels: - severity: critical - annotations: - Device: !unsafe "{{ $labels.device }}" 
- - alert: HostRaidDiskFailure - expr: - node_md_disks{state="failed"} > 0 - for: 0m - labels: - severity: critical - annotations: - severity: !unsafe "{{ $labels.md_device }}" - - alert: HostOomKillDetected - expr: - increase(node_vmstat_oom_kill[1m]) > 0 - for: 0m - labels: - severity: warning - annotations: - PID: !unsafe "{{ $value }}" - - alert: HostEdacCorrectableErrorsDetected - expr: - increase(node_edac_correctable_errors_total[1m]) > 0 - for: 0m - labels: - severity: warning - annotations: - CorrectedErrors: !unsafe "{{ $value }}" - - alert: HostEdacUncorrectableErrorsDetected - expr: - increase(node_edac_uncorrectable_errors_total[1m]) > 0 - for: 0m - labels: - severity: warning - annotations: - DetectedErrors: !unsafe "{{ $value }}" - - alert: OutOfDiskSpace - expr: - ( - 100 * node_filesystem_free_bytes - / node_filesystem_size_bytes < 10 - ) - and on (instance, device, mountpoint) ( - node_filesystem_readonly - ) == 0 - for: 5m - labels: - severity: critical - annotations: - Mountpoint: !unsafe '{{ $labels.mountpoint }}' - FreeSpace: !unsafe '{{ printf "%.0f" $value }} %' - - alert: HostConntrackLimit - expr: - 100 * ( - node_nf_conntrack_entries - / node_nf_conntrack_entries_limit - ) > 80 - for: 5m - labels: - severity: warning - annotations: - Filled: !unsafe '{{ printf "%.0f" $value }} %' - - alert: HostClockSkew - expr: - ( - node_timex_offset_seconds > 0.05 - and deriv(node_timex_offset_seconds[5m]) >= 0 - ) or ( - node_timex_offset_seconds < -0.05 - and deriv(node_timex_offset_seconds[5m]) <= 0 - ) - for: 2m - labels: - severity: warning - - alert: HostClockNotSynchronising - expr: - min_over_time(node_timex_sync_status[1m]) == 0 - and node_timex_maxerror_seconds >= 16 - for: 2m - labels: - severity: warning - - alert: HostRequiresReboot - expr: - node_reboot_required > 0 - for: 5m - labels: - severity: warning - - alert: OutOfInodes - expr: - 100 * node_filesystem_files_free - / node_filesystem_files < 10 - for: 3m - labels: - severity: 
warning - annotations: - Mountpoint: !unsafe "{{ $labels.mountpoint }}" - FreeInodes: !unsafe '{{ printf "%.0f" $value }} %' - - alert: CpuUsage - expr: - 100 * ( - 1 - avg by (instance) ( - irate(node_cpu_seconds_total{mode="idle"}[5m]) - ) - ) > 75 - for: 10m - labels: - severity: warning - annotations: - Usage: !unsafe '{{ printf "%.0f" $value }} %' - - alert: SystemdServiceFailed - expr: - node_systemd_unit_state{state="failed"} == 1 - for: 10m - labels: - severity: warning - annotations: - Service: !unsafe "{{ $labels.name }}" - - alert: LoadUsage - expr: - node_load1 > 5 - for: 2m - labels: - severity: warning - annotations: - Load1: !unsafe '{{ printf "%.0f" $value }}' - - alert: UnhealthyDisk - expr: - smartmon_device_smart_healthy < 1 - for: 10m - labels: - severity: critical - annotations: - Disk: !unsafe "{{ $labels.disk }}" - - alert: HostCpuStealNoisyNeighbor - expr: - 100 * avg by (instance) ( - rate(node_cpu_seconds_total{mode="steal"}[5m]) - ) > 10 - for: 5m - labels: - severity: warning - annotations: - Disk: !unsafe "{{ $labels.disk }}" - -prometheus__alert_rules_keepalived: - - alert: KeepalivedVrrpFault - expr: - keepalived_vrrp_state{state="fault"} > 0 - for: 0m - labels: - severity: critical - annotations: - Instance: !unsafe "{{ $labels.instance }}" - - alert: KeepalivedMasterChange - expr: - changes( - keepalived_vrrp_state{ - keepalived_vvrp_state="master" - }[1m] - ) > 1 - for: 0m - labels: - severity: warning - annotations: - Instance: !unsafe "{{ $labels.instance }}" - -prometheus__alert_rules_bird: - - record: bird:protocol_up:bgp - expr: - label_replace( - bird_protocol_up{proto="BGP",} - unless bird_protocol_up{ - proto="BGP", - name=~"^(viarezo|isp[12]|rezel)[46]$" - }, - "group", "$1", - "instance", "^([^0-9\\.]+)-[0-9]+.*" - ) - # Sessions qui ne sont volontairement pas redondées - # au sein d'un groupe - - record: bird:protocol_up:bgp:non_redundant - expr: - bird:protocol_up:bgp{ - group="edge", - 
name=~"^(oti|crans|legacy|edge)[46]$" - } - # Sessions qui le sont - - record: bird:protocol_up:bgp:redundant - expr: - bird:protocol_up:bgp - unless - bird:protocol_up:bgp:non_redundant - - alert: BirdBGPRedundancyDegraded - expr: - ( - count by (group, name) ( - bird:protocol_up:bgp:redundant{state="Established"} - ) or ( - count by (group, name) ( - bird:protocol_up:bgp:redundant{state!="Established"} - ) * 0 - ) - ) < 2 - for: 0m - labels: - severity: warning - annotations: - Session: !unsafe "{{ $labels.name }}" - Count: !unsafe "{{ $value }}" - Group: !unsafe "{{ $labels.group }}" - - alert: BirdBGPDown - expr: - ( - count by (group, name) ( - bird:protocol_up:bgp{state="Established"} - ) or ( - count by (group, name) ( - bird:protocol_up:bgp{state!="Established"} - ) * 0 - ) - ) == 0 - for: 0m - labels: - severity: critical - annotations: - Session: !unsafe "{{ $labels.name }}" - Group: !unsafe "{{ $labels.group }}" - - alert: BirdBGPNoExportedPrefixRedundant - expr: - bird_protocol_prefix_export_count{ - export_filter!="REJECT", - } * on (instance, name) ( - bird:protocol_up:bgp:redundant{state="Established"} - ) == 0 - for: 0m - labels: - severity: critical - annotations: - Session: !unsafe "{{ $labels.name }}" - - alert: BirdBGPNoImportedPrefixRedundant - expr: - bird_protocol_prefix_import_count{ - import_filter!="REJECT", - } * on (instance, name) ( - bird:protocol_up:bgp:redundant{state="Established"} - ) == 0 - for: 0m - labels: - severity: critical - annotations: - - alert: BirdBGPNoExportedPrefixNonRedundant - expr: - sum by (group) ( - bird_protocol_prefix_export_count{ - export_filter!="REJECT", - } * on (instance, name) group_left (group) ( - bird:protocol_up:bgp:non_redundant{state="Established"} - ) - ) == 0 - for: 0m - labels: - severity: critical - annotations: - Session: !unsafe "{{ $labels.name }}" - - alert: BirdBGPNoImportedPrefixNonRedundant - expr: - sum by (group) ( - bird_protocol_prefix_import_count{ - import_filter!="REJECT", - } * 
on (instance, name) group_left (group) ( - bird:protocol_up:bgp:non_redundant{state="Established"} - ) - ) == 0 - for: 0m - labels: - severity: critical - annotations: - Session: !unsafe "{{ $labels.name }}" - - alert: BirdOSPFNeighboursChange - expr: - changes(bird_ospf_neighbor_count[5m]) > 0 - or changes(bird_ospfv3_neighbor_count[5m]) > 0 - for: 0m - labels: - severity: warning - - alert: BirdOSPFDown - expr: - bird_ospf_running == 0 - for: 0m - labels: - severity: critical - annotations: - Instance: !unsafe "{{ $labels.name }}" - -prometheus__alert_rules_quanta: - - alert: QuantaQueueOverflow - expr: - snAgGblQueueOverflow == 1 - for: 0m - labels: - severity: critical - - alert: QuantaCpuUsage - expr: - snAgGblCpuUtil1MinAvg > 50 - for: 5m - labels: - severity: warning - annotations: - Usage: !unsafe "{{ $value }} %" - - alert: QuantaCpuUsage - expr: - snAgGblCpuUtil1MinAvg > 80 - for: 5m - labels: - severity: critical - annotations: - Usage: !unsafe "{{ $value }} %" - - alert: QuantaMemoryUsage - expr: - 100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50 - for: 5m - labels: - severity: warning - annotations: - UsedMemory: !unsafe "{{ $value }} %" - - alert: QuantaMemoryUsage - expr: - 100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80 - for: 5m - labels: - severity: alert - annotations: - UsedMemory: !unsafe "{{ $value }} %" - - alert: QuantaFanHealth - expr: - snChasFanOperStatus{snChasFanOperStatus="normal"} == 0 - for: 0m - labels: - severity: critical - annotations: - Description: !unsafe "{{ $labels.shChasFanDescription }}" - Status: !unsafe "{{ $labels.snChasFanOperStatus }}" - - alert: QuantaTemp - expr: - 0.5 * snAgentTempValue > 45 - for: 0m - labels: - severity: warning - annotations: - Temperature: !unsafe "{{ $value }} °C" - Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}" - - alert: QuantaTemp - expr: - 0.5 * snAgentTempValue > 60 - for: 0m - labels: - severity: critical - annotations: - Temperature: !unsafe "{{ $value }} 
°C" - Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}" - - alert: QuantaPowerRedundancyFailure - expr: - count by (instance) ( - snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"} - ) < 2 - for: 0m - labels: - severity: warning - -prometheus__alert_rules_switch: - - alert: SwitchPromiscuousChange - expr: - changes(ifPromiscuousMode[5m]) > 0 - for: 0m - labels: - severity: warning - annotations: - Interface: !unsafe "{{ $labels.ifName }} - {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" - - alert: SwitchInterfaceUpChange - expr: - changes(ifOperStatus{ifOperStatus="up"}[5m]) > 0 - for: 0m - labels: - severity: warning - annotations: - Interface: !unsafe "{{ $labels.ifName }} - {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" - - alert: SwitchInErrors - expr: - 100 * irate(ifInErrors[5m]) / ( - irate(ifInUcastPkts[5m]) - + irate(ifInNUcastPkts[5m]) - ) > 0.01 - for: 0m - labels: - severity: warning - annotations: - ErrorRate: !unsafe '{{ printf "%.2f" $value }} %' - Interface: !unsafe "{{ $labels.ifName }} - {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" - - alert: SwitchOutErrors - expr: - 100 * irate(ifOutErrors[5m]) / ( - irate(ifOutUcastPkts[5m]) - + irate(ifOutNUcastPkts[5m]) - ) > 0.01 - for: 0m - labels: - severity: warning - annotations: - ErrorRate: !unsafe '{{ printf "%.2f" $value }} %' - Interface: !unsafe "{{ $labels.ifName }} - {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" - -prometheus__alert_rules: - common: "{{ prometheus__alert_rules_common }}" - switch: "{{ prometheus__alert_rules_switch }}" - prometheus: "{{ prometheus__alert_rules_prometheus }}" - node: "{{ prometheus__alert_rules_node }}" - keepalived: "{{ prometheus__alert_rules_keepalived }}" - quanta: "{{ prometheus__alert_rules_quanta }}" - bird: "{{ prometheus__alert_rules_bird }}" -... 
diff --git a/group_vars/prom/prometheus/bird.yml b/group_vars/prom/prometheus/bird.yml new file mode 100644 index 0000000..9d0ca57 --- /dev/null +++ b/group_vars/prom/prometheus/bird.yml @@ -0,0 +1,139 @@ +--- +prometheus__scraping_bird: + targets: "{{ groups.router }}" + address: + port: 9324 + +prometheus__rules_bird: + - record: bird:protocol_up:bgp + expr: + label_replace( + bird_protocol_up{proto="BGP",} + unless bird_protocol_up{ + proto="BGP", + name=~"^(viarezo|isp[12]|rezel)[46]$" + }, + "group", "$1", + "instance", "^([^0-9\\.]+)-[0-9]+.*" + ) + # Sessions qui ne sont volontairement pas redondées + # au sein d'un groupe + - record: bird:protocol_up:bgp:non_redundant + expr: + bird:protocol_up:bgp{ + group="edge", + name=~"^(oti|crans|legacy|edge)[46]$" + } + # Sessions qui le sont + - record: bird:protocol_up:bgp:redundant + expr: + bird:protocol_up:bgp + unless + bird:protocol_up:bgp:non_redundant + - alert: BirdBGPRedundancyDegraded + expr: + ( + count by (group, name) ( + bird:protocol_up:bgp:redundant{state="Established"} + ) or ( + count by (group, name) ( + bird:protocol_up:bgp:redundant{state!="Established"} + ) * 0 + ) + ) < 2 + for: 0m + labels: + severity: warning + annotations: + Session: !unsafe "{{ $labels.name }}" + Count: !unsafe "{{ $value }}" + Group: !unsafe "{{ $labels.group }}" + - alert: BirdBGPDown + expr: + ( + count by (group, name) ( + bird:protocol_up:bgp{state="Established"} + ) or ( + count by (group, name) ( + bird:protocol_up:bgp{state!="Established"} + ) * 0 + ) + ) == 0 + for: 0m + labels: + severity: critical + annotations: + Session: !unsafe "{{ $labels.name }}" + Group: !unsafe "{{ $labels.group }}" + - alert: BirdBGPNoExportedPrefixRedundant + expr: + bird_protocol_prefix_export_count{ + export_filter!="REJECT", + } * on (instance, name) group_left (group) ( + bird:protocol_up:bgp:redundant{state="Established"} + ) == 0 + for: 0m + labels: + severity: critical + annotations: + Session: !unsafe "{{ $labels.name }}" + 
Group: !unsafe "{{ $labels.group }}" + - alert: BirdBGPNoImportedPrefixRedundant + expr: + bird_protocol_prefix_import_count{ + import_filter!="REJECT", + } * on (instance, name) group_left (group) ( + bird:protocol_up:bgp:redundant{state="Established"} + ) == 0 + for: 0m + labels: + severity: critical + annotations: + Session: !unsafe "{{ $labels.name }}" + Group: !unsafe "{{ $labels.group }}" + - alert: BirdBGPNoExportedPrefixNonRedundant + expr: + sum by (group) ( + bird_protocol_prefix_export_count{ + export_filter!="REJECT", + } * on (instance, name) group_left (group) ( + bird:protocol_up:bgp:non_redundant{state="Established"} + ) + ) == 0 + for: 0m + labels: + severity: critical + annotations: + Session: !unsafe "{{ $labels.name }}" + Group: !unsafe "{{ $labels.group }}" + - alert: BirdBGPNoImportedPrefixNonRedundant + expr: + sum by (group) ( + bird_protocol_prefix_import_count{ + import_filter!="REJECT", + } * on (instance, name) group_left (group) ( + bird:protocol_up:bgp:non_redundant{state="Established"} + ) + ) == 0 + for: 0m + labels: + severity: critical + annotations: + Session: !unsafe "{{ $labels.name }}" + Group: !unsafe "{{ $labels.group }}" + - alert: BirdOSPFNeighboursChange + expr: + changes(bird_ospf_neighbor_count[5m]) > 0 + or changes(bird_ospfv3_neighbor_count[5m]) > 0 + for: 0m + labels: + severity: warning + - alert: BirdOSPFDown + expr: + bird_ospf_running == 0 + for: 0m + labels: + severity: critical + annotations: + Instance: !unsafe "{{ $labels.name }}" +... diff --git a/group_vars/prom/prometheus/common.yml b/group_vars/prom/prometheus/common.yml new file mode 100644 index 0000000..f00a693 --- /dev/null +++ b/group_vars/prom/prometheus/common.yml @@ -0,0 +1,11 @@ +--- +prometheus__rules_common: + - alert: CollectorDown + expr: > + up == 0 + for: 3m + labels: + severity: critical + annotations: + Job: !unsafe "{{ $labels.job }}" +... 
diff --git a/group_vars/prom/prometheus/keepalived.yml b/group_vars/prom/prometheus/keepalived.yml new file mode 100644 index 0000000..730f8fc --- /dev/null +++ b/group_vars/prom/prometheus/keepalived.yml @@ -0,0 +1,23 @@ +--- +prometheus__rules_keepalived: + - alert: KeepalivedVrrpFault + expr: + keepalived_vrrp_state{state="fault"} > 0 + for: 0m + labels: + severity: critical + annotations: + Instance: !unsafe "{{ $labels.instance }}" + - alert: KeepalivedMasterChange + expr: + changes( + keepalived_vrrp_state{ + keepalived_vvrp_state="master" + }[1m] + ) > 1 + for: 0m + labels: + severity: warning + annotations: + Instance: !unsafe "{{ $labels.instance }}" +... diff --git a/group_vars/prom/prometheus/kresd.yml b/group_vars/prom/prometheus/kresd.yml new file mode 100644 index 0000000..d7d3ce1 --- /dev/null +++ b/group_vars/prom/prometheus/kresd.yml @@ -0,0 +1,6 @@ +--- +prometheus__scraping_kresd: + targets: "{{ groups.dns }}" + address: + port: 8453 +... diff --git a/group_vars/prom/prometheus/main.yml b/group_vars/prom/prometheus/main.yml new file mode 100644 index 0000000..a08c7ef --- /dev/null +++ b/group_vars/prom/prometheus/main.yml @@ -0,0 +1,23 @@ +--- +prometheus__alertmanager_targets: + - docker-ovh.adm.auro.re:9093 + +prometheus__tsdb_retention_time: 90d + +prometheus__scraping: + node: "{{ prometheus__scraping_node }}" + prometheus: "{{ prometheus__scraping_prometheus }}" + kresd: "{{ prometheus__scraping_kresd }}" + bird: "{{ prometheus__scraping_bird }}" + quanta: "{{ prometheus__scraping_quanta }}" + snmp: "{{ prometheus__scraping_snmp }}" + +prometheus__rules: + common: "{{ prometheus__rules_common }}" + switch: "{{ prometheus__rules_switch }}" + prometheus: "{{ prometheus__rules_prometheus }}" + node: "{{ prometheus__rules_node }}" + keepalived: "{{ prometheus__rules_keepalived }}" + quanta: "{{ prometheus__rules_quanta }}" + bird: "{{ prometheus__rules_bird }}" +... 
diff --git a/group_vars/prom/prometheus/node.yml b/group_vars/prom/prometheus/node.yml new file mode 100644 index 0000000..e1bab96 --- /dev/null +++ b/group_vars/prom/prometheus/node.yml @@ -0,0 +1,199 @@ +--- +prometheus__scraping_node: + targets: "{{ groups.vm_network + groups.pve_network }}" + address: + port: 9100 + +prometheus__rules_node: + - alert: OutOfMemory + expr: + 100 * ( + node_memory_MemFree_bytes + + node_memory_Cached_bytes + + node_memory_Buffers_bytes + ) / node_memory_MemTotal_bytes < 10 + for: 5m + labels: + severity: warning + annotations: + FreeMemory: !unsafe '{{ printf "%.0f" $value }} %' + - alert: HostSwapIsFillingUp + expr: + 100 * ( + 1 - ( + node_memory_SwapFree_bytes + / node_memory_SwapTotal_bytes + ) + ) >= 50 + for: 3m + labels: + severity: critical + annotations: + UsedSwap: !unsafe '{{ printf "%.0f" $value }} %' + - alert: HostPhysicalComponentTooHot + expr: + node_hwmon_temp_celsius > 79 + for: 3m + labels: + severity: critical + annotations: + Temperature: !unsafe '{{ printf "%.0f" $value }} °C' + Chip: !unsafe "{{ $labels.chip }}" + Sensor: !unsafe "{{ $labels.sensor }}" + - alert: HostNodeOvertemperatureAlarm + expr: + node_hwmon_temp_crit_alarm_celsius == 1 + for: 0m + labels: + severity: critical + annotations: + Chip: !unsafe "{{ $labels.chip }}" + Sensor: !unsafe "{{ $labels.sensor }}" + - alert: HostRaidArrayGotInactive + expr: + node_md_state{state="inactive"} > 0 + for: 0m + labels: + severity: critical + annotations: + Device: !unsafe "{{ $labels.device }}" + - alert: HostRaidDiskFailure + expr: + node_md_disks{state="failed"} > 0 + for: 0m + labels: + severity: critical + annotations: + Device: !unsafe "{{ $labels.device }}" + - alert: HostOomKillDetected + expr: + increase(node_vmstat_oom_kill[1m]) > 0 + for: 0m + labels: + severity: warning + annotations: + PID: !unsafe "{{ $value }}" + - alert: HostEdacCorrectableErrorsDetected + expr: + increase(node_edac_correctable_errors_total[1m]) > 0 + for: 0m + labels:
+ severity: warning + annotations: + CorrectedErrors: !unsafe "{{ $value }}" + - alert: HostEdacUncorrectableErrorsDetected + expr: + increase(node_edac_uncorrectable_errors_total[1m]) > 0 + for: 0m + labels: + severity: warning + annotations: + DetectedErrors: !unsafe "{{ $value }}" + - alert: OutOfDiskSpace + expr: + ( + 100 * node_filesystem_free_bytes + / node_filesystem_size_bytes < 10 + ) + and on (instance, device, mountpoint) ( + node_filesystem_readonly + ) == 0 + for: 5m + labels: + severity: critical + annotations: + Mountpoint: !unsafe '{{ $labels.mountpoint }}' + FreeSpace: !unsafe '{{ printf "%.0f" $value }} %' + - alert: HostConntrackLimit + expr: + 100 * ( + node_nf_conntrack_entries + / node_nf_conntrack_entries_limit + ) > 80 + for: 5m + labels: + severity: warning + annotations: + Filled: !unsafe '{{ printf "%.0f" $value }} %' + - alert: HostClockSkew + expr: + ( + node_timex_offset_seconds > 0.05 + and deriv(node_timex_offset_seconds[5m]) >= 0 + ) or ( + node_timex_offset_seconds < -0.05 + and deriv(node_timex_offset_seconds[5m]) <= 0 + ) + for: 2m + labels: + severity: warning + - alert: HostClockNotSynchronising + expr: + min_over_time(node_timex_sync_status[1m]) == 0 + and node_timex_maxerror_seconds >= 16 + for: 2m + labels: + severity: warning + - alert: HostRequiresReboot + expr: + node_reboot_required > 0 + for: 5m + labels: + severity: warning + - alert: OutOfInodes + expr: + 100 * node_filesystem_files_free + / node_filesystem_files < 10 + for: 3m + labels: + severity: warning + annotations: + Mountpoint: !unsafe "{{ $labels.mountpoint }}" + FreeInodes: !unsafe '{{ printf "%.0f" $value }} %' + - alert: CpuUsage + expr: + 100 * ( + 1 - avg by (instance) ( + irate(node_cpu_seconds_total{mode="idle"}[5m]) + ) + ) > 75 + for: 10m + labels: + severity: warning + annotations: + Usage: !unsafe '{{ printf "%.0f" $value }} %' + - alert: SystemdServiceFailed + expr: + node_systemd_unit_state{state="failed"} == 1 + for: 10m + labels: + severity: 
warning + annotations: + Service: !unsafe "{{ $labels.name }}" + - alert: LoadUsage + expr: + node_load1 > 5 + for: 2m + labels: + severity: warning + annotations: + Load1: !unsafe '{{ printf "%.0f" $value }}' + - alert: UnhealthyDisk + expr: + smartmon_device_smart_healthy < 1 + for: 10m + labels: + severity: critical + annotations: + Disk: !unsafe "{{ $labels.disk }}" + - alert: HostCpuStealNoisyNeighbor + expr: + 100 * avg by (instance) ( + rate(node_cpu_seconds_total{mode="steal"}[5m]) + ) > 10 + for: 5m + labels: + severity: warning + annotations: + Steal: !unsafe '{{ printf "%.0f" $value }} %' +... diff --git a/group_vars/prom/prometheus/prometheus.yml b/group_vars/prom/prometheus/prometheus.yml new file mode 100644 index 0000000..0e3b663 --- /dev/null +++ b/group_vars/prom/prometheus/prometheus.yml @@ -0,0 +1,14 @@ +--- +prometheus__scraping_prometheus: + targets: "{{ groups.prom }}" + address: + port: 9090 + +prometheus__rules_prometheus: + - alert: PrometheusTsdbCompactionFailed + expr: + increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical +...
diff --git a/group_vars/prom/prometheus/quanta.yml b/group_vars/prom/prometheus/quanta.yml new file mode 100644 index 0000000..c58b253 --- /dev/null +++ b/group_vars/prom/prometheus/quanta.yml @@ -0,0 +1,84 @@ +--- +prometheus__scraping_quanta: + targets: "{{ groups.quanta }}" + address: 127.0.0.1:9116 + path: /snmp + params: + module: + - quanta + +prometheus__rules_quanta: + - alert: QuantaQueueOverflow + expr: + snAgGblQueueOverflow == 1 + for: 0m + labels: + severity: critical + - alert: QuantaCpuUsage + expr: + snAgGblCpuUtil1MinAvg > 50 + for: 5m + labels: + severity: warning + annotations: + Usage: !unsafe "{{ $value }} %" + - alert: QuantaCpuUsage + expr: + snAgGblCpuUtil1MinAvg > 80 + for: 5m + labels: + severity: critical + annotations: + Usage: !unsafe "{{ $value }} %" + - alert: QuantaMemoryUsage + expr: + 100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 50 + for: 5m + labels: + severity: warning + annotations: + UsedMemory: !unsafe "{{ $value }} %" + - alert: QuantaMemoryUsage + expr: + 100 * (1 - (snAgGblDynMemFree / snAgGblDynMemTotal)) > 80 + for: 5m + labels: + severity: critical + annotations: + UsedMemory: !unsafe "{{ $value }} %" + - alert: QuantaFanHealth + expr: + snChasFanOperStatus{snChasFanOperStatus="normal"} == 0 + for: 0m + labels: + severity: critical + annotations: + Description: !unsafe "{{ $labels.snChasFanDescription }}" + Status: !unsafe "{{ $labels.snChasFanOperStatus }}" + - alert: QuantaTemp + expr: + 0.5 * snAgentTempValue > 45 + for: 0m + labels: + severity: warning + annotations: + Temperature: !unsafe "{{ $value }} °C" + Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}" + - alert: QuantaTemp + expr: + 0.5 * snAgentTempValue > 60 + for: 0m + labels: + severity: critical + annotations: + Temperature: !unsafe "{{ $value }} °C" + Description: !unsafe "{{ $labels.snAgentTempSensorDescr }}" + - alert: QuantaPowerRedundancyFailure + expr: + count by (instance) ( +
snChasPwrSupplyOperStatus{snChasPwrSupplyOperStatus="normal"} + ) < 2 + for: 0m + labels: + severity: warning +... diff --git a/group_vars/prom/prometheus/snmp.yml b/group_vars/prom/prometheus/snmp.yml new file mode 100644 index 0000000..e544581 --- /dev/null +++ b/group_vars/prom/prometheus/snmp.yml @@ -0,0 +1,6 @@ +--- +prometheus__scraping_snmp: + targets: "{{ groups.prom }}" + address: + port: 9116 +... diff --git a/group_vars/prom/prometheus/switch.yml b/group_vars/prom/prometheus/switch.yml new file mode 100644 index 0000000..1c4af87 --- /dev/null +++ b/group_vars/prom/prometheus/switch.yml @@ -0,0 +1,47 @@ +--- +prometheus__rules_switch: + - alert: SwitchPromiscuousChange + expr: + changes(ifPromiscuousMode[5m]) > 0 + for: 0m + labels: + severity: warning + annotations: + Interface: !unsafe "{{ $labels.ifName }} + {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" + - alert: SwitchInterfaceUpChange + expr: + changes(ifOperStatus{ifOperStatus="up"}[5m]) > 0 + for: 0m + labels: + severity: warning + annotations: + Interface: !unsafe "{{ $labels.ifName }} + {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" + - alert: SwitchInErrors + expr: + 100 * irate(ifInErrors[5m]) / ( + irate(ifInUcastPkts[5m]) + + irate(ifInNUcastPkts[5m]) + ) > 0.01 + for: 0m + labels: + severity: warning + annotations: + ErrorRate: !unsafe '{{ printf "%.2f" $value }} %' + Interface: !unsafe "{{ $labels.ifName }} + {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" + - alert: SwitchOutErrors + expr: + 100 * irate(ifOutErrors[5m]) / ( + irate(ifOutUcastPkts[5m]) + + irate(ifOutNUcastPkts[5m]) + ) > 0.01 + for: 0m + labels: + severity: warning + annotations: + ErrorRate: !unsafe '{{ printf "%.2f" $value }} %' + Interface: !unsafe "{{ $labels.ifName }} + {{ if $labels.ifAlias }}- {{ $labels.ifAlias }}{{ end }}" +... 
diff --git a/roles/bird/templates/.bird.conf.j2.swp b/roles/bird/templates/.bird.conf.j2.swp deleted file mode 100644 index bbe98ff43f61aee7d181e0ed6d9adc66d181749b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20480 zcmeI2U5MON6vwaDTCMF`D+u}^*I8C~VV&9C)>0|k2Oq?$Ra&)bOC9e_?(C*!l4+8e zw$83n{GjNQSfvVrBKoBG;6p*{n_&B(tzSjZYViZXDn*0^pY(rjZZgTtPN%bIm2$)U z$mD*Xe{N24Zc-ZCvSA~?IX}j*t-$ZWCGsH8vCXe8Vf8@zl-z5Z_JUB=?@_Q2_>F?7 z`<^cgZb7(#U2_Z%4%O15tmLRqajBGiZZvgcOvJ0^OxYO;+2MuDCR)Wgb+YnQWC z6XWVq9n4+FuUvUw&uXTCQNSo*6fg=H1&jhl0i%FXz$kG3C=k{bvBzP~h1#S?_5Omk z{YCot61|^k%U`6AuhIJ_+Vb_G*%<|l0!9I&fKk9GU=%P47zK<1MggOMQNSqhA1EL! z#^%uP94!Fg{eK+)pS+y0@4!*;33wa41zraGz|-I1=^#twlO!84!?9sqZO zL2wmV3g(tGb^?3|-T=>n5*PW#h8Dn37_rP=DVQ??l2<`yu!8jNLSAbKOF?JXn z09(Kz<#g~>;;>_dN2Tf#G=Px@Gf`>JPABt zf&0NI_;U$kzk;8@SKv4}0zLqT!1JIAZU@(bzb*#U zumMbh8^8cKjRl!6!8>3tr~?-`pa?dDO<)4#z|UBqc^|w0ror`q&iO~+V{icYK!S(B zCcr^+h*vzH3(NAk?Q;4|qgeEX)yVsn8w7lYTe2eRPB^@6`v@PJr3%uus16P|w%K)^ zX_QXY9o}qm%c}}BJ%f5X>s~1NY*dBX(0RyKeMX1HKnlM+o$gx?v*yc+y_*KAibsdC zbCi7+XEfd}TT6hJHpQygU9{a$`VHZXD0|}C$idh!!YGsMS3^zLU{qL@sED$pBDVUI zR){y_Q={@mRV!q8QZLBc_?{*hj?YRU!&D}pOrOhErIe`6Syi>>d!bkMoTvxYtzAh~ z=5|1Hz;>lYr#ySIGqa&>KA;snMaW*cyXN^JckHSilF6m1T3#Ild7Y)F1$Ew$mmLGO zr?(l{Y)Lm9UQ6eJ#mU&uL^ec?58JiI>XB@AYGNlDdSFo{ZEDsIyV}(boi(o=I&a~2 zL=E(5M^s5!2g@4L4{XncPMBT1ZJXg0+Yds@6hVA~Txm~Dmpng@Br>b54m8$ZdFAV& z?O;aM$dL@|jv~F|HagoxQ5gi;M&})djAI4a?as*EQN z{g3311jAHUml!4zF0NS1$w{80xs}ttk(^qbhpTm4lsT#jt;uaqm7?}XX{ysz9r+fT z_o<6IGymGIw9Fx$mTqo1x%A1t`TPtI!g^_)u1~{{3Sm&fmm*KODW}oxE}(}%yLhvt zu=vDScGm0?xctZ45`7u^m`LOqZps#)mX71`iELuHrzUyH_k>jzLDnXz&NJ~Fbh2o2(7qU7_vBxjJV$A{9^NEV53Zy! zk;C=J$)j-T1QIE$V%PGIc^|sNCT9LixA#Y+?@FiZ9e&>Ig`eiW=dkX-kNKM&amU=9 zH+$g*^*wKO|7V!9(H*1Zfok+LK<*P6NYxGK)RZf{RbU>*51DxMZ?)jyG2LyeuhPz{ zcGMvc1wIm|>qQShC$}D=vguh#lj~5|LNn+77OwkF?gM(j=^J_UU>cN#%R?KF7<2H;{W*l{V#~;Pk^sMyauoz|Lb4}pt%2TFaS;=rl;8d8*mI91qZ+$ z;DW8-9