diff --git a/roles/prometheus/templates/django.rules.yml.j2 b/roles/prometheus/templates/django.rules.yml.j2 deleted file mode 100644 index fddd398..0000000 --- a/roles/prometheus/templates/django.rules.yml.j2 +++ /dev/null @@ -1,106 +0,0 @@ -# {{ ansible_managed }} -{# As this is also Jinja2 it will conflict without a raw block #} -{% raw %} -groups: -- name: django.rules - rules: - - record: job:django_http_requests_before_middlewares_total:sum_rate30s - expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job) - - record: job:django_http_requests_unknown_latency_total:sum_rate30s - expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job) - - record: job:django_http_ajax_requests_total:sum_rate30s - expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job) - - record: job:django_http_responses_before_middlewares_total:sum_rate30s - expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job) - - record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s - expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s])) - BY (job) - - record: job:django_http_requests_body_total_bytes:sum_rate30s - expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job) - - record: job:django_http_responses_streaming_total:sum_rate30s - expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job) - - record: job:django_http_responses_body_total_bytes:sum_rate30s - expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job) - - record: job:django_http_requests_total:sum_rate30s - expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job) - - record: job:django_http_requests_total_by_method:sum_rate30s - expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method) - - record: job:django_http_requests_total_by_transport:sum_rate30s - expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport) - - record: job:django_http_requests_total_by_view:sum_rate30s - expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, - view) - - record: job:django_http_requests_total_by_view_transport_method:sum_rate30s - expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job, - view, transport, method) - - record: job:django_http_responses_total_by_templatename:sum_rate30s - expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename) - - record: job:django_http_responses_total_by_status:sum_rate30s - expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status) - - record: job:django_http_responses_total_by_charset:sum_rate30s - expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset) - - record: job:django_http_exceptions_total_by_type:sum_rate30s - expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type) - - record: job:django_http_exceptions_total_by_view:sum_rate30s - expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view) - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "50" - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "95" - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99" - - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s - expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99.9" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "50" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "95" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99" - - record: job:django_http_requests_latency_seconds:quantile_rate30s - expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s])) - BY (job, le)) - labels: - quantile: "99.9" - - record: job:django_model_inserts_total:sum_rate1m - expr: sum(rate(django_model_inserts_total[1m])) BY (job, model) - - record: job:django_model_updates_total:sum_rate1m - expr: sum(rate(django_model_updates_total[1m])) BY (job, model) - - record: job:django_model_deletes_total:sum_rate1m - expr: sum(rate(django_model_deletes_total[1m])) BY (job, model) - - record: job:django_db_new_connections_total:sum_rate30s - expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor) - - record: job:django_db_new_connection_errors_total:sum_rate30s - expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor) - - record: job:django_db_execute_total:sum_rate30s - expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor) - - record: job:django_db_execute_many_total:sum_rate30s - expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor) - - record: job:django_db_errors_total:sum_rate30s - expr: sum(rate(django_db_errors_total[30s])) BY (alias, vendor, type) - - record: job:django_migrations_applied_total:max - expr: max(django_migrations_applied_total) BY (job, connection) - - record: job:django_migrations_unapplied_total:max - expr: max(django_migrations_unapplied_total) BY (job, connection) -{% endraw %} diff --git a/roles/prometheus/templates/ilo.rules.yml.j2 b/roles/prometheus/templates/ilo.rules.yml.j2 index 4a0bc37..47a3e8f 100644 --- a/roles/prometheus/templates/ilo.rules.yml.j2 +++ b/roles/prometheus/templates/ilo.rules.yml.j2 @@ -11,48 +11,53 @@ groups: rules: - alert: IloBiosSelfTestDegraded - expr: cpqHeHWBiosCondition >= 3 + expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1 for: 3m labels: severity: critical annotations: summary: >- Une erreur a été détectée lors du POST du serveur + ({{ raw('$labels.cpqHeHWBiosCondition') }}) - alert: IloTemperatureSensorDegraded - expr: cpqHeTemperatureCondition >= 3 + expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1 for: 3m labels: severity: critical annotations: summary: >- Le capteur de température est dégradé + ({{ raw('$labels.cpqHeTemperatureCondition') }}) - alert: IloFanDegraded - expr: cpqHeFltTolFanCondition >= 3 + expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1 for: 3m labels: severity: critical annotations: summary: >- Le ventilateur est dégradé + ({{ raw('$labels.cpqHeFltTolFanCondition') }}) - alert: IloPowerSupplyDegraded - expr: cpqHeFltTolPowerSupplyStatus >= 2 + expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1 for: 3m labels: severity: critical annotations: summary: >- L'alimentation est dégradée + ({{ raw('$labels.cpqHeFltTolPowerSupplyStatus') }}) - alert: IloOverrideSwitchState - expr: cpqSm2CntlriLOSecurityOverrideSwitchState == 2 + expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1 for: 3m labels: severity: critical annotations: summary: >- - Le switch de réinitialisation n'est pas à l'état d'origine + Le switch de réinitialisation n'est pas à l'état d'origine, + l'authentification est bypassée ... diff --git a/roles/prometheus/templates/snmp.yml.j2 b/roles/prometheus/templates/snmp.yml.j2 index d593a3d..1171d4c 100644 --- a/roles/prometheus/templates/snmp.yml.j2 +++ b/roles/prometheus/templates/snmp.yml.j2 @@ -497,7 +497,7 @@ ilo: help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3 - name: cpqHeHWBiosCondition oid: 1.3.6.1.4.1.232.6.2.16.1 - type: gauge + type: EnumAsStateSet help: This value indicates an error has been detected during Pre-OS Test (POST) or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1 enum_values: @@ -505,25 +505,6 @@ ilo: 2: ok 3: degraded 4: failed - - name: cpqHeTemperatureChassis - oid: 1.3.6.1.4.1.232.6.2.6.8.1.1 - type: gauge - help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.8.1.1 - indexes: - - labelname: cpqHeTemperatureChassis - type: gauge - - labelname: cpqHeTemperatureIndex - type: gauge - - name: cpqHeTemperatureIndex - oid: 1.3.6.1.4.1.232.6.2.6.8.1.2 - type: gauge - help: A number that uniquely specifies this temperature sensor description. - - 1.3.6.1.4.1.232.6.2.6.8.1.2 - indexes: - - labelname: cpqHeTemperatureChassis - type: gauge - - labelname: cpqHeTemperatureIndex - type: gauge - name: cpqHeTemperatureLocale oid: 1.3.6.1.4.1.232.6.2.6.8.1.3 type: EnumAsInfo @@ -569,7 +550,7 @@ ilo: type: gauge - name: cpqHeTemperatureCondition oid: 1.3.6.1.4.1.232.6.2.6.8.1.6 - type: gauge + type: EnumAsStateSet help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6 indexes: - labelname: cpqHeTemperatureChassis @@ -581,25 +562,6 @@ ilo: 2: ok 3: degraded 4: failed - - name: cpqHeFltTolFanChassis - oid: 1.3.6.1.4.1.232.6.2.6.7.1.1 - type: gauge - help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.7.1.1 - indexes: - - labelname: cpqHeFltTolFanChassis - type: gauge - - labelname: cpqHeFltTolFanIndex - type: gauge - - name: cpqHeFltTolFanIndex - oid: 1.3.6.1.4.1.232.6.2.6.7.1.2 - type: gauge - help: A number that uniquely specifies this fan description. - - 1.3.6.1.4.1.232.6.2.6.7.1.2 - indexes: - - labelname: cpqHeFltTolFanChassis - type: gauge - - labelname: cpqHeFltTolFanIndex - type: gauge - name: cpqHeFltTolFanLocale oid: 1.3.6.1.4.1.232.6.2.6.7.1.3 type: EnumAsInfo @@ -626,7 +588,7 @@ ilo: 13: bridgeCard - name: cpqHeFltTolFanCondition oid: 1.3.6.1.4.1.232.6.2.6.7.1.9 - type: gauge + type: EnumAsStateSet help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9 indexes: - labelname: cpqHeFltTolFanChassis @@ -638,27 +600,9 @@ ilo: 2: ok 3: degraded 4: failed - - name: cpqHeFltTolPowerSupplyChassis - oid: 1.3.6.1.4.1.232.6.2.9.3.1.1 - type: gauge - help: The system chassis number. - 1.3.6.1.4.1.232.6.2.9.3.1.1 - indexes: - - labelname: cpqHeFltTolPowerSupplyChassis - type: gauge - - labelname: cpqHeFltTolPowerSupplyBay - type: gauge - - name: cpqHeFltTolPowerSupplyBay - oid: 1.3.6.1.4.1.232.6.2.9.3.1.2 - type: gauge - help: The bay number to index within this chassis. - 1.3.6.1.4.1.232.6.2.9.3.1.2 - indexes: - - labelname: cpqHeFltTolPowerSupplyChassis - type: gauge - - labelname: cpqHeFltTolPowerSupplyBay - type: gauge - name: cpqHeFltTolPowerSupplyStatus oid: 1.3.6.1.4.1.232.6.2.9.3.1.5 - type: gauge + type: EnumAsStateSet help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5 indexes: - labelname: cpqHeFltTolPowerSupplyChassis @@ -685,7 +629,7 @@ ilo: 17: noPowerInput - name: cpqSm2CntlrInterfaceStatus oid: 1.3.6.1.4.1.232.9.2.2.17 - type: gauge + type: EnumAsStateSet help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17 enum_values: 1: other @@ -693,7 +637,7 @@ ilo: 3: notResponding - name: cpqSm2CntlriLOSecurityOverrideSwitchState oid: 1.3.6.1.4.1.232.9.2.2.27 - type: gauge + type: EnumAsStateSet help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27 enum_values: 1: notSupported @@ -701,7 +645,7 @@ ilo: 3: notSet - name: cpqSm2CntlrLicenseActive oid: 1.3.6.1.4.1.232.9.2.2.30 - type: gauge + type: EnumAsStateSet help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30 enum_values: 1: none @@ -714,7 +658,7 @@ ilo: 8: iloAdvancedPremiumSecurity - name: cpqSm2CntlrServerPowerState oid: 1.3.6.1.4.1.232.9.2.2.32 - type: gauge + type: EnumAsStateSet help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32 enum_values: 1: unknown