prometheus: use enums for iLO SNMP
continuous-integration/drone/push Build is passing Details
continuous-integration/drone/pr Build is passing Details

This commit is contained in:
Alexandre Iooss 2022-01-01 11:59:23 +01:00
parent 9b53daf42a
commit 5ab3dcdac2
3 changed files with 19 additions and 176 deletions

View File

@ -1,106 +0,0 @@
# {{ ansible_managed }}
{# As this is also Jinja2 it will conflict without a raw block #}
{% raw %}
# Recording rules that pre-aggregate the django_* metrics.
# Rule names follow the level:metric:operations convention; the "job:" level
# means the recorded series keeps the job label.
groups:
  - name: django.rules
    rules:
      # HTTP request/response counters: per-job rate over a 30s window.
      - record: job:django_http_requests_before_middlewares_total:sum_rate30s
        expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job)
      - record: job:django_http_requests_unknown_latency_total:sum_rate30s
        expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job)
      - record: job:django_http_ajax_requests_total:sum_rate30s
        expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job)
      - record: job:django_http_responses_before_middlewares_total:sum_rate30s
        expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job)
      - record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s
        expr: >-
          sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s]))
          BY (job)
      - record: job:django_http_requests_body_total_bytes:sum_rate30s
        expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job)
      - record: job:django_http_responses_streaming_total:sum_rate30s
        expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job)
      - record: job:django_http_responses_body_total_bytes:sum_rate30s
        expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job)

      # The per-job request total is derived from the by-method counter,
      # summed over every method.
      - record: job:django_http_requests_total:sum_rate30s
        expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job)
      - record: job:django_http_requests_total_by_method:sum_rate30s
        expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method)
      - record: job:django_http_requests_total_by_transport:sum_rate30s
        expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport)
      # The per-view total is derived from the view/transport/method counter.
      - record: job:django_http_requests_total_by_view:sum_rate30s
        expr: >-
          sum(rate(django_http_requests_total_by_view_transport_method[30s]))
          BY (job, view)
      - record: job:django_http_requests_total_by_view_transport_method:sum_rate30s
        expr: >-
          sum(rate(django_http_requests_total_by_view_transport_method[30s]))
          BY (job, view, transport, method)
      - record: job:django_http_responses_total_by_templatename:sum_rate30s
        expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename)
      - record: job:django_http_responses_total_by_status:sum_rate30s
        expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status)
      - record: job:django_http_responses_total_by_charset:sum_rate30s
        expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset)
      - record: job:django_http_exceptions_total_by_type:sum_rate30s
        expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type)
      - record: job:django_http_exceptions_total_by_view:sum_rate30s
        expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view)

      # Latency quantiles (middlewares included). All four rules share one
      # record name and are told apart by the static "quantile" label.
      - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
        expr: >-
          histogram_quantile(0.5,
          sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
          BY (job, le))
        labels:
          quantile: "50"
      - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
        expr: >-
          histogram_quantile(0.95,
          sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
          BY (job, le))
        labels:
          quantile: "95"
      - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
        expr: >-
          histogram_quantile(0.99,
          sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
          BY (job, le))
        labels:
          quantile: "99"
      - record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
        expr: >-
          histogram_quantile(0.999,
          sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
          BY (job, le))
        labels:
          quantile: "99.9"

      # Latency quantiles from the django_http_requests_latency_seconds
      # histogram, using the same "quantile" label scheme.
      - record: job:django_http_requests_latency_seconds:quantile_rate30s
        expr: >-
          histogram_quantile(0.5,
          sum(rate(django_http_requests_latency_seconds_bucket[30s]))
          BY (job, le))
        labels:
          quantile: "50"
      - record: job:django_http_requests_latency_seconds:quantile_rate30s
        expr: >-
          histogram_quantile(0.95,
          sum(rate(django_http_requests_latency_seconds_bucket[30s]))
          BY (job, le))
        labels:
          quantile: "95"
      - record: job:django_http_requests_latency_seconds:quantile_rate30s
        expr: >-
          histogram_quantile(0.99,
          sum(rate(django_http_requests_latency_seconds_bucket[30s]))
          BY (job, le))
        labels:
          quantile: "99"
      - record: job:django_http_requests_latency_seconds:quantile_rate30s
        expr: >-
          histogram_quantile(0.999,
          sum(rate(django_http_requests_latency_seconds_bucket[30s]))
          BY (job, le))
        labels:
          quantile: "99.9"

      # Model operations: per-job, per-model rate over a 1m window.
      - record: job:django_model_inserts_total:sum_rate1m
        expr: sum(rate(django_model_inserts_total[1m])) BY (job, model)
      - record: job:django_model_updates_total:sum_rate1m
        expr: sum(rate(django_model_updates_total[1m])) BY (job, model)
      - record: job:django_model_deletes_total:sum_rate1m
        expr: sum(rate(django_model_deletes_total[1m])) BY (job, model)

      # Database counters. "job" is kept in the grouping so the output matches
      # the "job:" level in the record name; grouping on (alias, vendor) alone
      # would sum together every job that shares an alias and vendor.
      - record: job:django_db_new_connections_total:sum_rate30s
        expr: sum(rate(django_db_new_connections_total[30s])) BY (job, alias, vendor)
      - record: job:django_db_new_connection_errors_total:sum_rate30s
        expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (job, alias, vendor)
      - record: job:django_db_execute_total:sum_rate30s
        expr: sum(rate(django_db_execute_total[30s])) BY (job, alias, vendor)
      - record: job:django_db_execute_many_total:sum_rate30s
        expr: sum(rate(django_db_execute_many_total[30s])) BY (job, alias, vendor)
      - record: job:django_db_errors_total:sum_rate30s
        expr: sum(rate(django_db_errors_total[30s])) BY (job, alias, vendor, type)

      # Migration gauges: maximum per job and connection.
      - record: job:django_migrations_applied_total:max
        expr: max(django_migrations_applied_total) BY (job, connection)
      - record: job:django_migrations_unapplied_total:max
        expr: max(django_migrations_unapplied_total) BY (job, connection)
{% endraw %}

View File

@ -11,48 +11,53 @@ groups:
rules: rules:
- alert: IloBiosSelfTestDegraded - alert: IloBiosSelfTestDegraded
expr: cpqHeHWBiosCondition >= 3 expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
for: 3m for: 3m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: >- summary: >-
Une erreur a été détectée lors du POST du serveur Une erreur a été détectée lors du POST du serveur
({{ raw('$labels.cpqHeHWBiosCondition') }})
- alert: IloTemperatureSensorDegraded - alert: IloTemperatureSensorDegraded
expr: cpqHeTemperatureCondition >= 3 expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
for: 3m for: 3m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: >- summary: >-
Le capteur de température est dégradé Le capteur de température est dégradé
({{ raw('$labels.cpqHeTemperatureCondition') }})
- alert: IloFanDegraded - alert: IloFanDegraded
expr: cpqHeFltTolFanCondition >= 3 expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1
for: 3m for: 3m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: >- summary: >-
Le ventilateur est dégradé Le ventilateur est dégradé
({{ raw('$labels.cpqHeFltTolFanCondition') }})
- alert: IloPowerSupplyDegraded - alert: IloPowerSupplyDegraded
expr: cpqHeFltTolPowerSupplyStatus >= 2 expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1
for: 3m for: 3m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: >- summary: >-
L'alimentation est dégradée L'alimentation est dégradée
({{ raw('$labels.cpqHeFltTolPowerSupplyStatus') }})
- alert: IloOverrideSwitchState - alert: IloOverrideSwitchState
expr: cpqSm2CntlriLOSecurityOverrideSwitchState == 2 expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1
for: 3m for: 3m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: >- summary: >-
Le switch de réinitialisation n'est pas à l'état d'origine Le switch de réinitialisation n'est pas à l'état d'origine,
l'authentification est bypassée
... ...

View File

@ -497,7 +497,7 @@ ilo:
help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3 help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
- name: cpqHeHWBiosCondition - name: cpqHeHWBiosCondition
oid: 1.3.6.1.4.1.232.6.2.16.1 oid: 1.3.6.1.4.1.232.6.2.16.1
type: gauge type: EnumAsStateSet
help: This value indicates an error has been detected during Pre-OS Test (POST) help: This value indicates an error has been detected during Pre-OS Test (POST)
or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1 or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
enum_values: enum_values:
@ -505,25 +505,6 @@ ilo:
2: ok 2: ok
3: degraded 3: degraded
4: failed 4: failed
- name: cpqHeTemperatureChassis
oid: 1.3.6.1.4.1.232.6.2.6.8.1.1
type: gauge
help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.8.1.1
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
- name: cpqHeTemperatureIndex
oid: 1.3.6.1.4.1.232.6.2.6.8.1.2
type: gauge
help: A number that uniquely specifies this temperature sensor description. -
1.3.6.1.4.1.232.6.2.6.8.1.2
indexes:
- labelname: cpqHeTemperatureChassis
type: gauge
- labelname: cpqHeTemperatureIndex
type: gauge
- name: cpqHeTemperatureLocale - name: cpqHeTemperatureLocale
oid: 1.3.6.1.4.1.232.6.2.6.8.1.3 oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
type: EnumAsInfo type: EnumAsInfo
@ -569,7 +550,7 @@ ilo:
type: gauge type: gauge
- name: cpqHeTemperatureCondition - name: cpqHeTemperatureCondition
oid: 1.3.6.1.4.1.232.6.2.6.8.1.6 oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
type: gauge type: EnumAsStateSet
help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6 help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
indexes: indexes:
- labelname: cpqHeTemperatureChassis - labelname: cpqHeTemperatureChassis
@ -581,25 +562,6 @@ ilo:
2: ok 2: ok
3: degraded 3: degraded
4: failed 4: failed
- name: cpqHeFltTolFanChassis
oid: 1.3.6.1.4.1.232.6.2.6.7.1.1
type: gauge
help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.7.1.1
indexes:
- labelname: cpqHeFltTolFanChassis
type: gauge
- labelname: cpqHeFltTolFanIndex
type: gauge
- name: cpqHeFltTolFanIndex
oid: 1.3.6.1.4.1.232.6.2.6.7.1.2
type: gauge
help: A number that uniquely specifies this fan description. -
1.3.6.1.4.1.232.6.2.6.7.1.2
indexes:
- labelname: cpqHeFltTolFanChassis
type: gauge
- labelname: cpqHeFltTolFanIndex
type: gauge
- name: cpqHeFltTolFanLocale - name: cpqHeFltTolFanLocale
oid: 1.3.6.1.4.1.232.6.2.6.7.1.3 oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
type: EnumAsInfo type: EnumAsInfo
@ -626,7 +588,7 @@ ilo:
13: bridgeCard 13: bridgeCard
- name: cpqHeFltTolFanCondition - name: cpqHeFltTolFanCondition
oid: 1.3.6.1.4.1.232.6.2.6.7.1.9 oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
type: gauge type: EnumAsStateSet
help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9 help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
indexes: indexes:
- labelname: cpqHeFltTolFanChassis - labelname: cpqHeFltTolFanChassis
@ -638,27 +600,9 @@ ilo:
2: ok 2: ok
3: degraded 3: degraded
4: failed 4: failed
- name: cpqHeFltTolPowerSupplyChassis
oid: 1.3.6.1.4.1.232.6.2.9.3.1.1
type: gauge
help: The system chassis number. - 1.3.6.1.4.1.232.6.2.9.3.1.1
indexes:
- labelname: cpqHeFltTolPowerSupplyChassis
type: gauge
- labelname: cpqHeFltTolPowerSupplyBay
type: gauge
- name: cpqHeFltTolPowerSupplyBay
oid: 1.3.6.1.4.1.232.6.2.9.3.1.2
type: gauge
help: The bay number to index within this chassis. - 1.3.6.1.4.1.232.6.2.9.3.1.2
indexes:
- labelname: cpqHeFltTolPowerSupplyChassis
type: gauge
- labelname: cpqHeFltTolPowerSupplyBay
type: gauge
- name: cpqHeFltTolPowerSupplyStatus - name: cpqHeFltTolPowerSupplyStatus
oid: 1.3.6.1.4.1.232.6.2.9.3.1.5 oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
type: gauge type: EnumAsStateSet
help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5 help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
indexes: indexes:
- labelname: cpqHeFltTolPowerSupplyChassis - labelname: cpqHeFltTolPowerSupplyChassis
@ -685,7 +629,7 @@ ilo:
17: noPowerInput 17: noPowerInput
- name: cpqSm2CntlrInterfaceStatus - name: cpqSm2CntlrInterfaceStatus
oid: 1.3.6.1.4.1.232.9.2.2.17 oid: 1.3.6.1.4.1.232.9.2.2.17
type: gauge type: EnumAsStateSet
help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17 help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
enum_values: enum_values:
1: other 1: other
@ -693,7 +637,7 @@ ilo:
3: notResponding 3: notResponding
- name: cpqSm2CntlriLOSecurityOverrideSwitchState - name: cpqSm2CntlriLOSecurityOverrideSwitchState
oid: 1.3.6.1.4.1.232.9.2.2.27 oid: 1.3.6.1.4.1.232.9.2.2.27
type: gauge type: EnumAsStateSet
help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27 help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
enum_values: enum_values:
1: notSupported 1: notSupported
@ -701,7 +645,7 @@ ilo:
3: notSet 3: notSet
- name: cpqSm2CntlrLicenseActive - name: cpqSm2CntlrLicenseActive
oid: 1.3.6.1.4.1.232.9.2.2.30 oid: 1.3.6.1.4.1.232.9.2.2.30
type: gauge type: EnumAsStateSet
help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30 help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
enum_values: enum_values:
1: none 1: none
@ -714,7 +658,7 @@ ilo:
8: iloAdvancedPremiumSecurity 8: iloAdvancedPremiumSecurity
- name: cpqSm2CntlrServerPowerState - name: cpqSm2CntlrServerPowerState
oid: 1.3.6.1.4.1.232.9.2.2.32 oid: 1.3.6.1.4.1.232.9.2.2.32
type: gauge type: EnumAsStateSet
help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32 help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
enum_values: enum_values:
1: unknown 1: unknown