Prometheus: cleanup #90
3 changed files with 19 additions and 176 deletions
|
@ -1,106 +0,0 @@
|
||||||
# {{ ansible_managed }}
|
|
||||||
{# As this is also Jinja2 it will conflict without a raw block #}
|
|
||||||
{% raw %}
|
|
||||||
groups:
|
|
||||||
- name: django.rules
|
|
||||||
rules:
|
|
||||||
- record: job:django_http_requests_before_middlewares_total:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_requests_before_middlewares_total[30s])) BY (job)
|
|
||||||
- record: job:django_http_requests_unknown_latency_total:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_requests_unknown_latency_total[30s])) BY (job)
|
|
||||||
- record: job:django_http_ajax_requests_total:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_ajax_requests_total[30s])) BY (job)
|
|
||||||
- record: job:django_http_responses_before_middlewares_total:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_responses_before_middlewares_total[30s])) BY (job)
|
|
||||||
- record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s]))
|
|
||||||
BY (job)
|
|
||||||
- record: job:django_http_requests_body_total_bytes:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_requests_body_total_bytes[30s])) BY (job)
|
|
||||||
- record: job:django_http_responses_streaming_total:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_responses_streaming_total[30s])) BY (job)
|
|
||||||
- record: job:django_http_responses_body_total_bytes:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_responses_body_total_bytes[30s])) BY (job)
|
|
||||||
- record: job:django_http_requests_total:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job)
|
|
||||||
- record: job:django_http_requests_total_by_method:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_requests_total_by_method[30s])) BY (job, method)
|
|
||||||
- record: job:django_http_requests_total_by_transport:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_requests_total_by_transport[30s])) BY (job, transport)
|
|
||||||
- record: job:django_http_requests_total_by_view:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job,
|
|
||||||
view)
|
|
||||||
- record: job:django_http_requests_total_by_view_transport_method:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) BY (job,
|
|
||||||
view, transport, method)
|
|
||||||
- record: job:django_http_responses_total_by_templatename:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_responses_total_by_templatename[30s])) BY (job, templatename)
|
|
||||||
- record: job:django_http_responses_total_by_status:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_responses_total_by_status[30s])) BY (job, status)
|
|
||||||
- record: job:django_http_responses_total_by_charset:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_responses_total_by_charset[30s])) BY (job, charset)
|
|
||||||
- record: job:django_http_exceptions_total_by_type:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_exceptions_total_by_type[30s])) BY (job, type)
|
|
||||||
- record: job:django_http_exceptions_total_by_view:sum_rate30s
|
|
||||||
expr: sum(rate(django_http_exceptions_total_by_view[30s])) BY (job, view)
|
|
||||||
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
|
|
||||||
expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
|
|
||||||
BY (job, le))
|
|
||||||
labels:
|
|
||||||
quantile: "50"
|
|
||||||
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
|
|
||||||
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
|
|
||||||
BY (job, le))
|
|
||||||
labels:
|
|
||||||
quantile: "95"
|
|
||||||
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
|
|
||||||
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
|
|
||||||
BY (job, le))
|
|
||||||
labels:
|
|
||||||
quantile: "99"
|
|
||||||
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
|
|
||||||
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s]))
|
|
||||||
BY (job, le))
|
|
||||||
labels:
|
|
||||||
quantile: "99.9"
|
|
||||||
- record: job:django_http_requests_latency_seconds:quantile_rate30s
|
|
||||||
expr: histogram_quantile(0.5, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
|
|
||||||
BY (job, le))
|
|
||||||
labels:
|
|
||||||
quantile: "50"
|
|
||||||
- record: job:django_http_requests_latency_seconds:quantile_rate30s
|
|
||||||
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
|
|
||||||
BY (job, le))
|
|
||||||
labels:
|
|
||||||
quantile: "95"
|
|
||||||
- record: job:django_http_requests_latency_seconds:quantile_rate30s
|
|
||||||
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
|
|
||||||
BY (job, le))
|
|
||||||
labels:
|
|
||||||
quantile: "99"
|
|
||||||
- record: job:django_http_requests_latency_seconds:quantile_rate30s
|
|
||||||
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s]))
|
|
||||||
BY (job, le))
|
|
||||||
labels:
|
|
||||||
quantile: "99.9"
|
|
||||||
- record: job:django_model_inserts_total:sum_rate1m
|
|
||||||
expr: sum(rate(django_model_inserts_total[1m])) BY (job, model)
|
|
||||||
- record: job:django_model_updates_total:sum_rate1m
|
|
||||||
expr: sum(rate(django_model_updates_total[1m])) BY (job, model)
|
|
||||||
- record: job:django_model_deletes_total:sum_rate1m
|
|
||||||
expr: sum(rate(django_model_deletes_total[1m])) BY (job, model)
|
|
||||||
- record: job:django_db_new_connections_total:sum_rate30s
|
|
||||||
expr: sum(rate(django_db_new_connections_total[30s])) BY (alias, vendor)
|
|
||||||
- record: job:django_db_new_connection_errors_total:sum_rate30s
|
|
||||||
expr: sum(rate(django_db_new_connection_errors_total[30s])) BY (alias, vendor)
|
|
||||||
- record: job:django_db_execute_total:sum_rate30s
|
|
||||||
expr: sum(rate(django_db_execute_total[30s])) BY (alias, vendor)
|
|
||||||
- record: job:django_db_execute_many_total:sum_rate30s
|
|
||||||
expr: sum(rate(django_db_execute_many_total[30s])) BY (alias, vendor)
|
|
||||||
- record: job:django_db_errors_total:sum_rate30s
|
|
||||||
expr: sum(rate(django_db_errors_total[30s])) BY (alias, vendor, type)
|
|
||||||
- record: job:django_migrations_applied_total:max
|
|
||||||
expr: max(django_migrations_applied_total) BY (job, connection)
|
|
||||||
- record: job:django_migrations_unapplied_total:max
|
|
||||||
expr: max(django_migrations_unapplied_total) BY (job, connection)
|
|
||||||
{% endraw %}
|
|
|
@ -11,48 +11,53 @@ groups:
|
||||||
rules:
|
rules:
|
||||||
|
|
||||||
- alert: IloBiosSelfTestDegraded
|
- alert: IloBiosSelfTestDegraded
|
||||||
expr: cpqHeHWBiosCondition >= 3
|
expr: cpqHeHWBiosCondition{cpqHeHWBiosCondition!~"ok|other"} == 1
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: >-
|
summary: >-
|
||||||
Une erreur a été détectée lors du POST du serveur
|
Une erreur a été détectée lors du POST du serveur
|
||||||
|
({{ raw('$labels.cpqHeHWBiosCondition') }})
|
||||||
|
|
||||||
- alert: IloTemperatureSensorDegraded
|
- alert: IloTemperatureSensorDegraded
|
||||||
expr: cpqHeTemperatureCondition >= 3
|
expr: cpqHeTemperatureCondition{cpqHeTemperatureCondition!~"ok|other"} == 1
|
||||||
jeltz marked this conversation as resolved
|
|||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: >-
|
summary: >-
|
||||||
Le capteur de température est dégradé
|
Le capteur de température est dégradé
|
||||||
|
({{ raw('$labels.cpqHeTemperatureCondition') }})
|
||||||
|
|
||||||
- alert: IloFanDegraded
|
- alert: IloFanDegraded
|
||||||
expr: cpqHeFltTolFanCondition >= 3
|
expr: cpqHeFltTolFanCondition{cpqHeFltTolFanCondition!~"ok|other"} == 1
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: >-
|
summary: >-
|
||||||
Le ventilateur est dégradé
|
Le ventilateur est dégradé
|
||||||
|
({{ raw('$labels.cpqHeFltTolFanCondition') }})
|
||||||
|
|
||||||
- alert: IloPowerSupplyDegraded
|
- alert: IloPowerSupplyDegraded
|
||||||
expr: cpqHeFltTolPowerSupplyStatus >= 2
|
expr: cpqHeFltTolPowerSupplyStatus{cpqHeFltTolPowerSupplyStatus!="noError"} == 1
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: >-
|
summary: >-
|
||||||
L'alimentation est dégradée
|
L'alimentation est dégradée
|
||||||
|
({{ raw('$labels.cpqHeFltTolPowerSupplyStatus') }})
|
||||||
|
|
||||||
- alert: IloOverrideSwitchState
|
- alert: IloOverrideSwitchState
|
||||||
expr: cpqSm2CntlriLOSecurityOverrideSwitchState == 2
|
expr: cpqSm2CntlriLOSecurityOverrideSwitchState{cpqSm2CntlriLOSecurityOverrideSwitchState="set"} == 1
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: >-
|
summary: >-
|
||||||
Le switch de réinitialisation n'est pas à l'état d'origine
|
Le switch de réinitialisation n'est pas à l'état d'origine,
|
||||||
|
l'authentification est bypassée
|
||||||
|
|
||||||
...
|
...
|
||||||
|
|
|
@ -497,7 +497,7 @@ ilo:
|
||||||
help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
|
help: This is the current Power Meter reading in Watts - 1.3.6.1.4.1.232.6.2.15.3
|
||||||
- name: cpqHeHWBiosCondition
|
- name: cpqHeHWBiosCondition
|
||||||
oid: 1.3.6.1.4.1.232.6.2.16.1
|
oid: 1.3.6.1.4.1.232.6.2.16.1
|
||||||
type: gauge
|
type: EnumAsStateSet
|
||||||
help: This value indicates an error has been detected during Pre-OS Test (POST)
|
help: This value indicates an error has been detected during Pre-OS Test (POST)
|
||||||
or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
|
or during initial hardware initialization - 1.3.6.1.4.1.232.6.2.16.1
|
||||||
enum_values:
|
enum_values:
|
||||||
|
@ -505,25 +505,6 @@ ilo:
|
||||||
2: ok
|
2: ok
|
||||||
3: degraded
|
3: degraded
|
||||||
4: failed
|
4: failed
|
||||||
- name: cpqHeTemperatureChassis
|
|
||||||
oid: 1.3.6.1.4.1.232.6.2.6.8.1.1
|
|
||||||
type: gauge
|
|
||||||
help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.8.1.1
|
|
||||||
indexes:
|
|
||||||
- labelname: cpqHeTemperatureChassis
|
|
||||||
type: gauge
|
|
||||||
- labelname: cpqHeTemperatureIndex
|
|
||||||
type: gauge
|
|
||||||
- name: cpqHeTemperatureIndex
|
|
||||||
oid: 1.3.6.1.4.1.232.6.2.6.8.1.2
|
|
||||||
type: gauge
|
|
||||||
help: A number that uniquely specifies this temperature sensor description. -
|
|
||||||
1.3.6.1.4.1.232.6.2.6.8.1.2
|
|
||||||
indexes:
|
|
||||||
- labelname: cpqHeTemperatureChassis
|
|
||||||
type: gauge
|
|
||||||
- labelname: cpqHeTemperatureIndex
|
|
||||||
type: gauge
|
|
||||||
- name: cpqHeTemperatureLocale
|
- name: cpqHeTemperatureLocale
|
||||||
oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
|
oid: 1.3.6.1.4.1.232.6.2.6.8.1.3
|
||||||
type: EnumAsInfo
|
type: EnumAsInfo
|
||||||
|
@ -569,7 +550,7 @@ ilo:
|
||||||
type: gauge
|
type: gauge
|
||||||
- name: cpqHeTemperatureCondition
|
- name: cpqHeTemperatureCondition
|
||||||
oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
|
oid: 1.3.6.1.4.1.232.6.2.6.8.1.6
|
||||||
type: gauge
|
type: EnumAsStateSet
|
||||||
help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
|
help: The Temperature sensor condition - 1.3.6.1.4.1.232.6.2.6.8.1.6
|
||||||
indexes:
|
indexes:
|
||||||
- labelname: cpqHeTemperatureChassis
|
- labelname: cpqHeTemperatureChassis
|
||||||
|
@ -581,25 +562,6 @@ ilo:
|
||||||
2: ok
|
2: ok
|
||||||
3: degraded
|
3: degraded
|
||||||
4: failed
|
4: failed
|
||||||
- name: cpqHeFltTolFanChassis
|
|
||||||
oid: 1.3.6.1.4.1.232.6.2.6.7.1.1
|
|
||||||
type: gauge
|
|
||||||
help: The System Chassis number. - 1.3.6.1.4.1.232.6.2.6.7.1.1
|
|
||||||
indexes:
|
|
||||||
- labelname: cpqHeFltTolFanChassis
|
|
||||||
type: gauge
|
|
||||||
- labelname: cpqHeFltTolFanIndex
|
|
||||||
type: gauge
|
|
||||||
- name: cpqHeFltTolFanIndex
|
|
||||||
oid: 1.3.6.1.4.1.232.6.2.6.7.1.2
|
|
||||||
type: gauge
|
|
||||||
help: A number that uniquely specifies this fan description. -
|
|
||||||
1.3.6.1.4.1.232.6.2.6.7.1.2
|
|
||||||
indexes:
|
|
||||||
- labelname: cpqHeFltTolFanChassis
|
|
||||||
type: gauge
|
|
||||||
- labelname: cpqHeFltTolFanIndex
|
|
||||||
type: gauge
|
|
||||||
- name: cpqHeFltTolFanLocale
|
- name: cpqHeFltTolFanLocale
|
||||||
oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
|
oid: 1.3.6.1.4.1.232.6.2.6.7.1.3
|
||||||
type: EnumAsInfo
|
type: EnumAsInfo
|
||||||
|
@ -626,7 +588,7 @@ ilo:
|
||||||
13: bridgeCard
|
13: bridgeCard
|
||||||
- name: cpqHeFltTolFanCondition
|
- name: cpqHeFltTolFanCondition
|
||||||
oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
|
oid: 1.3.6.1.4.1.232.6.2.6.7.1.9
|
||||||
type: gauge
|
type: EnumAsStateSet
|
||||||
help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
|
help: The fan condition - 1.3.6.1.4.1.232.6.2.6.7.1.9
|
||||||
indexes:
|
indexes:
|
||||||
- labelname: cpqHeFltTolFanChassis
|
- labelname: cpqHeFltTolFanChassis
|
||||||
|
@ -638,27 +600,9 @@ ilo:
|
||||||
2: ok
|
2: ok
|
||||||
3: degraded
|
3: degraded
|
||||||
4: failed
|
4: failed
|
||||||
- name: cpqHeFltTolPowerSupplyChassis
|
|
||||||
oid: 1.3.6.1.4.1.232.6.2.9.3.1.1
|
|
||||||
type: gauge
|
|
||||||
help: The system chassis number. - 1.3.6.1.4.1.232.6.2.9.3.1.1
|
|
||||||
indexes:
|
|
||||||
- labelname: cpqHeFltTolPowerSupplyChassis
|
|
||||||
type: gauge
|
|
||||||
- labelname: cpqHeFltTolPowerSupplyBay
|
|
||||||
type: gauge
|
|
||||||
- name: cpqHeFltTolPowerSupplyBay
|
|
||||||
oid: 1.3.6.1.4.1.232.6.2.9.3.1.2
|
|
||||||
type: gauge
|
|
||||||
help: The bay number to index within this chassis. - 1.3.6.1.4.1.232.6.2.9.3.1.2
|
|
||||||
indexes:
|
|
||||||
- labelname: cpqHeFltTolPowerSupplyChassis
|
|
||||||
type: gauge
|
|
||||||
- labelname: cpqHeFltTolPowerSupplyBay
|
|
||||||
type: gauge
|
|
||||||
- name: cpqHeFltTolPowerSupplyStatus
|
- name: cpqHeFltTolPowerSupplyStatus
|
||||||
oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
|
oid: 1.3.6.1.4.1.232.6.2.9.3.1.5
|
||||||
type: gauge
|
type: EnumAsStateSet
|
||||||
help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
|
help: The status of the power supply. - 1.3.6.1.4.1.232.6.2.9.3.1.5
|
||||||
indexes:
|
indexes:
|
||||||
- labelname: cpqHeFltTolPowerSupplyChassis
|
- labelname: cpqHeFltTolPowerSupplyChassis
|
||||||
|
@ -685,7 +629,7 @@ ilo:
|
||||||
17: noPowerInput
|
17: noPowerInput
|
||||||
- name: cpqSm2CntlrInterfaceStatus
|
- name: cpqSm2CntlrInterfaceStatus
|
||||||
oid: 1.3.6.1.4.1.232.9.2.2.17
|
oid: 1.3.6.1.4.1.232.9.2.2.17
|
||||||
type: gauge
|
type: EnumAsStateSet
|
||||||
help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
|
help: Remote Insight/ Integrated Lights-Out Interface Status - 1.3.6.1.4.1.232.9.2.2.17
|
||||||
enum_values:
|
enum_values:
|
||||||
1: other
|
1: other
|
||||||
|
@ -693,7 +637,7 @@ ilo:
|
||||||
3: notResponding
|
3: notResponding
|
||||||
- name: cpqSm2CntlriLOSecurityOverrideSwitchState
|
- name: cpqSm2CntlriLOSecurityOverrideSwitchState
|
||||||
oid: 1.3.6.1.4.1.232.9.2.2.27
|
oid: 1.3.6.1.4.1.232.9.2.2.27
|
||||||
type: gauge
|
type: EnumAsStateSet
|
||||||
help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
|
help: Integrated Lights-Out Security Override Switch State - 1.3.6.1.4.1.232.9.2.2.27
|
||||||
enum_values:
|
enum_values:
|
||||||
1: notSupported
|
1: notSupported
|
||||||
|
@ -701,7 +645,7 @@ ilo:
|
||||||
3: notSet
|
3: notSet
|
||||||
- name: cpqSm2CntlrLicenseActive
|
- name: cpqSm2CntlrLicenseActive
|
||||||
oid: 1.3.6.1.4.1.232.9.2.2.30
|
oid: 1.3.6.1.4.1.232.9.2.2.30
|
||||||
type: gauge
|
type: EnumAsStateSet
|
||||||
help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
|
help: Remote Insight License State - 1.3.6.1.4.1.232.9.2.2.30
|
||||||
enum_values:
|
enum_values:
|
||||||
1: none
|
1: none
|
||||||
|
@ -714,7 +658,7 @@ ilo:
|
||||||
8: iloAdvancedPremiumSecurity
|
8: iloAdvancedPremiumSecurity
|
||||||
- name: cpqSm2CntlrServerPowerState
|
- name: cpqSm2CntlrServerPowerState
|
||||||
oid: 1.3.6.1.4.1.232.9.2.2.32
|
oid: 1.3.6.1.4.1.232.9.2.2.32
|
||||||
type: gauge
|
type: EnumAsStateSet
|
||||||
help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
|
help: The current power state for the server - 1.3.6.1.4.1.232.9.2.2.32
|
||||||
enum_values:
|
enum_values:
|
||||||
1: unknown
|
1: unknown
|
||||||
|
|
Loading…
Reference in a new issue
À quoi correspond `other` ? Ce n'est jamais une erreur ?
`other` = le module est absent par exemple, on a BEAUCOUP de `other`.
Ok