colmena initial implementation for sandbox and monitor
All checks were successful
Hello World / test (push) Successful in 4s
All checks were successful
Hello World / test (push) Successful in 4s
This commit is contained in:
parent
a90630ecb6
commit
5feb74d56d
40 changed files with 27629 additions and 141 deletions
30
machines/monitor/provisioning/alerts/traefik-alerts.yml
Normal file
30
machines/monitor/provisioning/alerts/traefik-alerts.yml
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
groups:
|
||||
- name: Traefik
|
||||
|
||||
rules:
|
||||
- alert: TraefikServiceDown
|
||||
expr: "count(traefik_service_server_up) by (service) == 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Traefik service down (instance {{ $labels.instance }})
|
||||
description: "All Traefik services are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: TraefikHighHttp4xxErrorRateService
|
||||
expr: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Traefik high HTTP 4xx error rate service (instance {{ $labels.instance }})
|
||||
description: "Traefik service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: TraefikHighHttp5xxErrorRateService
|
||||
expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
|
||||
description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
Loading…
Add table
Add a link
Reference in a new issue