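# Grafana alert rules, provisioned as code (unified alerting, apiVersion 1).
# Files like this are typically loaded from /etc/grafana/provisioning/alerting/.
# Every query below targets a Prometheus data source provisioned with uid "prometheus".
# Note: rules with instant: false feed a raw range query straight into the
# condition; depending on the Grafana version, this may need an extra
# reduce/threshold expression to evaluate cleanly.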
apiVersion: 1

groups:
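  # --- node-alerts: host / VM health, based on node_exporter metrics ---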
  - name: node-alerts
    folder: "VM Alerts"
    interval: 1m
    rules:
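      # Average non-idle CPU per instance above 90% for 5 minutes.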
      - uid: high_cpu
        title: High CPU Usage
        condition: A
        for: 5m
        annotations:
          summary: "High CPU on {{ $labels.instance }}"
          description: "CPU > 90% for 5 minutes"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) by (instance) > 0.9
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200
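
      # Less than 10% of memory available (MemAvailable/MemTotal) for 5 minutes.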
      - uid: high_memory
        title: High Memory Usage
        condition: A
        for: 5m
        annotations:
          summary: "High memory on {{ $labels.instance }}"
          description: "Memory > 90% for 5 minutes"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.9
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200
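
      # A filesystem below 10% free space; tmpfs and overlay mounts are
      # excluded to avoid noise from ephemeral filesystems.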
      - uid: low_disk
        title: Low Disk Space
        condition: A
        for: 5m
        annotations:
          summary: "Low disk on {{ $labels.instance }} {{ $labels.mountpoint }}"
          description: "< 10% space left"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.1
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200
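
      # Instant query on the latest `up` sample: fires when any scrape target
      # has been down for 1 minute.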
      - uid: node_down
        title: Node is Down
        condition: A
        for: 1m
        annotations:
          summary: "{{ $labels.instance }} is DOWN"
          description: "Exporter is not responding"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200
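
  # --- prometheus-alerts: self-monitoring (assumes a self-scrape job named "prometheus") ---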
  - name: prometheus-alerts
    folder: "Prometheus Alerts"
    interval: 1m
    rules:
      - uid: prometheus_down
        title: Prometheus is Down
        condition: A
        for: 1m
        annotations:
          summary: "Prometheus is not responding"
          description: "up{job='prometheus'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="prometheus"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200
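
  # --- traefik-alerts: the > 0 threshold fires on any 5xx response in the last minute ---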
  - name: traefik-alerts
    folder: "Traefik Alerts"
    interval: 1m
    rules:
      - uid: traefik_5xx
        title: Traefik 5xx Errors
        condition: A
        for: 1m
        annotations:
          summary: "Traefik has a high rate of 5xx responses"
          description: "Rate of HTTP 5xx responses is high"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: rate(traefik_service_requests_total{code=~"5.."}[1m]) > 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200
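
  # --- grafana-alerts: Grafana liveness, via its own Prometheus scrape job ---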
  - name: grafana-alerts
    folder: "Grafana Alerts"
    interval: 1m
    rules:
      - uid: grafana_down
        title: Grafana is Down
        condition: A
        for: 1m
        annotations:
          summary: "Grafana is not responding"
          description: "up{job='grafana'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="grafana"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200
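
  # --- postgres-alerts: assumes a scrape job named "postgres" (e.g. postgres_exporter) ---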
  - name: postgres-alerts
    folder: "Postgres Alerts"
    interval: 1m
    rules:
      - uid: postgres_down
        title: Postgres is Down
        condition: A
        for: 1m
        annotations:
          summary: "Postgres is not responding"
          description: "up{job='postgres'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="postgres"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200
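
  # --- gitea-alerts: assumes a scrape job named "gitea" (Gitea can expose Prometheus metrics natively) ---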
  - name: gitea-alerts
    folder: "Gitea Alerts"
    interval: 1m
    rules:
      - uid: gitea_down
        title: Gitea is Down
        condition: A
        for: 1m
        annotations:
          summary: "Gitea is not responding"
          description: "up{job='gitea'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="gitea"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200
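
  # --- promtail-alerts: assumes a scrape job named "promtail" ---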
  - name: promtail-alerts
    folder: "Promtail Alerts"
    interval: 1m
    rules:
      - uid: promtail_down
        title: Promtail is Down
        condition: A
        for: 1m
        annotations:
          summary: "Promtail is not responding"
          description: "up{job='promtail'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="promtail"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200
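
  # --- logs-alerts: log-derived security signal ---
  # failed_ssh_logins is not a standard exporter metric; it is presumably
  # produced upstream (e.g. by a Promtail metrics pipeline stage) and scraped
  # into Prometheus. Note that rate() is per second, so > 5 means a sustained
  # rate of more than 5 failures per second.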
  - name: logs-alerts
    folder: "Logs Alerts"
    interval: 1m
    rules:
      - uid: failed_ssh_logins
        title: Failed SSH Logins
        condition: A
        for: 1m
        annotations:
          summary: "Too many failed SSH login attempts"
          description: "Check for brute force login attempts"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: sum(rate(failed_ssh_logins[5m])) > 5
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200