apiVersion: 1

groups:
  - name: node-alerts
    folder: "VM Alerts"
    interval: 1m
    rules:
      - uid: high_cpu
        title: High CPU Usage
        condition: A
        for: 5m
        annotations:
          summary: "High CPU on {{ $labels.instance }}"
          description: "CPU > 90% for 5 minutes"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) by (instance) > 0.9
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200

      - uid: high_memory
        title: High Memory Usage
        condition: A
        for: 5m
        annotations:
          summary: "High memory on {{ $labels.instance }}"
          description: "Memory > 90% for 5 minutes"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.9
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200

      - uid: low_disk
        title: Low Disk Space
        condition: A
        for: 5m
        annotations:
          summary: "Low disk on {{ $labels.instance }} {{ $labels.mountpoint }}"
          description: "< 10% space left"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.1
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200

      - uid: node_down
        title: Node is Down
        condition: A
        for: 1m
        annotations:
          summary: "{{ $labels.instance }} is DOWN"
          description: "Exporter is not responding"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: prometheus-alerts
    folder: "Prometheus Alerts"
    interval: 1m
    rules:
      - uid: prometheus_down
        title: Prometheus is Down
        condition: A
        for: 1m
        annotations:
          summary: "Prometheus is not responding"
          description: "up{job='prometheus'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="prometheus"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: traefik-alerts
    folder: "Traefik Alerts"
    interval: 1m
    rules:
      - uid: traefik_5xx
        title: Traefik 5xx Errors
        condition: A
        for: 1m
        annotations:
          summary: "Traefik has a high rate of 5xx responses"
          description: "Rate of HTTP 5xx responses is high"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: rate(traefik_service_requests_total{code=~"5.."}[1m]) > 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200

  - name: grafana-alerts
    folder: "Grafana Alerts"
    interval: 1m
    rules:
      - uid: grafana_down
        title: Grafana is Down
        condition: A
        for: 1m
        annotations:
          summary: "Grafana is not responding"
          description: "up{job='grafana'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="grafana"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: postgres-alerts
    folder: "Postgres Alerts"
    interval: 1m
    rules:
      - uid: postgres_down
        title: Postgres is Down
        condition: A
        for: 1m
        annotations:
          summary: "Postgres is not responding"
          description: "up{job='postgres'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="postgres"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: gitea-alerts
    folder: "Gitea Alerts"
    interval: 1m
    rules:
      - uid: gitea_down
        title: Gitea is Down
        condition: A
        for: 1m
        annotations:
          summary: "Gitea is not responding"
          description: "up{job='gitea'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="gitea"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: promtail-alerts
    folder: "Promtail Alerts"
    interval: 1m
    rules:
      - uid: promtail_down
        title: Promtail is Down
        condition: A
        for: 1m
        annotations:
          summary: "Promtail is not responding"
          description: "up{job='promtail'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="promtail"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: logs-alerts
    folder: "Logs Alerts"
    interval: 1m
    rules:
      - uid: failed_ssh_logins
        title: Failed SSH Logins
        condition: A
        for: 1m
        annotations:
          summary: "Too many failed SSH login attempts"
          description: "Check for brute force login attempts"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: sum(rate(failed_ssh_logins[5m])) > 5
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200
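
# Note: the `failed_ssh_logins` series queried above is not a standard node_exporter
# or Promtail metric. It is assumed to be exposed to Prometheus separately, e.g. by a
# recording rule or a custom exporter counting SSH auth failures; adjust the metric
# name to match whatever your setup actually exports.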