From fc9971ddc97875b755ac4d0c23576ac1e603def8 Mon Sep 17 00:00:00 2001 From: plasmagoat Date: Wed, 4 Jun 2025 05:03:02 +0200 Subject: [PATCH] alerts... --- nixos/hosts/forgejo/forgejo.nix | 7 +- nixos/hosts/monitoring/grafana.nix | 28 +- nixos/hosts/monitoring/prometheus.nix | 17 +- .../provisioning/alerting/alerts.yml | 265 ++++++++++++++++++ .../provisioning/notifiers/contact-points.yml | 11 + 5 files changed, 324 insertions(+), 4 deletions(-) create mode 100644 nixos/hosts/monitoring/provisioning/alerting/alerts.yml create mode 100644 nixos/hosts/monitoring/provisioning/notifiers/contact-points.yml diff --git a/nixos/hosts/forgejo/forgejo.nix b/nixos/hosts/forgejo/forgejo.nix index e32addd..236c250 100644 --- a/nixos/hosts/forgejo/forgejo.nix +++ b/nixos/hosts/forgejo/forgejo.nix @@ -3,6 +3,7 @@ let cfg = config.services.forgejo; srv = cfg.settings.server; domain = "git.procopius.dk"; + ssh_domain = "gitssh.procopius.dk"; in { users.users.plasmagoat.extraGroups = [ "forgejo" ]; @@ -19,6 +20,10 @@ in ROOT_URL = "https://${srv.DOMAIN}/"; PROTOCOL = "http"; HTTP_PORT = 3000; + + START_SSH_SERVER = true; + SSH_PORT = 2222; + SSH_DOMAIN = ssh_domain; }; database = { DB_TYPE = lib.mkForce "postgres"; @@ -61,5 +66,5 @@ in ''; # Optional: firewall - networking.firewall.allowedTCPPorts = [ 3000 ]; + networking.firewall.allowedTCPPorts = [ 3000 2222 ]; } diff --git a/nixos/hosts/monitoring/grafana.nix b/nixos/hosts/monitoring/grafana.nix index f9a3ee1..501e0df 100644 --- a/nixos/hosts/monitoring/grafana.nix +++ b/nixos/hosts/monitoring/grafana.nix @@ -6,7 +6,7 @@ http_port = 3000; http_addr = "0.0.0.0"; # Grafana needs to know on which domain and URL it's running - # domain = "monitor.local"; + domain = "grafana.procopius.dk"; # root_url = "https://monitor.local/grafana/"; # Not needed if it is `https://your.domain/` # serve_from_sub_path = true; }; @@ -22,11 +22,13 @@ datasources.settings.datasources = [ # "Built-in" datasources can be provisioned - c.f. 
https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources { + uid = "prometheus"; name = "Prometheus"; type = "prometheus"; url = "http://127.0.0.1:${toString config.services.prometheus.port}"; } { + uid = "loki"; name = "Loki"; type = "loki"; url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}"; @@ -41,7 +43,13 @@ # Note: removing attributes from the above `datasources.settings.datasources` is not enough for them to be deleted on `grafana`; # One needs to use the following option: - # datasources.settings.deleteDatasources = [ { name = "foo"; orgId = 1; } { name = "bar"; orgId = 1; } ]; + # datasources.settings.deleteDatasources = [ { name = "prometheus"; orgId = 1; } { name = "loki"; orgId = 1; } ]; + + alerting.rules.path = "/etc/grafana/provisioning/alerting/alerts.yml"; + + # notifiers.settings = { + # path = "/etc/grafana/provisioning/notifiers"; + # }; dashboards.settings.providers = [{ name = "my dashboards"; @@ -91,4 +99,20 @@ group = "grafana"; mode = "0644"; }; + + # 🔔 Alerts provisioning + environment.etc."grafana/provisioning/alerting/alerts.yml" = { + source = ./provisioning/alerting/alerts.yml; + user = "grafana"; + group = "grafana"; + mode = "0644"; + }; + + # 📬 Contact point provisioning + environment.etc."grafana/provisioning/notifiers/contact-points.yml" = { + source = ./provisioning/notifiers/contact-points.yml; + user = "grafana"; + group = "grafana"; + mode = "0644"; + }; } diff --git a/nixos/hosts/monitoring/prometheus.nix b/nixos/hosts/monitoring/prometheus.nix index 2171dea..ac34e0e 100644 --- a/nixos/hosts/monitoring/prometheus.nix +++ b/nixos/hosts/monitoring/prometheus.nix @@ -7,6 +7,9 @@ let forgejo_ip = "forgejo.local"; prometheus_exporter_port = 9100; + postgres_exporter_port = 9187; + prometheus_port = 9090; + grafana_port = 3000; promtail_port = 9080; traefik_monitor_port = 8082; forgejo_monitor_port = 3000; @@ -34,6 +37,18 @@ in { } ]; } + { + job_name = "grafana"; + static_configs = [ + { targets = [ "${monitor_ip}:${toString grafana_port}" ]; } + ]; + } + { + job_name = "prometheus"; + static_configs = [ + { targets = [ "${monitor_ip}:${toString prometheus_port}" ]; } + ]; + } { job_name = "traefik"; static_configs = [ @@ -49,7 +64,7 @@ in { { job_name = "postgres"; static_configs = [ - { targets = [ "${forgejo_ip}:9187" ]; } + { targets = [ "${forgejo_ip}:${toString postgres_exporter_port}" ]; } ]; } { diff --git a/nixos/hosts/monitoring/provisioning/alerting/alerts.yml b/nixos/hosts/monitoring/provisioning/alerting/alerts.yml new file mode 100644 index 0000000..a9fbd51 --- /dev/null +++ b/nixos/hosts/monitoring/provisioning/alerting/alerts.yml @@ -0,0 +1,265 @@ +apiVersion: 1 + +groups: + - name: node-alerts + folder: "VM Alerts" + interval: 1m + rules: + - uid: high_cpu + title: High CPU Usage + condition: A + for: 5m + annotations: + summary: "High CPU on {{ $labels.instance }}" + description: "CPU > 90% for 5 minutes" + labels: + severity: warning + data: + - refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: prometheus + model: + expr: avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) by (instance) > 0.9 + interval: "" + datasource: { type: prometheus, uid: prometheus } + instant: false + intervalMs: 15000 + maxDataPoints: 43200 + + - uid: high_memory + title: High Memory Usage + condition: A + for: 5m + annotations: + summary: "High memory on {{ $labels.instance }}" + description: "Memory > 90% for 5 minutes" + labels: + severity: warning + data: + - refId: A + 
relativeTimeRange: { from: 300, to: 0 } + datasourceUid: prometheus + model: + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.9 + interval: "" + datasource: { type: prometheus, uid: prometheus } + instant: false + intervalMs: 15000 + maxDataPoints: 43200 + + - uid: low_disk + title: Low Disk Space + condition: A + for: 5m + annotations: + summary: "Low disk on {{ $labels.instance }} {{ $labels.mountpoint }}" + description: "< 10% space left" + labels: + severity: critical + data: + - refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: prometheus + model: + expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.1 + interval: "" + datasource: { type: prometheus, uid: prometheus } + instant: false + intervalMs: 15000 + maxDataPoints: 43200 + + - uid: node_down + title: Node is Down + condition: A + for: 1m + annotations: + summary: "{{ $labels.instance }} is DOWN" + description: "Exporter is not responding" + labels: + severity: critical + data: + - refId: A + relativeTimeRange: { from: 60, to: 0 } + datasourceUid: prometheus + model: + expr: up == 0 + interval: "" + datasource: { type: prometheus, uid: prometheus } + instant: true + intervalMs: 15000 + maxDataPoints: 43200 + + - name: prometheus-alerts + folder: "Prometheus Alerts" + interval: 1m + rules: + - uid: prometheus_down + title: Prometheus is Down + condition: A + for: 1m + annotations: + summary: "Prometheus is not responding" + description: "up{job='prometheus'} == 0" + labels: + severity: critical + data: + - refId: A + relativeTimeRange: { from: 60, to: 0 } + datasourceUid: prometheus + model: + expr: up{job="prometheus"} == 0 + interval: "" + datasource: { type: prometheus, uid: prometheus } + instant: true + intervalMs: 15000 + maxDataPoints: 43200 + + - name: traefik-alerts + folder: "Traefik Alerts" + interval: 1m + rules: + - uid: traefik_5xx + title: Traefik 5xx Errors + condition: A + for: 1m + annotations: + summary: "Traefik has a high rate of 5xx responses" + description: "Rate of HTTP 5xx responses is high" + labels: + severity: warning + data: + - refId: A + relativeTimeRange: { from: 60, to: 0 } + datasourceUid: prometheus + model: + expr: rate(traefik_service_requests_total{code=~"5.."}[1m]) > 0 + interval: "" + datasource: { type: prometheus, uid: prometheus } + instant: false + intervalMs: 15000 + maxDataPoints: 43200 + + - name: grafana-alerts + folder: "Grafana Alerts" + interval: 1m + rules: + - uid: grafana_down + title: Grafana is Down + condition: A + for: 1m + annotations: + summary: "Grafana is not responding" + description: "up{job='grafana'} == 0" + labels: + severity: critical + data: + - refId: A + relativeTimeRange: { from: 60, to: 0 } + datasourceUid: prometheus + model: + expr: up{job="grafana"} == 0 + interval: "" + datasource: { type: prometheus, uid: prometheus } + instant: true + intervalMs: 15000 + maxDataPoints: 43200 + + - name: postgres-alerts + folder: "Postgres Alerts" + interval: 1m + rules: + - uid: postgres_down + title: Postgres is Down + condition: A + for: 1m + annotations: + summary: "Postgres is not responding" + description: "up{job='postgres'} == 0" + labels: + severity: critical + data: + - refId: A + relativeTimeRange: { from: 60, to: 0 } + datasourceUid: prometheus + model: + expr: up{job="postgres"} == 0 + interval: "" + datasource: { type: prometheus, uid: prometheus } + instant: true + intervalMs: 15000 + maxDataPoints: 43200 + + - name: gitea-alerts + folder: 
"Gitea Alerts" + interval: 1m + rules: + - uid: gitea_down + title: Gitea is Down + condition: A + for: 1m + annotations: + summary: "Gitea is not responding" + description: "up{job='gitea'} == 0" + labels: + severity: critical + data: + - refId: A + relativeTimeRange: { from: 60, to: 0 } + datasourceUid: prometheus + model: + expr: up{job="gitea"} == 0 + interval: "" + datasource: { type: prometheus, uid: prometheus } + instant: true + intervalMs: 15000 + maxDataPoints: 43200 + + - name: promtail-alerts + folder: "Promtail Alerts" + interval: 1m + rules: + - uid: promtail_down + title: Promtail is Down + condition: A + for: 1m + annotations: + summary: "Promtail is not responding" + description: "up{job='promtail'} == 0" + labels: + severity: critical + data: + - refId: A + relativeTimeRange: { from: 60, to: 0 } + datasourceUid: prometheus + model: + expr: up{job="promtail"} == 0 + interval: "" + datasource: { type: prometheus, uid: prometheus } + instant: true + intervalMs: 15000 + maxDataPoints: 43200 + + - name: logs-alerts + folder: "Logs Alerts" + interval: 1m + rules: + - uid: failed_ssh_logins + title: Failed SSH Logins + condition: A + for: 1m + annotations: + summary: "Too many failed SSH login attempts" + description: "Check for brute force login attempts" + labels: + severity: warning + data: + - refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: prometheus + model: + expr: sum(rate(failed_ssh_logins[5m])) > 5 + interval: "" + datasource: { type: prometheus, uid: prometheus } + instant: false + intervalMs: 15000 + maxDataPoints: 43200 diff --git a/nixos/hosts/monitoring/provisioning/notifiers/contact-points.yml b/nixos/hosts/monitoring/provisioning/notifiers/contact-points.yml new file mode 100644 index 0000000..c2396d0 --- /dev/null +++ b/nixos/hosts/monitoring/provisioning/notifiers/contact-points.yml @@ -0,0 +1,11 @@ +# /etc/grafana/provisioning/notifiers/contact-points.yml +apiVersion: 1 + +contactPoints: + - orgId: 1 + name: telegram + type: telegram + settings: + bottoken: "__YOUR_BOT_TOKEN__" + chatid: "__YOUR_CHAT_ID__" + disableResolveMessage: false