From 7dd5043b5d3e77c0bd509b5590d9ee8ed4fdfc57 Mon Sep 17 00:00:00 2001
From: plasmagoat <david.mikael@proton.me>
Date: Fri, 6 Jun 2025 23:32:17 +0200
Subject: [PATCH] ready for runners

---
 nixos/README.md                               |   6 +-
 nixos/flake.nix                               |  13 +-
 nixos/hosts/dns/dnsmasq.nix                   |  61 ++
 nixos/hosts/{forgejo => dns}/host.nix         |   5 +-
 nixos/hosts/dns/networking.nix                |  19 +
 nixos/hosts/forgejo-runner/host.nix           |   9 +
 nixos/hosts/forgejo-runner/networking.nix     |   4 +
 nixos/hosts/forgejo-runner/runner.nix         |  35 +
 nixos/hosts/forgejo-runner/sops.nix           |   7 +
 nixos/hosts/forgejo/database.nix              |   9 +-
 nixos/hosts/forgejo/forgejo.nix               |   5 +-
 nixos/hosts/forgejo/networking.nix            |   8 +-
 nixos/hosts/forgejo/sops.nix                  |  19 +
 nixos/hosts/monitoring/alertmanager.nix       |  59 ++
 .../dashboards/grafana-traefik.json           | 854 ++++++++----------
 nixos/hosts/monitoring/grafana.nix            |  24 +-
 nixos/hosts/monitoring/host.nix               |   3 +
 nixos/hosts/monitoring/networking.nix         |  19 +-
 nixos/hosts/monitoring/prometheus.nix         | 113 ++-
 .../provisioning/alerting/alerts.yml          | 265 ------
 .../provisioning/alerts/loki-alerts.yml       |  39 +
 .../alerts/node-exporter-alerts-v2.yml        | 299 ++++++
 .../alerts/node-exporter-alerts.yml           | 320 +++++++
 .../provisioning/alerts/postgres-alerts.yml   | 201 +++++
 .../provisioning/alerts/prometheus-alerts.yml | 255 ++++++
 .../provisioning/alerts/promtail-alerts.yml   |  21 +
 .../provisioning/alerts/traefik-alerts.yml    |  30 +
 .../templates/telegram.markdown.v2.tmpl       |  37 +
 .../provisioning/templates/telegram.tmpl      |  28 +
 nixos/hosts/monitoring/sops.nix               |   7 +
 nixos/hosts/sandbox/networking.nix            |  24 +-
 .../traefik/configuration/middlewares.nix     |  10 +
 nixos/hosts/traefik/configuration/routers.nix | 140 +++
 .../hosts/traefik/configuration/services.nix  |  38 +
 nixos/hosts/traefik/configuration/static.nix  |  61 ++
 nixos/hosts/traefik/networking.nix            |  23 +-
 nixos/hosts/traefik/promtail.nix              |   4 +-
 nixos/hosts/traefik/traefik.nix               | 168 +---
 nixos/hosts/vpn/README.md                     | 127 +++
 nixos/modules/promtail.nix                    |   5 +-
 nixos/secrets/HOWTO.md                        |  59 ++
 nixos/secrets/README.md                       | 109 ++-
 nixos/secrets/forgejo/runner-secrets.yml      |  16 +
 nixos/secrets/forgejo/secrets.yml             |  18 +
 nixos/secrets/secrets.yaml                    |  27 -
 nixos/secrets/shared-sops.nix                 |  11 +
 nixos/secrets/shared/secrets.yml              |  16 +
 nixos/secrets/sops.nix                        |   8 -
 nixos/secrets/telegram/secrets.yml            |  16 +
 49 files changed, 2569 insertions(+), 1085 deletions(-)
 create mode 100644 nixos/hosts/dns/dnsmasq.nix
 rename nixos/hosts/{forgejo => dns}/host.nix (58%)
 create mode 100644 nixos/hosts/dns/networking.nix
 create mode 100644 nixos/hosts/forgejo-runner/host.nix
 create mode 100644 nixos/hosts/forgejo-runner/networking.nix
 create mode 100644 nixos/hosts/forgejo-runner/runner.nix
 create mode 100644 nixos/hosts/forgejo-runner/sops.nix
 create mode 100644 nixos/hosts/forgejo/sops.nix
 create mode 100644 nixos/hosts/monitoring/alertmanager.nix
 delete mode 100644 nixos/hosts/monitoring/provisioning/alerting/alerts.yml
 create mode 100644 nixos/hosts/monitoring/provisioning/alerts/loki-alerts.yml
 create mode 100644 nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts-v2.yml
 create mode 100644 nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts.yml
 create mode 100644 nixos/hosts/monitoring/provisioning/alerts/postgres-alerts.yml
 create mode 100644 nixos/hosts/monitoring/provisioning/alerts/prometheus-alerts.yml
 create mode 100644 nixos/hosts/monitoring/provisioning/alerts/promtail-alerts.yml
 create mode 100644 nixos/hosts/monitoring/provisioning/alerts/traefik-alerts.yml
 create mode 100644 nixos/hosts/monitoring/provisioning/templates/telegram.markdown.v2.tmpl
 create mode 100644 nixos/hosts/monitoring/provisioning/templates/telegram.tmpl
 create mode 100644 nixos/hosts/monitoring/sops.nix
 create mode 100644 nixos/hosts/traefik/configuration/middlewares.nix
 create mode 100644 nixos/hosts/traefik/configuration/routers.nix
 create mode 100644 nixos/hosts/traefik/configuration/services.nix
 create mode 100644 nixos/hosts/traefik/configuration/static.nix
 create mode 100644 nixos/hosts/vpn/README.md
 create mode 100644 nixos/secrets/HOWTO.md
 create mode 100644 nixos/secrets/forgejo/runner-secrets.yml
 create mode 100644 nixos/secrets/forgejo/secrets.yml
 delete mode 100644 nixos/secrets/secrets.yaml
 create mode 100644 nixos/secrets/shared-sops.nix
 create mode 100644 nixos/secrets/shared/secrets.yml
 delete mode 100644 nixos/secrets/sops.nix
 create mode 100644 nixos/secrets/telegram/secrets.yml

diff --git a/nixos/README.md b/nixos/README.md
index 5805186..f19ef37 100644
--- a/nixos/README.md
+++ b/nixos/README.md
@@ -1,5 +1,9 @@
 nixos-rebuild switch --flake .#traefik --target-host root@192.168.1.171 --verbose
 nixos-rebuild switch --flake .#proxmox --target-host root@192.168.1.205 --verbose
 nixos-rebuild switch --flake .#sandbox --target-host root@192.168.1.148 --verbose
-nixos-rebuild switch --flake .#monitoring --target-host root@192.168.1.88 --verbose
+nixos-rebuild switch --flake .#monitoring --target-host root@monitor.lab --verbose
 nixos-rebuild switch --flake .#forgejo --target-host root@192.168.1.249 --verbose
+nixos-rebuild switch --flake .#dns --target-host root@192.168.1.140 --verbose
+
+nixos-rebuild switch --flake .#runner --target-host root@forgejo-runner-01.lab --override-input runnerId 01
+nixos-rebuild switch --flake .#runner01 --target-host root@forgejo-runner-01.lab --verbose
diff --git a/nixos/flake.nix b/nixos/flake.nix
index 9cf7424..7e58a47 100644
--- a/nixos/flake.nix
+++ b/nixos/flake.nix
@@ -34,9 +34,14 @@
           modules = [ ./hosts/sandbox/host.nix ];
         };
 
+        dns = nixpkgs.lib.nixosSystem {
+          inherit system;
+          modules = [ ./hosts/dns/host.nix ];
+        };
+
         monitoring = nixpkgs.lib.nixosSystem {
           inherit system;
-          modules = [ ./hosts/monitoring/host.nix ];
+          modules = [ ./hosts/monitoring/host.nix sops-nix.nixosModules.sops ];
         };
 
         forgejo = nixpkgs.lib.nixosSystem {
@@ -44,6 +49,12 @@
           modules = [ ./hosts/forgejo/host.nix sops-nix.nixosModules.sops ];
         };
 
+        runner01 = nixpkgs.lib.nixosSystem {
+          inherit system;
+          modules = [ ./hosts/forgejo-runner/host.nix sops-nix.nixosModules.sops ];
+          specialArgs.runnerId = "01";
+        };
+
         # dockerHost = pkgs.lib.nixosSystem {
         #   inherit system;
         #   modules = [
diff --git a/nixos/hosts/dns/dnsmasq.nix b/nixos/hosts/dns/dnsmasq.nix
new file mode 100644
index 0000000..b076b54
--- /dev/null
+++ b/nixos/hosts/dns/dnsmasq.nix
@@ -0,0 +1,61 @@
+# DNS and DHCP for the lab network via dnsmasq, plus its Prometheus exporter.
+{
+  services = {
+    dnsmasq = {
+      enable = true;
+      alwaysKeepRunning = true;
+
+      settings = {
+        # Local domain
+        domain = "lab";
+        expand-hosts = true;
+        domain-needed = true;
+        bogus-priv = true;
+        localise-queries = true;
+        local = [ "/lab/" "/procopius.dk/" ];
+
+        # Listening
+        # interface = "eth0";  # Replace with your real interface
+        bind-interfaces = true;
+
+        # Upstream resolvers
+        no-resolv = true;
+        # no-hosts = true; # Prevent 127.0.0.2 etc from leaking in
+        server = [ "8.8.8.8" "8.8.4.4" "1.1.1.1" "1.0.0.1" ];
+        cache-size = 10000;
+
+        # Static DNS entries: map hostname to IP (without DHCP)
+        address = [
+          # Static IPs
+          "/dns.lab/192.168.1.53"
+          "/traefik.lab/192.168.1.80"
+          # "/proxmox-01.lab/192.168.1.205"
+          # "/nas-01.lab/192.168.1.226"
+
+          # Split Horizon DNS
+          "/procopius.dk/192.168.1.80"
+          "/.procopius.dk/192.168.1.80"
+        ];
+
+        # DHCP: clients are told to use 192.168.1.53 as their DNS server
+        dhcp-authoritative = true;
+        dhcp-range = "192.168.1.100,192.168.1.254,12h";
+        dhcp-host = "bc:24:11:58:f5:da,dns,192.168.1.53";
+        dhcp-option = [
+          "option:router,192.168.1.1" # router
+          "option:dns-server,192.168.1.53" # DNS server (this VM)
+        ];
+
+        # Query logging
+        log-queries = true;
+        log-async = true;
+        # log-facility = "/var/log/dnsmasq/dnsmasq.log";
+      };
+    };
+
+    prometheus.exporters.dnsmasq = {
+      enable = true;
+      openFirewall = true;
+    };
+  };
+}
diff --git a/nixos/hosts/forgejo/host.nix b/nixos/hosts/dns/host.nix
similarity index 58%
rename from nixos/hosts/forgejo/host.nix
rename to nixos/hosts/dns/host.nix
index d8257e4..d346b22 100644
--- a/nixos/hosts/forgejo/host.nix
+++ b/nixos/hosts/dns/host.nix
@@ -3,10 +3,7 @@
 {
   imports = [
     ../../templates/base.nix
-    ../../secrets/sops.nix
     ./networking.nix
-    ./storage.nix
-    ./forgejo.nix
-    ./database.nix
+    ./dnsmasq.nix
   ];
 }
diff --git a/nixos/hosts/dns/networking.nix b/nixos/hosts/dns/networking.nix
new file mode 100644
index 0000000..0942d99
--- /dev/null
+++ b/nixos/hosts/dns/networking.nix
@@ -0,0 +1,19 @@
+# Static addressing, resolvers and firewall for the "dns" host.
+{
+  networking = {
+    hostName = "dns";
+    # useHostResolvConf = false;
+    # interfaces.eth0.useDHCP = true;
+    interfaces.eth0.ipv4.addresses = [
+      { address = "192.168.1.53"; prefixLength = 24; }
+    ];
+    defaultGateway = "192.168.1.1"; # your router
+    nameservers = [ "8.8.8.8" ]; # fallback resolvers
+
+    firewall = {
+      allowedTCPPorts = [ 53 67 80 443 ];
+      allowedUDPPorts = [ 53 67 ];
+    };
+    hosts."192.168.1.53" = [ "dns" "dns.lab" ];
+  };
+}
diff --git a/nixos/hosts/forgejo-runner/host.nix b/nixos/hosts/forgejo-runner/host.nix
new file mode 100644
index 0000000..7c4e256
--- /dev/null
+++ b/nixos/hosts/forgejo-runner/host.nix
@@ -0,0 +1,9 @@
+{
+  imports = [
+    ../../templates/base.nix
+    ../../secrets/shared-sops.nix
+    ./runner.nix # gitea-actions-runner instance and Docker
+    ./networking.nix # hostName built from the runnerId special arg
+    ./sops.nix # forgejo-runner-registration-token secret
+  ];
+}
diff --git a/nixos/hosts/forgejo-runner/networking.nix b/nixos/hosts/forgejo-runner/networking.nix
new file mode 100644
index 0000000..df98995
--- /dev/null
+++ b/nixos/hosts/forgejo-runner/networking.nix
@@ -0,0 +1,4 @@
+# Hostname is suffixed with the runnerId special arg supplied by the flake.
+{ runnerId, ... }: {
+  networking.hostName = "forgejo-runner-${runnerId}";
+}
diff --git a/nixos/hosts/forgejo-runner/runner.nix b/nixos/hosts/forgejo-runner/runner.nix
new file mode 100644
index 0000000..2d1a40b
--- /dev/null
+++ b/nixos/hosts/forgejo-runner/runner.nix
@@ -0,0 +1,35 @@
+# Forgejo Actions runner registered against git.procopius.dk; jobs run in Docker.
+{ config, pkgs, ... }:
+let
+  # tokenFile should be in format TOKEN=<secret>, since it's EnvironmentFile for systemd
+  registrationTokenFile = config.sops.secrets."forgejo-runner-registration-token".path;
+in
+{
+  # users.users.forgejo-runner = {
+  #   isSystemUser = true;
+  #   extraGroups = [ "docker" ];  # Optional: if using docker jobs
+  # };
+  # systemd.services."forgejo-actions-runner-default".serviceConfig = {
+  #   User = "forgejo-runner";
+  #   Group = "forgejo-runner";
+  # };
+  virtualisation.docker.enable = true;  # Optional: if using docker
+
+  services.gitea-actions-runner = {
+    package = pkgs.forgejo-actions-runner;
+    instances.default = {
+      enable = true;
+      name = config.networking.hostName;
+      url = "https://git.procopius.dk";
+      tokenFile = registrationTokenFile;
+      labels = [
+        "ubuntu-latest:docker://node:16-bullseye"
+        # "ubuntu-22.04:docker://node:16-bullseye"
+        # "ubuntu-20.04:docker://node:16-bullseye"
+        # "ubuntu-18.04:docker://node:16-buster"
+        ## optionally provide native execution on the host:
+        # "native:host"
+      ];
+    };
+  };
+}
diff --git a/nixos/hosts/forgejo-runner/sops.nix b/nixos/hosts/forgejo-runner/sops.nix
new file mode 100644
index 0000000..1cbbeaf
--- /dev/null
+++ b/nixos/hosts/forgejo-runner/sops.nix
@@ -0,0 +1,7 @@
+# Registration token consumed by ./runner.nix (tokenFile).
+{
+  sops.secrets."forgejo-runner-registration-token" = {
+    mode = "0440";
+    sopsFile = ../../secrets/forgejo/runner-secrets.yml;
+  };
+}
diff --git a/nixos/hosts/forgejo/database.nix b/nixos/hosts/forgejo/database.nix
index 6dc9fe4..9d6f23e 100644
--- a/nixos/hosts/forgejo/database.nix
+++ b/nixos/hosts/forgejo/database.nix
@@ -21,11 +21,6 @@
     '';
   };
 
-  services.prometheus.exporters.postgres = {
-    enable = true;
-    listenAddress = "0.0.0.0";
-    port = 9187;
-  };
-  networking.firewall.allowedTCPPorts = [ 9187 ];
-
+  services.prometheus.exporters.postgres.enable = true;
+  services.prometheus.exporters.postgres.openFirewall = true;
 }
diff --git a/nixos/hosts/forgejo/forgejo.nix b/nixos/hosts/forgejo/forgejo.nix
index 236c250..633100c 100644
--- a/nixos/hosts/forgejo/forgejo.nix
+++ b/nixos/hosts/forgejo/forgejo.nix
@@ -47,14 +47,11 @@ in
 
       security = {
         INSTALL_LOCK = true;
-        SECRET_KEY = "changeme";  # can be another secret
+        SECRET_KEY = config.sops.secrets."forgejo-secret-key".path; # can be another secret
       };
     };
   };
 
-  sops.secrets.forgejo-admin-password.owner = "forgejo";
-  sops.secrets.forgejo-db-password.owner = "forgejo";
-
   systemd.services.forgejo.preStart = let
     adminCmd = "${lib.getExe cfg.package} admin user";
     user = "plasmagoat"; # Note, Forgejo doesn't allow creation of an account named "admin"
diff --git a/nixos/hosts/forgejo/networking.nix b/nixos/hosts/forgejo/networking.nix
index 2f076b7..6e9eb1c 100644
--- a/nixos/hosts/forgejo/networking.nix
+++ b/nixos/hosts/forgejo/networking.nix
@@ -1,6 +1,4 @@
-{ config, lib, pkgs, ... }: {
-
-  networking = {
-    hostName = "forgejo";
-  };
+{ config, lib, pkgs, ... }:
+{
+  networking.hostName = "forgejo";
 }
diff --git a/nixos/hosts/forgejo/sops.nix b/nixos/hosts/forgejo/sops.nix
new file mode 100644
index 0000000..7e96908
--- /dev/null
+++ b/nixos/hosts/forgejo/sops.nix
@@ -0,0 +1,19 @@
+# Forgejo secrets: all decrypted from one sops file and owned by the forgejo user.
+let
+  forgejoSops = ../../secrets/forgejo/secrets.yml;
+  # Build one sops.secrets entry (name/value pair) for the given secret name.
+  mkForgejoSecret = name: {
+    inherit name;
+    value = {
+      sopsFile = forgejoSops;
+      owner = "forgejo";
+    };
+  };
+in
+{
+  sops.secrets = builtins.listToAttrs (map mkForgejoSecret [
+    "forgejo-admin-password"
+    "forgejo-db-password"
+    "forgejo-secret-key"
+  ]);
+}
diff --git a/nixos/hosts/monitoring/alertmanager.nix b/nixos/hosts/monitoring/alertmanager.nix
new file mode 100644
index 0000000..17d0649
--- /dev/null
+++ b/nixos/hosts/monitoring/alertmanager.nix
@@ -0,0 +1,59 @@
+# Alertmanager on the monitoring host: Prometheus pushes alerts to the local
+# instance, which forwards them to a Telegram chat.
+{ config, pkgs, modulesPath, lib, ... }:
+
+{
+  services.prometheus.alertmanagers = [ {
+    scheme = "http";
+    # path_prefix = "/alertmanager";
+    static_configs = [ {
+      targets = [ "localhost:9093" ];
+    } ];
+  } ];
+  services.prometheus.alertmanager = {
+    enable = true;
+    openFirewall = true;
+    webExternalUrl = "http://monitor.lab:9093";  # optional but helpful
+    configuration = {
+      route = {
+        group_wait = "10s";
+        group_interval = "30s";
+        repeat_interval = "30m";
+        receiver = "telegram";
+
+        routes = [
+          {
+            receiver = "telegram";
+            group_wait = "10s";
+            match_re.severity = "critical|warning";
+            continue = true;
+          }
+        ];
+      };
+
+      receivers = [
+        {
+          name = "telegram";
+          telegram_configs = [
+            {
+              api_url = "https://api.telegram.org";
+              # sops exposes the token as a file, so pass its path via
+              # bot_token_file; bot_token would treat the path as the token.
+              bot_token_file = config.sops.secrets."telegram-alert-bot-token".path;
+              chat_id = -1002642560007;
+              message_thread_id = 4;
+              parse_mode = "HTML";
+              send_resolved = false;
+              message = "{{ template \"telegram.message\". }}";
+            }
+          ];
+        }
+      ];
+
+      templates = [
+        (pkgs.writeText "telegram.tmpl" (builtins.readFile ./provisioning/templates/telegram.tmpl))
+        (pkgs.writeText "telegram.markdown.v2.tmpl" (builtins.readFile ./provisioning/templates/telegram.markdown.v2.tmpl))
+      ];
+    };
+  };
+}
diff --git a/nixos/hosts/monitoring/dashboards/grafana-traefik.json b/nixos/hosts/monitoring/dashboards/grafana-traefik.json
index f6c4f35..c03e56e 100644
--- a/nixos/hosts/monitoring/dashboards/grafana-traefik.json
+++ b/nixos/hosts/monitoring/dashboards/grafana-traefik.json
@@ -1,42 +1,12 @@
 {
-  "__inputs": [],
-  "__requires": [
-    {
-      "type": "grafana",
-      "id": "grafana",
-      "name": "Grafana",
-      "version": "7.5.5"
-    },
-    {
-      "type": "panel",
-      "id": "graph",
-      "name": "Graph",
-      "version": ""
-    },
-    {
-      "type": "panel",
-      "id": "piechart",
-      "name": "Pie chart v2",
-      "version": ""
-    },
-    {
-      "type": "datasource",
-      "id": "prometheus",
-      "name": "Prometheus",
-      "version": "1.0.0"
-    },
-    {
-      "type": "panel",
-      "id": "singlestat",
-      "name": "Singlestat",
-      "version": ""
-    }
-  ],
   "annotations": {
     "list": [
       {
         "builtIn": 1,
-        "datasource": "-- Grafana --",
+        "datasource": {
+          "type": "datasource",
+          "uid": "grafana"
+        },
         "enable": true,
         "hide": true,
         "iconColor": "rgba(0, 211, 255, 1)",
@@ -47,14 +17,13 @@
   },
   "description": "Traefik dashboard prometheus",
   "editable": true,
-  "gnetId": 4475,
+  "fiscalYearStartMonth": 0,
   "graphTooltip": 0,
-  "id": null,
-  "iteration": 1620932097756,
+  "id": 2,
   "links": [],
   "panels": [
     {
-      "datasource": null,
+      "collapsed": false,
       "gridPos": {
         "h": 1,
         "w": 24,
@@ -62,32 +31,29 @@
         "y": 0
       },
       "id": 10,
+      "panels": [],
       "title": "$backend stats",
       "type": "row"
     },
     {
-      "cacheTimeout": null,
-      "datasource": "Prometheus",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
       "fieldConfig": {
         "defaults": {
           "color": {
             "mode": "palette-classic"
           },
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            }
+          },
           "decimals": 0,
           "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
           "unit": "short"
         },
         "overrides": []
@@ -99,8 +65,6 @@
         "y": 1
       },
       "id": 2,
-      "interval": null,
-      "links": [],
       "maxDataPoints": 3,
       "options": {
         "displayLabels": [],
@@ -108,6 +72,7 @@
           "calcs": [],
           "displayMode": "table",
           "placement": "right",
+          "showLegend": true,
           "values": ["value", "percent"]
         },
         "pieType": "pie",
@@ -116,10 +81,20 @@
           "fields": "",
           "values": false
         },
-        "text": {}
+        "text": {},
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
       },
+      "pluginVersion": "12.0.0+security-01",
       "targets": [
         {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
           "exemplar": true,
           "expr": "traefik_service_requests_total{service=\"$service\"}",
           "format": "time_series",
@@ -133,22 +108,42 @@
       "type": "piechart"
     },
     {
-      "cacheTimeout": null,
-      "colorBackground": false,
-      "colorValue": false,
-      "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
-      "datasource": "Prometheus",
-      "fieldConfig": {
-        "defaults": {},
-        "overrides": []
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
       },
-      "format": "ms",
-      "gauge": {
-        "maxValue": 100,
-        "minValue": 0,
-        "show": false,
-        "thresholdLabels": false,
-        "thresholdMarkers": true
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "fixedColor": "rgb(31, 120, 193)",
+            "mode": "fixed"
+          },
+          "mappings": [
+            {
+              "options": {
+                "match": "null",
+                "result": {
+                  "text": "N/A"
+                }
+              },
+              "type": "special"
+            }
+          ],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
       },
       "gridPos": {
         "h": 7,
@@ -157,42 +152,29 @@
         "y": 1
       },
       "id": 4,
-      "interval": null,
-      "links": [],
-      "mappingType": 1,
-      "mappingTypes": [
-        {
-          "name": "value to text",
-          "value": 1
-        },
-        {
-          "name": "range to text",
-          "value": 2
-        }
-      ],
       "maxDataPoints": 100,
-      "nullPointMode": "connected",
-      "nullText": null,
-      "postfix": "",
-      "postfixFontSize": "50%",
-      "prefix": "",
-      "prefixFontSize": "50%",
-      "rangeMaps": [
-        {
-          "from": "null",
-          "text": "N/A",
-          "to": "null"
-        }
-      ],
-      "sparkline": {
-        "fillColor": "rgba(31, 118, 189, 0.18)",
-        "full": false,
-        "lineColor": "rgb(31, 120, 193)",
-        "show": true
+      "options": {
+        "colorMode": "none",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "horizontal",
+        "percentChangeColorMode": "standard",
+        "reduceOptions": {
+          "calcs": ["mean"],
+          "fields": "",
+          "values": false
+        },
+        "showPercentChange": false,
+        "textMode": "auto",
+        "wideLayout": true
       },
-      "tableColumn": "",
+      "pluginVersion": "12.0.0+security-01",
       "targets": [
         {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
           "exemplar": true,
           "expr": "sum(traefik_service_request_duration_seconds_sum{service=\"$service\"}) / sum(traefik_service_requests_total{service=\"$service\"}) * 1000",
           "format": "time_series",
@@ -202,68 +184,97 @@
           "refId": "A"
         }
       ],
-      "thresholds": "",
       "title": "$service response time",
-      "type": "singlestat",
-      "valueFontSize": "80%",
-      "valueMaps": [
-        {
-          "op": "=",
-          "text": "N/A",
-          "value": "null"
-        }
-      ],
-      "valueName": "avg"
+      "type": "stat"
     },
     {
-      "aliasColors": {},
-      "bars": true,
-      "dashLength": 10,
-      "dashes": false,
-      "datasource": "Prometheus",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
       "fieldConfig": {
-        "defaults": {},
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "bars",
+            "fillOpacity": 100,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "short"
+        },
         "overrides": []
       },
-      "fill": 1,
-      "fillGradient": 0,
       "gridPos": {
         "h": 7,
         "w": 24,
         "x": 0,
         "y": 8
       },
-      "hiddenSeries": false,
       "id": 3,
-      "legend": {
-        "alignAsTable": true,
-        "avg": true,
-        "current": false,
-        "max": true,
-        "min": true,
-        "rightSide": true,
-        "show": true,
-        "total": false,
-        "values": true
-      },
-      "lines": false,
-      "linewidth": 1,
-      "links": [],
-      "nullPointMode": "null",
       "options": {
-        "alertThreshold": true
+        "alertThreshold": true,
+        "legend": {
+          "calcs": ["mean", "max", "min"],
+          "displayMode": "table",
+          "placement": "right",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "multi",
+          "sort": "none"
+        }
       },
-      "percentage": false,
-      "pluginVersion": "7.5.5",
-      "pointradius": 5,
-      "points": false,
-      "renderer": "flot",
-      "seriesOverrides": [],
-      "spaceLength": 10,
-      "stack": false,
-      "steppedLine": false,
+      "pluginVersion": "12.0.0+security-01",
       "targets": [
         {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
           "exemplar": true,
           "expr": "sum(rate(traefik_service_requests_total{service=\"$service\"}[5m]))",
           "format": "time_series",
@@ -273,50 +284,11 @@
           "refId": "A"
         }
       ],
-      "thresholds": [],
-      "timeFrom": null,
-      "timeRegions": [],
-      "timeShift": null,
       "title": "Total requests over 5min $service",
-      "tooltip": {
-        "shared": true,
-        "sort": 0,
-        "value_type": "individual"
-      },
-      "type": "graph",
-      "xaxis": {
-        "buckets": null,
-        "mode": "time",
-        "name": null,
-        "show": true,
-        "values": []
-      },
-      "yaxes": [
-        {
-          "format": "short",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        },
-        {
-          "format": "short",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        }
-      ],
-      "yaxis": {
-        "align": false,
-        "alignLevel": null
-      }
+      "type": "timeseries"
     },
     {
       "collapsed": false,
-      "datasource": null,
       "gridPos": {
         "h": 1,
         "w": 24,
@@ -329,215 +301,54 @@
       "type": "row"
     },
     {
-      "aliasColors": {},
-      "bars": true,
-      "dashLength": 10,
-      "dashes": false,
-      "datasource": "Prometheus",
-      "fieldConfig": {
-        "defaults": {},
-        "overrides": []
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
       },
-      "fill": 1,
-      "fillGradient": 0,
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 0,
-        "y": 16
-      },
-      "hiddenSeries": false,
-      "id": 5,
-      "legend": {
-        "alignAsTable": true,
-        "avg": false,
-        "current": true,
-        "max": true,
-        "min": true,
-        "rightSide": true,
-        "show": true,
-        "total": false,
-        "values": true
-      },
-      "lines": false,
-      "linewidth": 1,
-      "links": [],
-      "nullPointMode": "null",
-      "options": {
-        "alertThreshold": true
-      },
-      "percentage": false,
-      "pluginVersion": "7.5.5",
-      "pointradius": 5,
-      "points": false,
-      "renderer": "flot",
-      "seriesOverrides": [],
-      "spaceLength": 10,
-      "stack": true,
-      "steppedLine": false,
-      "targets": [
-        {
-          "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code=\"200\"}[5m])",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{method}} : {{code}}",
-          "refId": "A"
-        }
-      ],
-      "thresholds": [],
-      "timeFrom": null,
-      "timeRegions": [],
-      "timeShift": null,
-      "title": "Status code 200 over 5min",
-      "tooltip": {
-        "shared": true,
-        "sort": 0,
-        "value_type": "individual"
-      },
-      "type": "graph",
-      "xaxis": {
-        "buckets": null,
-        "mode": "time",
-        "name": null,
-        "show": true,
-        "values": []
-      },
-      "yaxes": [
-        {
-          "format": "short",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        },
-        {
-          "format": "short",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        }
-      ],
-      "yaxis": {
-        "align": false,
-        "alignLevel": null
-      }
-    },
-    {
-      "aliasColors": {},
-      "bars": true,
-      "dashLength": 10,
-      "dashes": false,
-      "datasource": "Prometheus",
-      "fieldConfig": {
-        "defaults": {},
-        "overrides": []
-      },
-      "fill": 1,
-      "fillGradient": 0,
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 12,
-        "y": 16
-      },
-      "hiddenSeries": false,
-      "id": 6,
-      "legend": {
-        "alignAsTable": true,
-        "avg": false,
-        "current": true,
-        "max": true,
-        "min": true,
-        "rightSide": true,
-        "show": true,
-        "total": false,
-        "values": true
-      },
-      "lines": false,
-      "linewidth": 1,
-      "links": [],
-      "nullPointMode": "null",
-      "options": {
-        "alertThreshold": true
-      },
-      "percentage": false,
-      "pluginVersion": "7.5.5",
-      "pointradius": 5,
-      "points": false,
-      "renderer": "flot",
-      "seriesOverrides": [],
-      "spaceLength": 10,
-      "stack": true,
-      "steppedLine": false,
-      "targets": [
-        {
-          "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code!=\"200\"}[5m])",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{ method }} : {{code}}",
-          "refId": "A"
-        }
-      ],
-      "thresholds": [],
-      "timeFrom": null,
-      "timeRegions": [],
-      "timeShift": null,
-      "title": "Others status code over 5min",
-      "tooltip": {
-        "shared": true,
-        "sort": 0,
-        "value_type": "individual"
-      },
-      "type": "graph",
-      "xaxis": {
-        "buckets": null,
-        "mode": "time",
-        "name": null,
-        "show": true,
-        "values": []
-      },
-      "yaxes": [
-        {
-          "format": "short",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        },
-        {
-          "format": "short",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        }
-      ],
-      "yaxis": {
-        "align": false,
-        "alignLevel": null
-      }
-    },
-    {
-      "cacheTimeout": null,
-      "datasource": "Prometheus",
       "fieldConfig": {
         "defaults": {
           "color": {
             "mode": "palette-classic"
           },
-          "decimals": 0,
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "bars",
+            "fillOpacity": 100,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "normal"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
           "mappings": [],
           "thresholds": {
             "mode": "absolute",
             "steps": [
               {
-                "color": "green",
-                "value": null
+                "color": "green"
               },
               {
                 "color": "red",
@@ -549,6 +360,165 @@
         },
         "overrides": []
       },
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 0,
+        "y": 16
+      },
+      "id": 5,
+      "options": {
+        "alertThreshold": true,
+        "legend": {
+          "calcs": ["lastNotNull", "max", "min"],
+          "displayMode": "table",
+          "placement": "right",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "multi",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "12.0.0+security-01",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code=\"200\"}[5m])",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "{{method}} : {{code}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Status code 200 over 5min",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "bars",
+            "fillOpacity": 100,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "normal"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 12,
+        "y": 16
+      },
+      "id": 6,
+      "options": {
+        "alertThreshold": true,
+        "legend": {
+          "calcs": ["lastNotNull", "max", "min"],
+          "displayMode": "table",
+          "placement": "right",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "multi",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "12.0.0+security-01",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code!=\"200\"}[5m])",
+          "format": "time_series",
+          "intervalFactor": 2,
+          "legendFormat": "{{ method }} : {{code}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Other status codes over 5min",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            }
+          },
+          "decimals": 0,
+          "mappings": [],
+          "unit": "short"
+        },
+        "overrides": []
+      },
       "gridPos": {
         "h": 7,
         "w": 12,
@@ -556,8 +526,6 @@
         "y": 23
       },
       "id": 7,
-      "interval": null,
-      "links": [],
       "maxDataPoints": 3,
       "options": {
         "displayLabels": [],
@@ -565,6 +533,7 @@
           "calcs": [],
           "displayMode": "table",
           "placement": "right",
+          "showLegend": true,
           "values": ["value"]
         },
         "pieType": "pie",
@@ -573,10 +542,20 @@
           "fields": "",
           "values": false
         },
-        "text": {}
+        "text": {},
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
       },
+      "pluginVersion": "12.0.0+security-01",
       "targets": [
         {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
           "exemplar": true,
           "expr": "sum(rate(traefik_service_requests_total[5m])) by (service) ",
           "format": "time_series",
@@ -590,28 +569,24 @@
       "type": "piechart"
     },
     {
-      "cacheTimeout": null,
-      "datasource": "Prometheus",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
       "fieldConfig": {
         "defaults": {
           "color": {
             "mode": "palette-classic"
           },
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            }
+          },
           "decimals": 0,
           "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
           "unit": "short"
         },
         "overrides": []
@@ -623,8 +598,6 @@
         "y": 23
       },
       "id": 8,
-      "interval": null,
-      "links": [],
       "maxDataPoints": 3,
       "options": {
         "displayLabels": [],
@@ -632,6 +605,7 @@
           "calcs": [],
           "displayMode": "table",
           "placement": "right",
+          "showLegend": true,
           "values": ["value"]
         },
         "pieType": "pie",
@@ -640,10 +614,20 @@
           "fields": "",
           "values": false
         },
-        "text": {}
+        "text": {},
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
       },
+      "pluginVersion": "12.0.0+security-01",
       "targets": [
         {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
           "exemplar": true,
           "expr": "sum(rate(traefik_entrypoint_requests_total{entrypoint =~ \"$entrypoint\"}[5m])) by (entrypoint) ",
           "format": "time_series",
@@ -657,48 +641,32 @@
       "type": "piechart"
     }
   ],
-  "schemaVersion": 27,
-  "style": "dark",
+  "preload": false,
+  "schemaVersion": 41,
   "tags": ["traefik", "prometheus"],
   "templating": {
     "list": [
       {
-        "allValue": null,
         "current": {},
         "datasource": "Prometheus",
-        "definition": "label_values(service)",
-        "description": null,
-        "error": null,
-        "hide": 0,
+        "definition": "label_values({job=\"traefik\"},service)",
         "includeAll": false,
-        "label": null,
-        "multi": false,
         "name": "service",
         "options": [],
         "query": {
-          "query": "label_values(service)",
-          "refId": "StandardVariableQuery"
+          "qryType": 1,
+          "query": "label_values({job=\"traefik\"},service)",
+          "refId": "PrometheusVariableQueryEditor-VariableQuery"
         },
         "refresh": 1,
         "regex": "",
-        "skipUrlSync": false,
-        "sort": 0,
-        "tagValuesQuery": "",
-        "tags": [],
-        "tagsQuery": "",
-        "type": "query",
-        "useTags": false
+        "type": "query"
       },
       {
-        "allValue": null,
         "current": {},
         "datasource": "Prometheus",
         "definition": "",
-        "description": null,
-        "error": null,
-        "hide": 0,
         "includeAll": true,
-        "label": null,
         "multi": true,
         "name": "entrypoint",
         "options": [],
@@ -708,13 +676,7 @@
         },
         "refresh": 1,
         "regex": "",
-        "skipUrlSync": false,
-        "sort": 0,
-        "tagValuesQuery": "",
-        "tags": [],
-        "tagsQuery": "",
-        "type": "query",
-        "useTags": false
+        "type": "query"
       }
     ]
   },
@@ -722,23 +684,9 @@
     "from": "now-1h",
     "to": "now"
   },
-  "timepicker": {
-    "refresh_intervals": [
-      "5s",
-      "10s",
-      "30s",
-      "1m",
-      "5m",
-      "15m",
-      "30m",
-      "1h",
-      "2h",
-      "1d"
-    ],
-    "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
-  },
+  "timepicker": {},
   "timezone": "",
   "title": "Traefik",
   "uid": "qPdAviJmz",
-  "version": 10
+  "version": 1
 }
diff --git a/nixos/hosts/monitoring/grafana.nix b/nixos/hosts/monitoring/grafana.nix
index 501e0df..6381378 100644
--- a/nixos/hosts/monitoring/grafana.nix
+++ b/nixos/hosts/monitoring/grafana.nix
@@ -6,7 +6,7 @@
     http_port = 3000;
     http_addr = "0.0.0.0";
     # Grafana needs to know on which domain and URL it's running
-    domain = "grafana.procopius.dk";
+    domain = "grafana.lab";
     # root_url = "https://monitor.local/grafana/"; # Not needed if it is `https://your.domain/`
     # serve_from_sub_path = true;
   };
@@ -45,12 +45,6 @@
       # One needs to use the following option:
       # datasources.settings.deleteDatasources = [ { name = "prometheus"; orgId = 1; } { name = "loki"; orgId = 1; } ];
 
-      alerting.rules.path = "/etc/grafana/provisioning/alerting/alerts.yml";
-
-      # notifiers.settings = {
-      #   path = "/etc/grafana/provisioning/notifiers";
-      # };
-
       dashboards.settings.providers = [{
         name = "my dashboards";
         options.path = "/etc/grafana-dashboards";
@@ -99,20 +93,4 @@
     group = "grafana";
     mode = "0644";
   };
-
-  # 🔔 Alerts provisioning
-  environment.etc."grafana/provisioning/alerting/alerts.yml" = {
-    source = ./provisioning/alerting/alerts.yml;
-    user = "grafana";
-    group = "grafana";
-    mode = "0644";
-  };
-
-  # 📬 Contact point provisioning
-  environment.etc."grafana/provisioning/notifiers/contact-points.yml" = {
-    source = ./provisioning/notifiers/contact-points.yml;
-    user = "grafana";
-    group = "grafana";
-    mode = "0644";
-  };
 }
diff --git a/nixos/hosts/monitoring/host.nix b/nixos/hosts/monitoring/host.nix
index d2114c4..bccda47 100644
--- a/nixos/hosts/monitoring/host.nix
+++ b/nixos/hosts/monitoring/host.nix
@@ -3,9 +3,12 @@
 {
   imports = [
     ../../templates/base.nix
+    ../../secrets/shared-sops.nix
     ./networking.nix
     ./prometheus.nix
     ./grafana.nix
     ./loki.nix
+    ./alertmanager.nix
+    ./sops.nix
   ];
 }
diff --git a/nixos/hosts/monitoring/networking.nix b/nixos/hosts/monitoring/networking.nix
index 7e6e7a9..a8fd74e 100644
--- a/nixos/hosts/monitoring/networking.nix
+++ b/nixos/hosts/monitoring/networking.nix
@@ -1,17 +1,4 @@
-{ config, lib, pkgs, ... }: {
-
-  networking = {
-    hostName = "monitor";
-    # interfaces.eth0 = {
-    #   ipv4.addresses = [{
-    #     address = "192.168.1.171";
-    #     prefixLength = 24;
-    #   }];
-    # };
-    # firewall.allowedTCPPorts = [ 80 3000 9090 ];
-    # defaultGateway = {
-    #   address = "192.168.1.1";
-    #   interface = "eth0";
-    # };
-  };
+# Host-specific networking for the monitoring machine: only the hostname.
+{ ... }: {
+  networking.hostName = "monitor";
+}
diff --git a/nixos/hosts/monitoring/prometheus.nix b/nixos/hosts/monitoring/prometheus.nix
index ac34e0e..6eed8bc 100644
--- a/nixos/hosts/monitoring/prometheus.nix
+++ b/nixos/hosts/monitoring/prometheus.nix
@@ -1,18 +1,46 @@
 { config, pkgs, modulesPath, lib, ... }:
 
 let
-  monitor_ip = "monitor.local";
-  traefik_ip = "traefik.local";
-  sandbox_ip = "sandbox.local";
-  forgejo_ip = "forgejo.local";
+  monitor_ip = "monitor.lab";
+  traefik_ip = "traefik.lab";
+  sandbox_ip = "sandbox.lab";
+  forgejo_ip = "forgejo.lab";
+  dnsmasq_ip = "dns.lab";
 
   prometheus_exporter_port = 9100;
   postgres_exporter_port = 9187;
   prometheus_port = 9090;
+  alertmanager_port = 9093;
   grafana_port = 3000;
   promtail_port = 9080;
   traefik_monitor_port = 8082;
   forgejo_monitor_port = 3000;
+  dnsmasq_exporter_port = 9153;
+
+  # Formats a scrape target as "host:port".
+  endpoint = host: port: "${host}:${toString port}";
+
+  # Hosts scraped by both the node and promtail jobs.
+  common_hosts = [ monitor_ip traefik_ip sandbox_ip forgejo_ip ];
+
+  # Scrape targets keyed by Prometheus job name; scrapeConfigs below turns
+  # each attribute into one job, so a new entry here adds a scrape job.
+  exporters = {
+    node = map (host: endpoint host prometheus_exporter_port) common_hosts;
+    promtail = map (host: endpoint host promtail_port) common_hosts;
+
+    grafana = [ (endpoint monitor_ip grafana_port) ];
+    prometheus = [ (endpoint monitor_ip prometheus_port) ];
+    alertmanager = [ (endpoint monitor_ip alertmanager_port) ];
+    traefik = [ (endpoint traefik_ip traefik_monitor_port) ];
+    # The job keeps the name gitea although its target is the forgejo host.
+    gitea = [ (endpoint forgejo_ip forgejo_monitor_port) ];
+    postgres = [ (endpoint forgejo_ip postgres_exporter_port) ];
+
+    dnsmasq = [ (endpoint dnsmasq_ip dnsmasq_exporter_port) ];
+  };
+
+
 in {
   networking.firewall.allowedTCPPorts = [ 9090 ];
 
@@ -22,64 +50,27 @@ in {
     globalConfig = {
       scrape_timeout = "10s";
       scrape_interval = "30s";
+      # A short evaluation_interval will check alerting rules very often.
+      # It can be costly if you run Prometheus with 100+ alerts.
+      evaluation_interval = "20s";
     };
-    scrapeConfigs = [
-      {
-        job_name = "node";
-        static_configs = [
-          {
-            targets = [
-              "${monitor_ip}:${toString prometheus_exporter_port}"
-              "${traefik_ip}:${toString prometheus_exporter_port}"
-              "${sandbox_ip}:${toString prometheus_exporter_port}"
-              "${forgejo_ip}:${toString prometheus_exporter_port}"
-            ];
-          }
-        ];
-      }
-      {
-        job_name = "grafana";
-        static_configs = [
-          { targets = [ "${monitor_ip}:${toString grafana_port}" ]; }
-        ];
-      }
-      {
-        job_name = "prometheus";
-        static_configs = [
-          { targets = [ "${monitor_ip}:${toString prometheus_port}" ]; }
-        ];
-      }
-      {
-        job_name = "traefik";
-        static_configs = [
-          { targets = [ "${traefik_ip}:${toString traefik_monitor_port}" ]; }
-        ];
-      }
-      {
-        job_name = "gitea";
-        static_configs = [
-          { targets = [ "${forgejo_ip}:${toString forgejo_monitor_port}" ]; }
-        ];
-      }
-      {
-        job_name = "postgres";
-        static_configs = [
-          { targets = [ "${forgejo_ip}:${toString postgres_exporter_port}" ]; }
-        ];
-      }
-      {
-        job_name = "promtail";
-        static_configs = [
-          {
-            targets = [
-              "${monitor_ip}:${toString promtail_port}"
-              "${traefik_ip}:${toString promtail_port}"
-              "${sandbox_ip}:${toString promtail_port}"
-              "${forgejo_ip}:${toString promtail_port}"
-            ];
-          }
-        ];
-      }
+    # NOTE(review): this flag exposes Prometheus' admin endpoints; TCP 9090 is
+    # opened in the firewall above - confirm that exposure is intended.
+    extraFlags = [ "--web.enable-admin-api" ];
+
+    # One scrape job per attribute of `exporters`, named after the attribute.
+    scrapeConfigs = lib.mapAttrsToList
+      (job: hosts: { job_name = job; static_configs = [ { targets = hosts; } ]; })
+      exporters;
+
+    # 🔔 Alerts provisioning
+    ruleFiles = [
+      (pkgs.writeText "prometheus-alerts.yml" (builtins.readFile ./provisioning/alerts/prometheus-alerts.yml))
+      (pkgs.writeText "loki-alerts.yml" (builtins.readFile ./provisioning/alerts/loki-alerts.yml))
+      (pkgs.writeText "promtail-alerts.yml" (builtins.readFile ./provisioning/alerts/promtail-alerts.yml))
+      (pkgs.writeText "postgres-alerts.yml" (builtins.readFile ./provisioning/alerts/postgres-alerts.yml))
+      (pkgs.writeText "traefik-alerts.yml" (builtins.readFile ./provisioning/alerts/traefik-alerts.yml))
+      (pkgs.writeText "node-exporter-alerts.yml" (builtins.readFile ./provisioning/alerts/node-exporter-alerts.yml))
     ];
   };
 }
diff --git a/nixos/hosts/monitoring/provisioning/alerting/alerts.yml b/nixos/hosts/monitoring/provisioning/alerting/alerts.yml
deleted file mode 100644
index a9fbd51..0000000
--- a/nixos/hosts/monitoring/provisioning/alerting/alerts.yml
+++ /dev/null
@@ -1,265 +0,0 @@
-apiVersion: 1
-
-groups:
-  - name: node-alerts
-    folder: "VM Alerts"
-    interval: 1m
-    rules:
-      - uid: high_cpu
-        title: High CPU Usage
-        condition: A
-        for: 5m
-        annotations:
-          summary: "High CPU on {{ $labels.instance }}"
-          description: "CPU > 90% for 5 minutes"
-        labels:
-          severity: warning
-        data:
-          - refId: A
-            relativeTimeRange: { from: 300, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) by (instance) > 0.9
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: false
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-      - uid: high_memory
-        title: High Memory Usage
-        condition: A
-        for: 5m
-        annotations:
-          summary: "High memory on {{ $labels.instance }}"
-          description: "Memory > 90% for 5 minutes"
-        labels:
-          severity: warning
-        data:
-          - refId: A
-            relativeTimeRange: { from: 300, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.9
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: false
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-      - uid: low_disk
-        title: Low Disk Space
-        condition: A
-        for: 5m
-        annotations:
-          summary: "Low disk on {{ $labels.instance }} {{ $labels.mountpoint }}"
-          description: "< 10% space left"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 300, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.1
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: false
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-      - uid: node_down
-        title: Node is Down
-        condition: A
-        for: 1m
-        annotations:
-          summary: "{{ $labels.instance }} is DOWN"
-          description: "Exporter is not responding"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: up == 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: true
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: prometheus-alerts
-    folder: "Prometheus Alerts"
-    interval: 1m
-    rules:
-      - uid: prometheus_down
-        title: Prometheus is Down
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Prometheus is not responding"
-          description: "up{job='prometheus'} == 0"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: up{job="prometheus"} == 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: true
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: traefik-alerts
-    folder: "Traefik Alerts"
-    interval: 1m
-    rules:
-      - uid: traefik_5xx
-        title: Traefik 5xx Errors
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Traefik has a high rate of 5xx responses"
-          description: "Rate of HTTP 5xx responses is high"
-        labels:
-          severity: warning
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: rate(traefik_service_requests_total{code=~"5.."}[1m]) > 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: false
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: grafana-alerts
-    folder: "Grafana Alerts"
-    interval: 1m
-    rules:
-      - uid: grafana_down
-        title: Grafana is Down
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Grafana is not responding"
-          description: "up{job='grafana'} == 0"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: up{job="grafana"} == 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: true
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: postgres-alerts
-    folder: "Postgres Alerts"
-    interval: 1m
-    rules:
-      - uid: postgres_down
-        title: Postgres is Down
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Postgres is not responding"
-          description: "up{job='postgres'} == 0"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: up{job="postgres"} == 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: true
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: gitea-alerts
-    folder: "Gitea Alerts"
-    interval: 1m
-    rules:
-      - uid: gitea_down
-        title: Gitea is Down
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Gitea is not responding"
-          description: "up{job='gitea'} == 0"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: up{job="gitea"} == 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: true
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: promtail-alerts
-    folder: "Promtail Alerts"
-    interval: 1m
-    rules:
-      - uid: promtail_down
-        title: Promtail is Down
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Promtail is not responding"
-          description: "up{job='promtail'} == 0"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: up{job="promtail"} == 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: true
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: logs-alerts
-    folder: "Logs Alerts"
-    interval: 1m
-    rules:
-      - uid: failed_ssh_logins
-        title: Failed SSH Logins
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Too many failed SSH login attempts"
-          description: "Check for brute force login attempts"
-        labels:
-          severity: warning
-        data:
-          - refId: A
-            relativeTimeRange: { from: 300, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: sum(rate(failed_ssh_logins[5m])) > 5
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: false
-              intervalMs: 15000
-              maxDataPoints: 43200
diff --git a/nixos/hosts/monitoring/provisioning/alerts/loki-alerts.yml b/nixos/hosts/monitoring/provisioning/alerts/loki-alerts.yml
new file mode 100644
index 0000000..0d4f91e
--- /dev/null
+++ b/nixos/hosts/monitoring/provisioning/alerts/loki-alerts.yml
@@ -0,0 +1,39 @@
+groups:
+  - name: Loki
+
+    rules:
+      # Fires immediately when a Loki process start time changed more than twice in 15m.
+      - alert: LokiProcessTooManyRestarts
+        expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
+        for: 0m
+        labels: { severity: warning }
+        annotations:
+          summary: "Loki process too many restarts (instance {{ $labels.instance }})"
+          description: "A loki process had too many restarts (target {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Over 10% of requests were 5xx for 15m; only namespace/job/route survive the sum, so no instance label.
+      - alert: LokiRequestErrors
+        expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
+        for: 15m
+        labels: { severity: critical }
+        annotations:
+          summary: "Loki request errors (job {{ $labels.job }}, route {{ $labels.route }})"
+          description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # Any increase of loki_panic_total over 10m, held for 5m; the value is a count and only namespace/job survive.
+      - alert: LokiRequestPanic
+        expr: "sum(increase(loki_panic_total[10m])) by (namespace, job) > 0"
+        for: 5m
+        labels: { severity: critical }
+        annotations:
+          summary: "Loki request panic (job {{ $labels.job }})"
+          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }} panics over the last 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      # p99 latency above 1s for 5m (tail routes excluded); grouped by job/route so the annotations can name them.
+      - alert: LokiRequestLatency
+        expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le, namespace, job, route)))  > 1'
+        for: 5m
+        labels: { severity: critical }
+        annotations:
+          summary: "Loki request latency (job {{ $labels.job }}, route {{ $labels.route }})"
+          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts-v2.yml b/nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts-v2.yml
new file mode 100644
index 0000000..97a3390
--- /dev/null
+++ b/nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts-v2.yml
@@ -0,0 +1,299 @@
+groups:
+  - name: NodeExporterV2
+    rules:
+      - alert: Node down
+        expr: up{job="monitoring-pi"} == 0  # NOTE(review): only watches the "monitoring-pi" job - confirm it exists
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Node {{ $labels.instance }} is down"  # was "title"; the other rules in these files use "summary"
+          description: "Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down."
+
+      - alert: HostOutOfMemory  # less than 10% of memory available
+        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host out of memory (instance {{ $labels.instance }})"
+          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostMemoryUnderMemoryPressure  # more than 1000 major page faults/s (1m rate)
+        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
+          description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostUnusualNetworkThroughputIn  # summed receive rate per instance above 100 MiB/s
+        expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
+          description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostUnusualNetworkThroughputOut  # summed transmit rate per instance above 100 MiB/s
+        expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
+          description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostUnusualDiskReadRate  # summed disk read rate per instance above 50 MiB/s
+        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
+          description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostUnusualDiskWriteRate  # summed disk write rate per instance above 50 MiB/s
+        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host unusual disk write rate (instance {{ $labels.instance }})"
+          description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      # Please add ignored mountpoints in node_exporter parameters like
+      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+      - alert: HostOutOfDiskSpace  # under 10% space available on a writable filesystem
+        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host out of disk space (instance {{ $labels.instance }})"
+          description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      # Please add ignored mountpoints in node_exporter parameters like
+      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+      - alert: HostDiskWillFillIn24Hours  # under 10% available and the 1h trend reaches zero within 24h
+        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host disk will fill in 24 hours (instance {{ $labels.instance }})"
+          description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostOutOfInodes  # NOTE(review): only matches a filesystem mounted at /rootfs - confirm the exporter exposes it
+        expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host out of inodes (instance {{ $labels.instance }})"
+          description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostInodesWillFillIn24Hours  # NOTE(review): only matches a filesystem mounted at /rootfs - confirm the exporter exposes it
+        expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host inodes will fill in 24 hours (instance {{ $labels.instance }})"
+          description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostUnusualDiskReadLatency  # average time per completed read above 0.1s
+        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host unusual disk read latency (instance {{ $labels.instance }})"
+          description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostUnusualDiskWriteLatency  # average time per completed write above 0.1s, mmcblk devices excluded
+        expr: rate(node_disk_write_time_seconds_total{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host unusual disk write latency (instance {{ $labels.instance }})"
+          description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostHighCpuLoad  # 100 minus average idle percentage above 80
+        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host high CPU load (instance {{ $labels.instance }})"
+          description: "CPU load is > 80%\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostCpuStealNoisyNeighbor  # average steal time above 10%
+        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host CPU steal noisy neighbor (instance {{ $labels.instance }})"
+          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      # 1000 context switches is an arbitrary number.
+      # Alert threshold depends on nature of application.
+      # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
+      - alert: HostContextSwitching  # context switches per second per CPU above 1000
+        expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host context switching (instance {{ $labels.instance }})"
+          description: "Context switching is growing on node (> 1000 / s)\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostSwapIsFillingUp  # more than 80% of swap in use
+        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host swap is filling up (instance {{ $labels.instance }})"
+          description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostSystemdServiceCrashed  # a systemd unit is in the "failed" state
+        expr: node_systemd_unit_state{state="failed"} == 1
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host SystemD service crashed (instance {{ $labels.instance }})"
+          description: "SystemD service crashed\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostPhysicalComponentTooHot  # any hwmon temperature reading above 75
+        expr: node_hwmon_temp_celsius > 75
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host physical component too hot (instance {{ $labels.instance }})"
+          description: "Physical hardware component too hot\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostNodeOvertemperatureAlarm  # hwmon critical temperature alarm metric equals 1
+        expr: node_hwmon_temp_crit_alarm_celsius == 1
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Host node overtemperature alarm (instance {{ $labels.instance }})"
+          description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostRaidArrayGotInactive  # an MD array reports state "inactive"
+        expr: node_md_state{state="inactive"} > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Host RAID array got inactive (instance {{ $labels.instance }})"
+          description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostRaidDiskFailure  # at least one MD disk in state "failed"
+        expr: node_md_disks{state="failed"} > 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host RAID disk failure (instance {{ $labels.instance }})"
+          description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}"  # "device" (not "md_device") matches the sibling RAID rule
+
+      - alert: HostKernelVersionDeviations  # more than one distinct kernel release prefix across all scraped hosts
+        expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
+        for: 6h
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host kernel version deviations"  # count() drops every label, so no instance is available here
+          description: "Different kernel versions are running\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostOomKillDetected  # OOM-kill counter increased within the last minute
+        expr: increase(node_vmstat_oom_kill[1m]) > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host OOM kill detected (instance {{ $labels.instance }})"
+          description: "OOM kill detected\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostEdacCorrectableErrorsDetected  # correctable-error counter increased within the last minute
+        expr: increase(node_edac_correctable_errors_total[1m]) > 0
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})"
+          description: "Instance has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}"  # window text matches the [1m] range
+
+      - alert: HostEdacUncorrectableErrorsDetected  # raw counter value above zero (no time window)
+        expr: node_edac_uncorrectable_errors_total > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})"
+          description: "Instance has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC since the counter was last reset.\n  VALUE = {{ $value }}"  # expr has no 5-minute window
+
+      - alert: HostNetworkReceiveErrors  # more than 1% of received packets are errors (2m rates)
+        expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }})"
+          description: "Instance interface has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}"  # $value is a ratio, not a count
+
+      - alert: HostNetworkTransmitErrors  # more than 1% of transmitted packets are errors (2m rates)
+        expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }})"
+          description: "Instance interface has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}"  # $value is a ratio, not a count
+
+      - alert: HostNetworkInterfaceSaturated  # receive + transmit rate above 80% of link speed, tap devices excluded
+        expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.device }})"  # the expr selects on "device", not "interface"
+          description: "The network interface is getting overloaded.\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostConntrackLimit  # conntrack table more than 80% full
+        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host conntrack limit (instance {{ $labels.instance }})"
+          description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostClockSkew  # offset beyond +/-0.05s and not shrinking over the last 5m
+        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host clock skew (instance {{ $labels.instance }})"
+          description: "Clock skew detected. Clock is out of sync.\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+
+      - alert: HostClockNotSynchronising  # sync status 0 for the whole last minute and max error at least 16s
+        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host clock not synchronising (instance {{ $labels.instance }})"
+          description: "Clock not synchronising.\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
diff --git a/nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts.yml b/nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts.yml
new file mode 100644
index 0000000..2cd649a
--- /dev/null
+++ b/nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts.yml
@@ -0,0 +1,320 @@
+groups:
+
+- name: NodeExporter
+
+  rules:
+
+    - alert: HostOutOfMemory  # available memory below 10% of total
+      expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host out of memory (instance {{ $labels.instance }})"
+        description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostMemoryUnderMemoryPressure  # more than 1000 major page faults/s (5m rate)
+      expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
+        description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostMemoryIsUnderutilized  # weekly minimum of free memory above 80% of total
+      expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
+      for: 0m
+      labels:
+        severity: info
+      annotations:
+        summary: "Host Memory is underutilized (instance {{ $labels.instance }})"
+        description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostUnusualNetworkThroughputIn  # receive rate above 80% of node_network_speed_bytes
+      expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
+        description: "Host receive bandwidth is high (>80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostUnusualNetworkThroughputOut  # transmit rate above 80% of node_network_speed_bytes
+      expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
+        description: "Host transmit bandwidth is high (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostUnusualDiskReadRate  # NOTE(review): same expression as HostUnusualDiskIo in this group, but with "for: 0m"
+      expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
+        description: "Disk is too busy (IO wait > 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostOutOfDiskSpace  # under 10% space available on a writable, non-fuse/tmpfs/cifs/nfs filesystem
+      expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Host out of disk space (instance {{ $labels.instance }})"
+        description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostDiskMayFillIn24Hours  # 1h linear trend reaches zero available bytes within 86400s while space is still left
+      expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host disk may fill in 24 hours (instance {{ $labels.instance }})"
+        description: "Filesystem will likely run out of space within the next 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostOutOfInodes  # under 10% inodes free on a writable filesystem
+      expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Host out of inodes (instance {{ $labels.instance }})"
+        description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostFilesystemDeviceError  # node_filesystem_device_error equals 1 for a mountpoint
+      expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Host filesystem device error (instance {{ $labels.instance }})"
+        description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostInodesMayFillIn24Hours  # 1h linear trend reaches zero free inodes within 86400s while some are still free
+      expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host inodes may fill in 24 hours (instance {{ $labels.instance }})"
+        description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostUnusualDiskReadLatency  # average time per completed read above 0.1s
+      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host unusual disk read latency (instance {{ $labels.instance }})"
+        description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostUnusualDiskWriteLatency  # average time per completed write above 0.1s
+      expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host unusual disk write latency (instance {{ $labels.instance }})"
+        description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostHighCpuLoad  # busy fraction = 1 - average idle fraction; averaging all non-idle modes together could never reach .80
+      expr: '(1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m]))) > .80'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host high CPU load (instance {{ $labels.instance }})"
+        description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostCpuIsUnderutilized  # even the least idle CPU is more than 80% idle (1h rate), held for a week
+      expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
+      for: 1w
+      labels:
+        severity: info
+      annotations:
+        summary: "Host CPU is underutilized (instance {{ $labels.instance }})"
+        description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostCpuStealNoisyNeighbor  # average steal time above 10%
+      expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host CPU steal noisy neighbor (instance {{ $labels.instance }})"
+        description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostCpuHighIowait  # average iowait fraction above .10
+      expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host CPU high iowait (instance {{ $labels.instance }})"
+        description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostUnusualDiskIo  # disk spent more than 80% of the time doing I/O (5m rate)
+      expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host unusual disk IO (instance {{ $labels.instance }})"
+        description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostContextSwitchingHigh  # per-CPU context-switch rate over 15m is more than twice the 1d rate
+      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host context switching high (instance {{ $labels.instance }})"
+        description: "Context switching is growing on the node (twice the daily average during the last 15m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostSwapIsFillingUp  # more than 80% of swap in use
+      expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host swap is filling up (instance {{ $labels.instance }})"
+        description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostSystemdServiceCrashed  # a systemd unit is in the "failed" state
+      expr: '(node_systemd_unit_state{state="failed"} == 1)'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host systemd service crashed (instance {{ $labels.instance }})"
+        description: "systemd service crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostPhysicalComponentTooHot  # node_hwmon_temp_celsius exceeds node_hwmon_temp_max_celsius
+      expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host physical component too hot (instance {{ $labels.instance }})"
+        description: "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostNodeOvertemperatureAlarm  # either hwmon temperature alarm metric equals 1
+      expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Host node overtemperature alarm (instance {{ $labels.instance }})"
+        description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostSoftwareRaidInsufficientDrives  # fewer active disks than node_md_disks_required
+      expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Host software RAID insufficient drives (instance {{ $labels.instance }})"
+        description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostSoftwareRaidDiskFailure  # at least one MD disk in state "failed"
+      expr: '(node_md_disks{state="failed"} > 0)'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host software RAID disk failure (instance {{ $labels.instance }})"
+        description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostKernelVersionDeviations  # value of node_uname_info changed within the last hour
+      expr: 'changes(node_uname_info[1h]) > 0'
+      for: 0m
+      labels:
+        severity: info
+      annotations:
+        summary: "Host kernel version deviations (instance {{ $labels.instance }})"
+        description: "Kernel version for {{ $labels.instance }} has changed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostOomKillDetected  # OOM-kill counter increased within the last minute
+      expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host OOM kill detected (instance {{ $labels.instance }})"
+        description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostEdacCorrectableErrorsDetected  # correctable-error counter increased within the last minute
+      expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
+      for: 0m
+      labels:
+        severity: info
+      annotations:
+        summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})"
+        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostEdacUncorrectableErrorsDetected  # raw counter value above zero (no time window)
+      expr: '(node_edac_uncorrectable_errors_total > 0)'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})"
+        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC since the counter was last reset.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostNetworkReceiveErrors  # more than 1% of received packets are errors (2m rates); $value is that ratio
+      expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host Network Receive Errors (instance {{ $labels.instance }})"
+        description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostNetworkTransmitErrors  # more than 1% of transmitted packets are errors (2m rates); $value is that ratio
+      expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host Network Transmit Errors (instance {{ $labels.instance }})"
+        description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostNetworkBondDegraded  # node_bonding_active differs from node_bonding_slaves
+      expr: '((node_bonding_active - node_bonding_slaves) != 0)'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host Network Bond Degraded (instance {{ $labels.instance }})"
+        description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostConntrackLimit  # conntrack table more than 80% full
+      expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host conntrack limit (instance {{ $labels.instance }})"
+        description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostClockSkew  # offset beyond +/-0.05s and not shrinking over the last 5m
+      expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host clock skew (instance {{ $labels.instance }})"
+        description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostClockNotSynchronising  # sync status 0 for the whole last minute and max error at least 16s
+      expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Host clock not synchronising (instance {{ $labels.instance }})"
+        description: "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/nixos/hosts/monitoring/provisioning/alerts/postgres-alerts.yml b/nixos/hosts/monitoring/provisioning/alerts/postgres-alerts.yml
new file mode 100644
index 0000000..4757e88
--- /dev/null
+++ b/nixos/hosts/monitoring/provisioning/alerts/postgres-alerts.yml
@@ -0,0 +1,201 @@
+groups:
+  - name: Postgres
+
+    rules:
+      - alert: PostgresqlDown
+        expr: "pg_up == 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Postgresql down (instance {{ $labels.instance }})
+          description: "Postgresql instance is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlRestarted
+        expr: "time() - pg_postmaster_start_time_seconds < 60"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Postgresql restarted (instance {{ $labels.instance }})
+          description: "Postgresql restarted\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlExporterError
+        expr: "pg_exporter_last_scrape_error > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Postgresql exporter error (instance {{ $labels.instance }})
+          description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlTableNotAutoVacuumed
+        expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
+          description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlTableNotAutoAnalyzed
+        expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
+          description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlTooManyConnections
+        expr: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql too many connections (instance {{ $labels.instance }})
+          description: "PostgreSQL instance has too many connections (> 80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlNotEnoughConnections
+        expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Postgresql not enough connections (instance {{ $labels.instance }})
+          description: "PostgreSQL instance should have more connections (> 5)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlDeadLocks
+        expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql dead locks (instance {{ $labels.instance }})
+          description: "PostgreSQL has dead-locks\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlHighRollbackRate
+        expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql high rollback rate (instance {{ $labels.instance }})
+          description: "Ratio of transactions being aborted compared to committed is > 2 %\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlCommitRateLow
+        expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Postgresql commit rate low (instance {{ $labels.instance }})
+          description: "Postgresql seems to be processing very few transactions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlLowXidConsumption
+        expr: "rate(pg_txid_current[1m]) < 5"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql low XID consumption (instance {{ $labels.instance }})
+          description: "Postgresql seems to be consuming transaction IDs very slowly\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlHighRateStatementTimeout
+        expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
+          description: "Postgres transactions showing high rate of statement timeouts\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlHighRateDeadlock
+        expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
+          description: "Postgres detected deadlocks\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlUnusedReplicationSlot
+        expr: "pg_replication_slots_active == 0"
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql unused replication slot (instance {{ $labels.instance }})
+          description: "Unused Replication Slots\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlTooManyDeadTuples
+        expr: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
+          description: "PostgreSQL dead tuples is too large\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlConfigurationChanged
+        expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: Postgresql configuration changed (instance {{ $labels.instance }})
+          description: "Postgres Database configuration change has occurred\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlSslCompressionActive
+        expr: "sum(pg_stat_ssl_compression) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Postgresql SSL compression active (instance {{ $labels.instance }})
+          description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlTooManyLocksAcquired  # held locks as a fraction of max_locks_per_transaction * max_connections
+        expr: "(sum by (instance, job, server) (pg_locks_count) / on (instance, job, server) (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20"
+        for: 2m  # ratio must stay above 0.20 for two minutes before firing
+        labels:
+          severity: critical
+        annotations:
+          summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
+          description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlBloatIndexHigh(>80%)
+        expr: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
+          description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlBloatTableHigh(>80%)
+        expr: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
+          description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlInvalidIndex
+        expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
+        for: 6h
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql invalid index (instance {{ $labels.instance }})
+          description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PostgresqlReplicationLag
+        expr: "pg_replication_lag_seconds > 5"
+        for: 30s
+        labels:
+          severity: warning
+        annotations:
+          summary: Postgresql replication lag (instance {{ $labels.instance }})
+          description: "The PostgreSQL replication lag is high (> 5s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/nixos/hosts/monitoring/provisioning/alerts/prometheus-alerts.yml b/nixos/hosts/monitoring/provisioning/alerts/prometheus-alerts.yml
new file mode 100644
index 0000000..68952cc
--- /dev/null
+++ b/nixos/hosts/monitoring/provisioning/alerts/prometheus-alerts.yml
@@ -0,0 +1,255 @@
+groups:
+  - name: Prometheus
+
+    rules:
+      - alert: PrometheusJobMissing
+        expr: 'absent(up{job="prometheus"})'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus job missing (instance {{ $labels.instance }})
+          description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTargetMissing
+        expr: "up == 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus target missing (instance {{ $labels.instance }})
+          description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusAllTargetsMissing
+        expr: "sum by (job) (up) == 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus all targets missing (instance {{ $labels.instance }})
+          description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTargetMissingWithWarmupTime
+        expr: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
+          description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusConfigurationReloadFailure
+        expr: "prometheus_config_last_reload_successful != 1"
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+          description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTooManyRestarts
+        expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus too many restarts (instance {{ $labels.instance }})
+          description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusAlertmanagerJobMissing
+        expr: 'absent(up{job="alertmanager"})'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
+          description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusAlertmanagerConfigurationReloadFailure
+        expr: "alertmanager_config_last_reload_successful != 1"
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
+          description: "AlertManager configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusAlertmanagerConfigNotSynced
+        expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
+          description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusAlertmanagerE2eDeadManSwitch
+        expr: "vector(1)"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
+          description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusNotConnectedToAlertmanager
+        expr: "prometheus_notifications_alertmanagers_discovered < 1"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
+          description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusRuleEvaluationFailures
+        expr: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTemplateTextExpansionFailures
+        expr: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusRuleEvaluationSlow
+        expr: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
+          description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusNotificationsBacklog
+        expr: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus notifications backlog (instance {{ $labels.instance }})
+          description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusAlertmanagerNotificationFailing
+        expr: "rate(alertmanager_notifications_failed_total[1m]) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
+          description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTargetEmpty
+        expr: "prometheus_sd_discovered_targets == 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus target empty (instance {{ $labels.instance }})
+          description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTargetScrapingSlow
+        expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+          description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusLargeScrape
+        expr: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus large scrape (instance {{ $labels.instance }})
+          description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTargetScrapeDuplicate
+        expr: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0"
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
+          description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTsdbCheckpointCreationFailures
+        expr: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTsdbCheckpointDeletionFailures
+        expr: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTsdbCompactionsFailed
+        expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTsdbHeadTruncationsFailed
+        expr: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTsdbReloadFailures
+        expr: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTsdbWalCorruptions
+        expr: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTsdbWalTruncationsFailed
+        expr: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PrometheusTimeseriesCardinality
+        expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
+          description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/nixos/hosts/monitoring/provisioning/alerts/promtail-alerts.yml b/nixos/hosts/monitoring/provisioning/alerts/promtail-alerts.yml
new file mode 100644
index 0000000..39aee40
--- /dev/null
+++ b/nixos/hosts/monitoring/provisioning/alerts/promtail-alerts.yml
@@ -0,0 +1,21 @@
+groups:
+  - name: Promtail
+
+    rules:
+      - alert: PromtailRequestErrors
+        expr: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10'
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: Promtail request errors (instance {{ $labels.instance }})
+          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: PromtailRequestLatency  # p99 request latency, grouped like PromtailRequestErrors
+        expr: "histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (namespace, job, route, instance, le)) > 1"
+        for: 5m  # latency must stay above 1s for five minutes before firing
+        labels:
+          severity: critical
+        annotations:
+          summary: Promtail request latency (instance {{ $labels.instance }})
+          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/nixos/hosts/monitoring/provisioning/alerts/traefik-alerts.yml b/nixos/hosts/monitoring/provisioning/alerts/traefik-alerts.yml
new file mode 100644
index 0000000..218843c
--- /dev/null
+++ b/nixos/hosts/monitoring/provisioning/alerts/traefik-alerts.yml
@@ -0,0 +1,30 @@
+groups:
+  - name: Traefik
+
+    rules:
+      - alert: TraefikServiceDown  # fires per service when the summed server-up gauge is 0
+        expr: "sum(traefik_service_server_up) by (service) == 0"
+        for: 0m  # no pending period: fires on the first evaluation that matches
+        labels:
+          severity: critical
+        annotations:
+          summary: Traefik service down (instance {{ $labels.instance }})
+          description: "All Traefik services are down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: TraefikHighHttp4xxErrorRateService
+        expr: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: Traefik high HTTP 4xx error rate service (instance {{ $labels.instance }})
+          description: "Traefik service 4xx error rate is above 5%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: TraefikHighHttp5xxErrorRateService
+        expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
+          description: "Traefik service 5xx error rate is above 5%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/nixos/hosts/monitoring/provisioning/templates/telegram.markdown.v2.tmpl b/nixos/hosts/monitoring/provisioning/templates/telegram.markdown.v2.tmpl
new file mode 100644
index 0000000..4f3d3ed
--- /dev/null
+++ b/nixos/hosts/monitoring/provisioning/templates/telegram.markdown.v2.tmpl
@@ -0,0 +1,37 @@
+{{ define "telegram.markdown.message" }}
+{{- if eq .Status "firing" -}}
+    {{- if eq .CommonLabels.severity "critical" -}}
+        🔴 Alert: {{ .CommonLabels.alertname }}
+    {{- else if eq .CommonLabels.severity "warning" -}}
+        🟠 Alert: {{ .CommonLabels.alertname }}
+    {{- else -}}
+        ⚪️ Alert: {{ .CommonLabels.alertname }}
+    {{- end }}
+Status: 🔥 FIRING
+Severity: {{ if eq .CommonLabels.severity "critical" }}🔴 {{ .CommonLabels.severity | title }}{{ else if eq .CommonLabels.severity "warning" }}🟠 {{ .CommonLabels.severity | title }}{{ else }}⚪️ {{ .CommonLabels.severity | title }}{{ end }}
+{{- else if eq .Status "resolved" -}}
+    ⚪️ Alert: {{ .CommonLabels.alertname }}
+Status: ✅ RESOLVED
+Severity: {{ if eq .CommonLabels.severity "critical" }}🟢 {{ .CommonLabels.severity | title }}{{ else if eq .CommonLabels.severity "warning" }}🟢 {{ .CommonLabels.severity | title }}{{ else }}⚪️ {{ .CommonLabels.severity | title }}{{ end }}
+{{- end }}
+
+{{- range .Alerts -}}
+
+{{- if .Labels.job }}
+Job: `{{ .Labels.job }}`
+{{- end }}
+
+{{- if .Labels.namespace }}
+Namespace: `{{ .Labels.namespace }}`
+{{- end }}
+
+{{- if .Labels.instance }}
+Instance: `{{ .Labels.instance }}`
+{{- end }}
+
+{{- if .Annotations.runbook_url }}
+[RunbookURL]({{ .Annotations.runbook_url }})
+
+{{- end }}
+{{- end }}
+{{ end }}
diff --git a/nixos/hosts/monitoring/provisioning/templates/telegram.tmpl b/nixos/hosts/monitoring/provisioning/templates/telegram.tmpl
new file mode 100644
index 0000000..1f814c7
--- /dev/null
+++ b/nixos/hosts/monitoring/provisioning/templates/telegram.tmpl
@@ -0,0 +1,28 @@
+{{ define "alert_list" }}{{/* Renders each alert in the given list (name, severity banner, summary, description, sorted labels), then one footer with dashboard links. */}}{{ range . }}
+---
+🪪 <b>{{ .Labels.alertname }}</b>
+{{- if eq .Labels.severity "critical" }}
+🚨 CRITICAL 🚨 {{ end }}
+{{- if eq .Labels.severity "warning" }}
+⚠️ WARNING ⚠️{{ end }}
+{{- if .Annotations.summary }}
+📝 {{ .Annotations.summary }}{{ end }}
+{{- if .Annotations.description }}
+📖 {{ .Annotations.description }}{{ end }}
+
+🏷 Labels:
+{{ range .Labels.SortedPairs }}  <i>{{ .Name }}</i>: <code>{{ .Value }}</code>
+{{ end }}{{ end }}
+🛠 <a href="https://grafana.procopius.dk">Grafana</a> 💊 <a href="https://alertmanager.procopius.dk">Alertmanager</a> 🛠
+{{ end }}
+
+{{ define "telegram.message" }}
+{{ if gt (len .Alerts.Firing) 0 }}
+🔥 Alerts Firing 🔥
+{{ template "alert_list" .Alerts.Firing }}
+{{ end }}
+{{ if gt (len .Alerts.Resolved) 0 }}
+✅ Alerts Resolved ✅
+{{ template "alert_list" .Alerts.Resolved }}
+{{ end }}
+{{ end }}
diff --git a/nixos/hosts/monitoring/sops.nix b/nixos/hosts/monitoring/sops.nix
new file mode 100644
index 0000000..2b0c9ab
--- /dev/null
+++ b/nixos/hosts/monitoring/sops.nix
@@ -0,0 +1,7 @@
+{ config, lib, ... }:
+{
+  sops.secrets."telegram-alert-bot-token" = {
+    sopsFile = ../../secrets/telegram/secrets.yml;
+    owner = "prometheus";
+  };
+}
diff --git a/nixos/hosts/sandbox/networking.nix b/nixos/hosts/sandbox/networking.nix
index f0397ab..e1419e9 100644
--- a/nixos/hosts/sandbox/networking.nix
+++ b/nixos/hosts/sandbox/networking.nix
@@ -1,20 +1,6 @@
-{ config, lib, pkgs, ... }: {
-
-  networking = {
-    hostName = "sandbox";
-    interfaces.eth0 = {
-      ipv4.addresses = [{
-        address = "192.168.1.148";
-        prefixLength = 24;
-      }];
-      ipv6.addresses = [{
-        address = "fe80::148";
-        prefixLength = 64;
-      }];
-    };
-    defaultGateway = {
-      address = "192.168.1.1";
-      interface = "eth0";
-    };
-  };
+{ config, lib, pkgs, ... }:
+{
+  networking.hostName = "sandbox";
+  networking.interfaces.ens18.useDHCP = true;
+  networking.defaultGateway = "192.168.1.1";
 }
diff --git a/nixos/hosts/traefik/configuration/middlewares.nix b/nixos/hosts/traefik/configuration/middlewares.nix
new file mode 100644
index 0000000..d3995a5
--- /dev/null
+++ b/nixos/hosts/traefik/configuration/middlewares.nix
@@ -0,0 +1,10 @@
+{ lib, config, ... }:
+
+let
+  internalNetwork = "192.168.1.0/24";
+in
+{
+  internal-whitelist = {
+    ipWhiteList.sourceRange = [ internalNetwork ];
+  };
+}
diff --git a/nixos/hosts/traefik/configuration/routers.nix b/nixos/hosts/traefik/configuration/routers.nix
new file mode 100644
index 0000000..71a087b
--- /dev/null
+++ b/nixos/hosts/traefik/configuration/routers.nix
@@ -0,0 +1,140 @@
+{ lib, config, ... }:
+
+{  # attribute set of Traefik HTTP routers, one entry per router name
+  traefik = {
+    rule = "Host(`traefik.procopius.dk`)";
+    service = "traefik";
+    entryPoints = [ "websecure" ];
+    middlewares = [ "internal-whitelist" ];  # restricted by the internal-whitelist middleware
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  proxmox = {  # NOTE(review): no internal-whitelist on this router — confirm public access is intended
+    rule = "Host(`proxmox.procopius.dk`)";
+    service = "proxmox";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  forgejo = {
+    rule = "Host(`git.procopius.dk`)";
+    service = "forgejo";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  prometheus = {
+    rule = "Host(`prometheus.procopius.dk`)";
+    service = "prometheus";
+    entryPoints = [ "websecure" ];
+    middlewares = [ "internal-whitelist" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  grafana = {
+    rule = "Host(`grafana.procopius.dk`)";
+    service = "grafana";
+    entryPoints = [ "websecure" ];
+    middlewares = [ "internal-whitelist" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  alertmanager = {
+    rule = "Host(`alertmanager.procopius.dk`)";
+    service = "alertmanager";
+    entryPoints = [ "websecure" ];
+    middlewares = [ "internal-whitelist" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  jellyfin = {
+    rule = "Host(`jellyfin.procopius.dk`)";
+    service = "jellyfin";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  sonarr = {
+    rule = "Host(`sonarr.procopius.dk`)";
+    service = "sonarr";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  radarr = {
+    rule = "Host(`radarr.procopius.dk`)";
+    service = "radarr";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  ente = {
+    rule = "Host(`ente.procopius.dk`)";
+    service = "ente";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  photos = {
+    rule = "Host(`photos.procopius.dk`)";
+    service = "photos";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  minio = {
+    rule = "Host(`minio.procopius.dk`)";
+    service = "minio";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  minio-api = {
+    rule = "Host(`minio-api.procopius.dk`)";
+    service = "minio-api";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  account = {
+    rule = "Host(`account.procopius.dk`)";
+    service = "account";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  auth = {
+    rule = "Host(`auth.procopius.dk`)";
+    service = "auth";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  nas = {
+    rule = "Host(`nas.procopius.dk`)";
+    service = "nas";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  umami = {
+    rule = "Host(`umami.procopius.dk`)";
+    service = "umami";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  mesterjakob = {
+    rule = "Host(`mester.jakobblum.dk`)";  # the only router on a different domain
+    service = "mesterjakob";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+
+  catchAll = {  # matches any host name and forwards to the nginx service
+    rule = "HostRegexp(`.+`)";
+    service = "nginx";
+    entryPoints = [ "websecure" ];
+    tls = { certResolver = "letsencrypt"; };
+  };
+}
diff --git a/nixos/hosts/traefik/configuration/services.nix b/nixos/hosts/traefik/configuration/services.nix
new file mode 100644
index 0000000..971183d
--- /dev/null
+++ b/nixos/hosts/traefik/configuration/services.nix
@@ -0,0 +1,38 @@
+{ lib, config, ... }:
+
+{  # Traefik HTTP services (backend URLs) keyed by the service names the routers reference
+  proxmox.loadBalancer.servers = [ { url = "https://192.168.1.205:8006"; } ];
+  proxmox.loadBalancer.serversTransport = "insecureTransport";  # HTTPS backend using the insecureTransport servers transport
+
+  traefik.loadBalancer.servers = [ { url = "http://localhost:8080"; } ];  # NOTE(review): presumably the api.insecure dashboard port — confirm
+
+  forgejo.loadBalancer.servers = [ { url = "http://forgejo.lab:3000"; } ];
+
+  nginx.loadBalancer.servers = [ { url = "https://192.168.1.226:4433"; } ];  # target of the catchAll router
+  nginx.loadBalancer.serversTransport = "insecureTransport";
+
+  prometheus.loadBalancer.servers = [ { url = "http://monitor.lab:9090"; } ];
+  grafana.loadBalancer.servers = [ { url = "http://monitor.lab:3000"; } ];
+  alertmanager.loadBalancer.servers = [ { url = "http://monitor.lab:9093"; } ];
+
+
+  # from nginx
+  account.loadBalancer.servers = [ { url = "http://192.168.1.226:3001"; } ];
+  auth.loadBalancer.servers = [ { url = "http://192.168.1.226:3005"; } ];
+  ente.loadBalancer.servers = [ { url = "http://192.168.1.226:8087"; } ];
+  photos.loadBalancer.servers = [ { url = "http://192.168.1.226:3000"; } ];
+  minio.loadBalancer.servers = [ { url = "http://192.168.1.226:3201"; } ];
+  minio-api.loadBalancer.servers = [ { url = "http://192.168.1.226:3200"; } ];
+
+  nas.loadBalancer.servers = [ { url = "https://192.168.1.226:5001"; } ];
+  nas.loadBalancer.serversTransport = "insecureTransport";
+
+
+  jellyfin.loadBalancer.servers = [ { url = "http://192.168.1.226:8096"; } ];
+  radarr.loadBalancer.servers = [ { url = "http://192.168.1.226:7878"; } ];
+  sonarr.loadBalancer.servers = [ { url = "http://192.168.1.226:8989"; } ];
+
+  umami.loadBalancer.servers = [ { url = "http://192.168.1.226:3333"; } ];
+
+  mesterjakob.loadBalancer.servers = [ { url = "http://192.168.1.226:4200"; } ];
+}
diff --git a/nixos/hosts/traefik/configuration/static.nix b/nixos/hosts/traefik/configuration/static.nix
new file mode 100644
index 0000000..22bd1f4
--- /dev/null
+++ b/nixos/hosts/traefik/configuration/static.nix
@@ -0,0 +1,61 @@
+{ lib, config, ... }:
+
+{  # Traefik static configuration (assigned to services.traefik.staticConfigOptions)
+  entryPoints = {
+    web = {
+      address = ":80";
+      asDefault = true;
+      http.redirections.entrypoint = {  # redirect requests arriving on :80 to websecure over https
+        to = "websecure";
+        scheme = "https";
+      };
+    };
+
+    websecure = {
+      address = ":443";
+      http.tls.certResolver = "letsencrypt";
+    };
+
+    metrics = {
+      address = ":8082";  # entry point referenced by metrics.prometheus below
+    };
+  };
+
+  api = {
+    dashboard = true;
+    insecure = true;  # NOTE(review): insecure API mode; the firewall also opens 8080 — confirm this exposure is intended
+  };
+
+  certificatesResolvers = {
+    letsencrypt = {
+      acme = {
+        email = "david.mikael@proton.me";
+        storage = "/var/lib/traefik/acme.json";
+        # httpChallenge = {
+        #   entryPoint = "web";
+        # };
+        dnsChallenge = {  # DNS challenge via Cloudflare; credentials are supplied through the traefik service environment
+          provider = "cloudflare";
+          delayBeforeCheck = 10;
+          resolvers = [ "1.1.1.1:53" "8.8.8.8:53" ];
+        };
+      };
+    };
+  };
+
+  metrics = {
+    prometheus = {
+      entryPoint = "metrics";
+    };
+  };
+
+  log = {
+    level = "DEBUG";  # NOTE(review): DEBUG is verbose — consider INFO once the setup is stable
+    filePath = "/var/log/traefik/traefik.log";
+  };
+
+  accessLog = {
+    format = "json";
+    filePath = "/var/log/traefik/access.log";
+  };
+}
diff --git a/nixos/hosts/traefik/networking.nix b/nixos/hosts/traefik/networking.nix
index 9d07344..4c09486 100644
--- a/nixos/hosts/traefik/networking.nix
+++ b/nixos/hosts/traefik/networking.nix
@@ -1,18 +1,13 @@
 { config, lib, pkgs, ... }: {
 
-  networking = {
-    hostName = "traefik";
-    interfaces.eth0 = {
-      ipv4.addresses = [{
-        address = "192.168.1.171";
-        prefixLength = 24;
-      }];
-    };
-    firewall.allowedTCPPorts = [ 80 443 8080 8082 ];
+  networking.hostName = "traefik";
+  networking.interfaces.eth0.ipv4.addresses = [{  # static IPv4 address on eth0
+    address = "192.168.1.80";
+    prefixLength = 24;
+  }];
 
-    defaultGateway = {
-      address = "192.168.1.1";
-      interface = "eth0";
-    };
-  };
+  networking.firewall.allowedTCPPorts = [ 80 443 8080 8082 ];  # 80/443 = web/websecure, 8082 = metrics; NOTE(review): 8080 presumably the insecure API/dashboard — confirm it must be open
+
+  networking.nameservers = [ "192.168.1.53" ];  # presumably the internal DNS host resolving *.lab names — verify
+  networking.defaultGateway = "192.168.1.1";
 }
diff --git a/nixos/hosts/traefik/promtail.nix b/nixos/hosts/traefik/promtail.nix
index 4a48e8d..7b4f91b 100644
--- a/nixos/hosts/traefik/promtail.nix
+++ b/nixos/hosts/traefik/promtail.nix
@@ -14,10 +14,10 @@
         {
           targets = [ "localhost" ];
           labels = {
-            job = "traefik";
+            job = "/var/log/traefik/*.log";
             host = config.networking.hostName;
             env = "proxmox";
-            instance = "${config.networking.hostName}.local"; # prometheus scrape target
+            instance = "${config.networking.hostName}.lab"; # prometheus scrape target
             __path__ = "/var/log/traefik/*.log";
           };
         }
diff --git a/nixos/hosts/traefik/traefik.nix b/nixos/hosts/traefik/traefik.nix
index 36e27a9..52462bd 100644
--- a/nixos/hosts/traefik/traefik.nix
+++ b/nixos/hosts/traefik/traefik.nix
@@ -1,158 +1,36 @@
-{ config, lib, pkgs, ... }: {
+{ config, lib, pkgs, ... }:
 
-  # Traefik reverse proxy setup
+let
+  staticConfig = import ./configuration/static.nix { inherit lib config; };  # used as staticConfigOptions
+  middlewaresConfig = import ./configuration/middlewares.nix { inherit lib config; };  # used as http.middlewares
+  routersConfig = import ./configuration/routers.nix { inherit lib config; };  # used as http.routers
+  servicesConfig = import ./configuration/services.nix { inherit lib config; };  # used as http.services
+in
+{
   services.traefik = {
     enable = true;
 
-    staticConfigOptions = {
-      entryPoints = {
-        web = {
-          address = ":80";
-          asDefault = true;
-          http.redirections.entrypoint = {
-            to = "websecure";
-            scheme = "https";
-          };
-        };
+    # ==== Static Configuration ====
+    staticConfigOptions = staticConfig;
 
-        websecure = {
-          address = ":443";
-          asDefault = true;
-          http.tls.certResolver = "letsencrypt";
-        };
+    # ==== Dynamic Configuration ====
+    dynamicConfigOptions.http = {
+      routers = routersConfig;
+      services = servicesConfig;
+      middlewares = middlewaresConfig;
 
-        metrics = {
-          address = ":8082";
+      serversTransports = {
+        insecureTransport = {
+          insecureSkipVerify = true;
         };
       };
-
-      api.dashboard = true;
-      api.insecure = true;
-
-      # Enable Let's Encrypt
-      certificatesResolvers = {
-        letsencrypt = {
-          acme = {
-            email = "david.mikael@proton.me";  # Replace with your email
-            storage = "/var/lib/traefik/acme.json";  # Location to store ACME certificates
-            httpChallenge = {
-              entryPoint = "web";  # Uses HTTP challenge (can also use DNS)
-            };
-            # Uncomment the following for staging (testing) environment
-            # caServer = "https://acme-staging-v02.api.letsencrypt.org/directory";
-          };
-        };
-      };
-
-      # Enable Prometheus metrics
-      metrics = {
-        prometheus = {
-          entryPoint = "metrics";
-        };
-      };
-      log = {
-        level = "DEBUG";
-        filePath = "/var/log/traefik/traefik.log";
-      };
-
-      accessLog = {
-        format = "json";
-        filePath = "/var/log/traefik/access.log";
-      };
-
-      # Enable access logs (you can customize the log format)
-      # accessLog = {
-      #   filePath = "/var/log/traefik/access.log";  # Log to a file
-      #   format = "common";  # You can adjust this to `json` or `common`
-      # };
-      # tracing = {
-      #   enabled = true;
-      #   provider = "jaeger";  # or zipkin, or other
-      #   jaeger = {
-      #     apiURL = "http://localhost:5775";  # Replace with your Jaeger instance URL
-      #   };
-      # };
-    };
-
-    dynamicConfigOptions = {
-      # Add IP whitelisting middleware to restrict access to internal network only
-      http.middlewares = {
-        internal-whitelist = {
-          ipWhiteList = {
-            sourceRange = ["192.168.1.0/24"]; # Adjust to your internal network range
-            # Alternatively use `127.0.0.1/32` for localhost access
-          };
-        };
-      };
-
-      # Route to Proxmox UI
-      http.routers.proxmox = {
-        rule = "Host(`proxmox.procopius.dk`)";
-        service = "proxmox";
-        entryPoints = [ "web" "websecure" ];
-        tls = {
-          certResolver = "letsencrypt";  # Use Let's Encrypt
-        };
-      };
-      # Route to Traefik Dashboard
-      http.routers.traefik = {
-        rule = "Host(`traefik.procopius.dk`)";
-        service = "traefik";
-        entryPoints = [ "web" "websecure" ];
-        middlewares = ["internal-whitelist"];
-        tls = {
-          certResolver = "letsencrypt";  # Use Let's Encrypt
-        };
-      };
-
-      http.routers.forgejo = {
-        rule = "Host(`git.procopius.dk`)";
-        service = "forgejo";
-        entryPoints = [ "web" "websecure" ];
-        tls = {
-          certResolver = "letsencrypt";  # Use Let's Encrypt
-        };
-      };
-
-      # Route to Traefik Dashboard
-      http.routers.catchAll = {
-        # rule = "Host(`jellyfin.procopius.dk`)";
-        rule = "HostRegexp(`.+`)";
-        # rule = "HostRegexp(`{host:.+}`)";
-        service = "nginx";
-        entryPoints = [ "web" "websecure" ];
-        tls = {
-          certResolver = "letsencrypt";  # Use Let's Encrypt
-        };
-      };
-
-
-      # Define the services
-      http.services.proxmox.loadBalancer.servers = [
-        { url = "https://192.168.1.205:8006"; }  # Proxmox
-      ];
-      http.services.proxmox.loadBalancer.serversTransport = "insecureTransport";
-
-
-      http.services.traefik.loadBalancer.servers = [
-        { url = "http://traefik.local:8080"; }  # Traefik Dashboard
-      ];
-
-      http.services.forgejo.loadBalancer.servers = [
-        { url = "http://192.168.1.249:3000"; }  # forgejo
-      ];
-
-      http.services.nginx.loadBalancer.servers = [
-        { url = "https://192.168.1.226:4433"; }  # nginx
-      ];
-      http.services.nginx.loadBalancer.serversTransport = "insecureTransport";
-
-
-      http.serversTransports.insecureTransport.insecureSkipVerify = true;
-
     };
   };
 
-  # Optionally, you can add Docker support if using Docker Compose
+  # Cloudflare API tokens are secrets: keep CLOUDFLARE_DNS_API_TOKEN=... / CLOUDFLARE_ZONE_API_TOKEN=...
+  # in a sops-encrypted env file, never inline. NOTE: rotate any token that was ever committed in plaintext.
+  sops.secrets."traefik-cloudflare-env".sopsFile = ../../secrets/traefik/secrets.yml;  # create with `sops`; assumes sops-nix is imported on this host
+  services.traefik.environmentFiles = [ config.sops.secrets."traefik-cloudflare-env".path ];
+
   virtualisation.docker.enable = true;
 }
diff --git a/nixos/hosts/vpn/README.md b/nixos/hosts/vpn/README.md
new file mode 100644
index 0000000..e76e6ef
--- /dev/null
+++ b/nixos/hosts/vpn/README.md
@@ -0,0 +1,127 @@
+Great question — and you're absolutely right to ask.
+
+You **don’t need** Innernet or Headscale+patch *if* you're comfortable building a small self-hosted portal that handles:
+
+* ✅ OIDC login
+* ✅ WireGuard peer key management
+* ✅ Config generation and download
+
+So let’s break it down:
+
+---
+
+## 🧩 Why *Innernet* and *Headscale+patch* Exist
+
+Those tools solve **user/device coordination**, dynamic routing, and access control *automatically*, **with batteries included**. They're great if you want a more managed, “plug-and-play” experience. Specifically:
+
+| Tool                     | Purpose                                                           |
+| ------------------------ | ----------------------------------------------------------------- |
+| **Innernet**             | Full WireGuard-based mesh + OIDC login + per-device management    |
+| **Headscale**            | Tailscale backend, enables user login + ACL + device coordination |
+| **Headscale+OIDC patch** | Add OIDC login to Headscale (by default, only pre-shared keys)    |
+
+These tools do things like:
+
+* Assign IPs/DNS names to users
+* Regenerate keys
+* Handle coordination between peers
+* Gate access with identity
+
+---
+
+## 🛠️ Why You *Don’t* Need Them (Your Custom Portal Idea)
+
+If you're fine with:
+
+* Managing WireGuard server and IPs manually
+* Writing a small web service to issue `.conf` files after login
+* Using firewall rules for access control
+
+Then you absolutely **can** build a custom system around **WireGuard + OIDC** like this:
+
+---
+
+### 🧱 Architecture: WireGuard + OIDC + Config Generator
+
+```
+                 ┌──────────────┐
+                 │ Your OIDC    │
+                 │ provider     │◄────── Google, Authentik, Keycloak
+                 └─────┬────────┘
+                       │
+                  (OIDC Login)
+                       │
+              ┌────────▼────────┐
+              │ Web Portal      │◄────── nginx + Go, Node.js, Flask, etc.
+              │ (self-hosted)   │
+              └──┬────────────┬─┘
+      Gen Keys  │            │ Serve config
+     & register │            └─────> `client.conf` / QR code
+                ▼
+      ┌──────────────────────┐
+      │ WireGuard server     │ (NixOS)
+      │ - Handles all peers  │
+      └──────────────────────┘
+```
+
+---
+
+### 🔐 What Your Portal Needs to Do
+
+* Authenticate via OIDC (OAuth2 flow)
+* Store a list of "authorized users"
+* When a user logs in:
+
+  * Check if they already have a WireGuard peer entry
+  * If not: generate a keypair, assign IP (e.g. `10.100.0.5`)
+  * Append to server config (`/etc/wireguard/wg0.conf`) or use `wg set`
+  * Return config file or QR code for the mobile client
+
+---
+
+## ✅ Benefits of This DIY Approach
+
+* 💡 Full control: you own every step
+* 🔐 OIDC is the gatekeeper (centrally managed access)
+* ⚡ No extra daemon/software overhead (just WireGuard + your app)
+* 🧩 Easily extensible (e.g., device limits, IP lease expiration)
+
+---
+
+## ⚙️ Tools You Can Use
+
+| Purpose                | Tools                                                                      |
+| ---------------------- | -------------------------------------------------------------------------- |
+| OIDC Auth              | [Dex](https://github.com/dexidp/dex), Keycloak, Authentik, or Firebase     |
+| Web Portal Backend     | Node.js (Express), Go, Flask, FastAPI                                      |
+| Config Template Engine | Go templates, EJS, Jinja2                                                  |
+| WireGuard Integration  | `wg`, `wg-quick`, or directly edit `wg0.conf`                              |
+| NixOS Support          | Use `networking.wireguard.interfaces.wg0.peers` to declaratively add peers |
+
+---
+
+## 🧪 Bonus Features You Could Build
+
+* 🔐 Per-user firewall rules
+* 🔁 One-time config URLs (`vpn.mydomain.com/invite/xyz`)
+* 🔐 Optional Authelia/Keycloak MFA
+* 📉 Dashboard with currently active peers (use `wg show`)
+
+---
+
+## ✨ Summary
+
+| Option            | You Need Innernet / Headscale? | Login via OIDC? | DNS Routing? | Complexity |
+| ----------------- | ------------------------------ | --------------- | ------------ | ---------- |
+| Your own portal   | ❌ No                           | ✅ Yes           | ❌ Manual     | 🟡 Medium  |
+| Innernet          | ✅ Yes                          | ✅ Yes           | ✅ Built-in   | 🟠 Medium  |
+| Headscale + patch | ✅ Yes                          | ✅ Yes           | ✅ Built-in   | 🔴 High    |
+
+You’re on a great path by choosing the custom VPN portal idea.
+Let me know if you'd like:
+
+* A **sample architecture repo**
+* A **NixOS module to support peer configs**
+* Help building the **login + config generator backend**
+
+I can generate a Nix flake and a working OIDC portal template to kickstart the project.
diff --git a/nixos/modules/promtail.nix b/nixos/modules/promtail.nix
index af5d478..84147a0 100644
--- a/nixos/modules/promtail.nix
+++ b/nixos/modules/promtail.nix
@@ -20,7 +20,7 @@ in
         filename = "/var/lib/promtail/positions.yaml";
       };
       clients = [{
-        url = "http://monitor.local:3100/loki/api/v1/push";
+        url = "http://monitor.lab:3100/loki/api/v1/push";
       }];
       scrape_configs = [{
         job_name = "journal";
@@ -30,13 +30,14 @@ in
             job = "promtail";
             host = config.networking.hostName;
             env = "proxmox";
-            instance = "${config.networking.hostName}.local";
+            instance = "${config.networking.hostName}.lab";
           };
         };
         relabel_configs = [{
           source_labels = ["__journal__systemd_unit"];
           target_label = "unit";
         }];
+
       }];
     };
   };
diff --git a/nixos/secrets/HOWTO.md b/nixos/secrets/HOWTO.md
new file mode 100644
index 0000000..791314d
--- /dev/null
+++ b/nixos/secrets/HOWTO.md
@@ -0,0 +1,59 @@
+### 🔧 Using Secrets in NixOS Configurations
+
+You can use decrypted SOPS secrets in your `configuration.nix`, service modules, and flake-based setups.
+
+#### 🔑 1. Use as environment variable (e.g. password)
+
+```nix
+systemd.services.my-service.serviceConfig.EnvironmentFile =
+  config.sops.secrets."my-password".path;
+```
+
+> Your `secrets.yaml` should contain:
+>
+> ```yaml
+> my-password: PASSWORD=supersecret
+> ```
+
+---
+
+#### 🗂 2. Use as file source (e.g. private key or token)
+
+```nix
+environment.etc."ssh/id_ed25519".source =
+  config.sops.secrets."ssh-private-key".path;
+```
+
+> This places the decrypted secret at `/etc/ssh/id_ed25519` with appropriate permissions.
+
+---
+
+#### 👤 3. Read a secret value directly (not recommended for sensitive data)
+
+```nix
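+# Caution: builtins.readFile runs at evaluation time, when the decrypted file does not exist yet, and the value would end up in the world-readable Nix store; prefer passing the path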
+services.myapp.settings.apiKey = builtins.readFile config.sops.secrets."api-key".path;
+```
+
+---
+
+#### 🛠 4. Use in systemd preStart scripts
+
+```nix
+systemd.services.my-service.preStart = ''
+  export PASSWORD=$(<${config.sops.secrets."my-password".path})
+  ./myapp --auth $PASSWORD
+'';
+```
+
+---
+
+#### 🧠 5. Use in Forgejo user creation
+
+```nix
+systemd.services.forgejo.preStart = ''
+  ${lib.getExe cfg.package} admin user create \
+    --username admin \
+    --password "$(tr -d '\n' < ${config.sops.secrets."admin-password".path})"
+'';
+```
diff --git a/nixos/secrets/README.md b/nixos/secrets/README.md
index a536d14..f14a377 100644
--- a/nixos/secrets/README.md
+++ b/nixos/secrets/README.md
@@ -1,51 +1,104 @@
+# 🔐 Secrets Management (with SOPS + Nix)
 
-🔑 2. Generate an age Keypair
+This directory contains encrypted secrets used across the infrastructure managed by NixOS and [sops-nix](https://github.com/Mic92/sops-nix). Secrets are stored using [SOPS](https://github.com/mozilla/sops) and encrypted with an `age` key located on each host at `/etc/sops/age.key`.
 
-age-keygen -o secrets/age.key
+---
 
-This will output something like:
-
-# created: 2025-06-02T22:00:00Z
-# public key: age1abcdefghijk...
-
-Copy that public key somewhere — you’ll need it for encrypting.
-
-✅ You should now have:
+## 📁 Directory Structure
 
+```
 secrets/
-├── age.key        # keep this safe and private!
+├── forgejo/
+│   ├── secrets.yml             # Forgejo-specific secrets (admin password, DB password, secret key)
+│   └── runner-secrets.yml      # Forgejo runner secrets (registration token)
+├── shared/
+│   └── secrets.yml             # Shared secrets used across multiple VMs (SSH keys, tokens)
+└── telegram/secrets.yml        # Telegram alert bot token
+```
 
-📝 3. Create Encrypted Secrets File
+---
 
-sops --age age1abcdefghijk... secrets/secrets.yaml
+## 🛠 SOPS Basics
 
-This opens a YAML file in your $EDITOR. Add secrets like:
+### ✅ Encrypt a **new secret file**
 
-forgejo-admin-password: "my-super-secret-password"
+```bash
+sops --age <YOUR-AGE-PUBKEY> secrets/myservice/secrets.yml
+```
+Example:
+```bash
+sops --age "$(age-keygen -y ~/.config/sops/age/keys.txt)" secrets/forgejo/secrets.yml
+```
+> Press `i` to enter edit mode if prompted, or fill it using YAML format:
+```yaml
+admin-password: hunter2
+db-password: supersecret
+```
 
-Save and close the file — it’s now encrypted using the public key.
+---
 
-✅ Now you should have:
+### ✏️ Edit secrets in an existing file
 
-secrets/
-├── age.key
-├── secrets.yaml   # encrypted file (safe to commit)
+```bash
+sops secrets/forgejo/secrets.yml
+```
 
-You can commit secrets.yaml, but do not commit age.key unless you're OK with putting it on a VM.
+---
 
+## 🧬 Using Secrets in Nix
 
-🧪 Test Decryption Locally
+### 🧩 Option 1: Reference shared secrets (via `defaultSopsFile`)
 
-export SOPS_AGE_KEY_FILE=secrets/age.key
+```nix
+# shared-sops.nix
+{
+  sops = {
+    age.keyFile = "/etc/sops/age.key";
+    defaultSopsFile = ./shared/secrets.yml;
 
-To test:
+    secrets = {
+      "monitoring-token".owner = "prometheus";
+    };
+  };
+}
+```
 
-sops -d secrets/secrets.yaml
+Then in services:
 
-To edit:
+```nix
+environment.etc."monitoring/token".source = config.sops.secrets."monitoring-token".path;
+```
 
-sops secrets/secrets.yaml
+---
 
+### 🧩 Option 2: Reference per-service secrets with explicit `sopsFile`
 
+```nix
+# forgejo/sops.nix
+{
+  sops.secrets = {
+    "admin-password" = {
+      sopsFile = ../../secrets/forgejo/secrets.yml;
+      owner = "forgejo";
+    };
+  };
+}
+```
 
-[plasmagoat@forgejo:~]$ sudo chmod 400 /etc/sops/age.key && sudo chown root:root /etc/sops/age.key
+---
+
+## 🧪 Testing secrets setup
+
+Check which secrets will be applied:
+
+```bash
+nixos-rebuild dry-activate --flake .#my-hostname
+```
+
+---
+
+## 📚 Resources
+
+* [sops-nix](https://github.com/Mic92/sops-nix)
+* [Mozilla SOPS](https://github.com/mozilla/sops)
+* [age encryption](https://github.com/FiloSottile/age)
diff --git a/nixos/secrets/forgejo/runner-secrets.yml b/nixos/secrets/forgejo/runner-secrets.yml
new file mode 100644
index 0000000..1165df5
--- /dev/null
+++ b/nixos/secrets/forgejo/runner-secrets.yml
@@ -0,0 +1,16 @@
+forgejo-runner-registration-token: ENC[AES256_GCM,data:ms0Ouy5GP6rlwkiLXoq31ZPSi9bpDKpNOqzEFATHLHflt+YTIjWuPAVRvKEIEQ==,iv:z2snOwdGq3e7Mxl+CmnoOh8c+ZaA+6lNDdXh2vVLULM=,tag:5ZpELR8K5JBQraMBYdXSuA==,type:str]
+sops:
+    age:
+        - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
+          enc: |
+            -----BEGIN AGE ENCRYPTED FILE-----
+            YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSA0d3kzWXd2RElqdnViNGJG
+            NHd5bER1S2dVQUpxOER6Mi9TYlVuOHFhVzNNCnNWZVJzdU1LSG4yR3BNdmFEVzA2
+            bFNzK2cxNG9OcTB6NC8wdDAxcCtDekkKLS0tIExQdWNJQnBmb05RMktoeXF0dDZC
+            M3FyUEswckYrUDdvdmdUYnBqaTZFcncK8aNh8jL8nzYv2vWwhxX4QPed1pjFr2zK
+            9znxO+osZsUNIXySioLBfsA1kfqZCzaASsM2ezfWHKt1nCVQAvbXGA==
+            -----END AGE ENCRYPTED FILE-----
+    lastmodified: "2025-06-06T21:10:54Z"
+    mac: ENC[AES256_GCM,data:cDk2zKgxX01y/X9eQCbLm6OW74nE9HJdtliE6iye3gsDKbM+SqCuU1JTBvEcOAeROLn4svJmlRe3DDTGhrnuNO8tL8qLXKt2oQ0CM+A/3kXBb/jG13ps57fEpD32u/QbK6smVDS0Li+TCHEtfqiLyVat42lgyy9kakgjOll//K0=,iv:K8ly08WGyHLpk07oUwaO7ygEqcriJ3Uq1Ev/FtUcfiY=,tag:8VRCalipvZv0DAOAu9tSlg==,type:str]
+    unencrypted_suffix: _unencrypted
+    version: 3.10.2
diff --git a/nixos/secrets/forgejo/secrets.yml b/nixos/secrets/forgejo/secrets.yml
new file mode 100644
index 0000000..94f993a
--- /dev/null
+++ b/nixos/secrets/forgejo/secrets.yml
@@ -0,0 +1,18 @@
+forgejo-admin-password: ENC[AES256_GCM,data:S05b/J9AK2SuIKDSWmtRf72C7V5FwMgZv/o5yxzNXRZEH2eIm18sC6+FEg==,iv:Ig/c4K9Io0S07Ywl4JQtbfxhjXJ7Rvea7+N4KhLUqjc=,tag:rx44tRuAbERBZR45QN6b9A==,type:str]
+forgejo-db-password: ENC[AES256_GCM,data:5YwRl6HNa1LzJgr73ArllG9s+vWCS7m/s6QQh5YUz8I0anG7GQ==,iv:5ARq3unUy2xbDcAFkucvEhjz/QYC2rYgutEo4T2bw2E=,tag:k7eHKqeA7k6XzksLVcnXRw==,type:str]
+forgejo-secret-key: ENC[AES256_GCM,data:iserDzOnJkM4HLP4c6rekSFANtRmEXwuCPyfMqo=,iv:3CNqN/DyS4PIl/iOO4JCpWJn3ARlb5KQSCNv5Orx2mo=,tag:q34jEpGrK2EKf0bcBznpQQ==,type:str]
+sops:
+    age:
+        - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
+          enc: |
+            -----BEGIN AGE ENCRYPTED FILE-----
+            YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBjeTdJNVExVjB2dzF0dTRu
+            ZEV1RHlvd3VPNUZ1b0FsQW14bHJOUUM5Z1NjCmhudWRoUjd5a3dWSEhwK1dDd0hK
+            N1JUUHhlOVFGVWxwalpvbXJVMlhtcGcKLS0tIFJmRjM4bnJ0TUIyWElaUUd3Y2Zq
+            LzBHRWZXODVDZTE2WnVZOGNQckk4KzAKdm3xnA03JnQnc07yhVVtYkVYS6654Zm1
+            4AcLRSCcWvWrvp26XYVE2UGqU7acfxrTsk07o0nHAQpa5LjgJ4oFKw==
+            -----END AGE ENCRYPTED FILE-----
+    lastmodified: "2025-06-06T18:38:08Z"
+    mac: ENC[AES256_GCM,data:BvpIz6tfVSR3m1l7g4ilUyoTKKqirt+k6tPizxCsAgjztt0IyDCio+cLTln4P1tGSy/frjvbxy1mR3tIDkWn6aDFoYz/gnsbTKHSo/K5Q77jJ3uJffoB3/Wruigojl3EBIQHALicq9xhF8rsH/RKjpWqh+TrQwO+ibbA6ff76cw=,iv:Z0ZwJ9aPpI9MtbsZnvFkW7zsFFOMj5/Gv+tF/mal+yI=,tag:knf01NC/XwgjPUHH+8RpSg==,type:str]
+    unencrypted_suffix: _unencrypted
+    version: 3.10.2
diff --git a/nixos/secrets/secrets.yaml b/nixos/secrets/secrets.yaml
deleted file mode 100644
index e4bc266..0000000
--- a/nixos/secrets/secrets.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-forgejo-admin-password: ENC[AES256_GCM,data:cLC4JQC8PMF4/aeVBzOROupPLzd7TbYwvudr7yVx4YpLCGSmYXRwJQAoXg==,iv:tG2kL66ZshwZkJodZQ5K8SZKfG1eJYeX9eYsZ7yM7rA=,tag:0roW0M9eUmzejkH6pwN/IA==,type:str]
-forgejo-db-password: ENC[AES256_GCM,data:0KZJHmNuxpO8TmLNuryipICPTjG9h56+II1Azk+v3fkE5MAb9g==,iv:zb14BvbC2OehCYATgMMoPXv742jjD4v0B12cVhNCWBw=,tag:pnrboj5IvwXYXaZJbZpxTQ==,type:str]
-hello: ENC[AES256_GCM,data:XkOLnE2Mkunc0zNF1932jOuz1olAwWf56lkqL2dt+h99WoL/vNLfSQ0al8NfEA==,iv:WC2xbB9WmB/khOVjdClFerJ8kjtHjaR/p6rDYaaDZhY=,tag:tT92FNrRm74XoZxoFFXm5g==,type:str]
-example_key: ENC[AES256_GCM,data:kBk87OXu+qfJjP/2EA==,iv:64WcHaVfQrVCouUCZoHk0z/4ii8U9m61/E9SqLeB3Ms=,tag:MZJ6m7m4+s6BNGhtNs+ZFQ==,type:str]
-#ENC[AES256_GCM,data:lM4LNQNU2S66a73pUymyUA==,iv:pAHgR+ViSO3Ff2zSaZQcXNGb2r2KH+ZbRd33vpq8ncs=,tag:WTNQCjaESLXTXwcwZePU2A==,type:comment]
-example_array:
-    - ENC[AES256_GCM,data:Sc1q0Yd3sQ6eOzSwfQA=,iv:L4YBbWWeQZAYROHpiNEtHLDCdcuW+vvEpYhGxD0b62g=,tag:82L6MlHWIMpxKb4B3+Lszg==,type:str]
-    - ENC[AES256_GCM,data:Ud9dpSAcHc8NOq48wQI=,iv:9ERTBUQqKHPUIG57KXbRPMXN37cx+WcxOCDxCWpbE1k=,tag:ftTGF/obIJVZSTodIGoABw==,type:str]
-example_number: ENC[AES256_GCM,data:1Xvp578L4rjW6g==,iv:82z/MQM586y4WilPZgmisa2C7GTdG0vmIEkyx/aMCXw=,tag:UtNDNKbu0tuhSyu1OQiJJA==,type:float]
-example_booleans:
-    - ENC[AES256_GCM,data:RkxG/g==,iv:RNZpV/1KRWOazIuHj+SH7r3AmwnRBIUgXgfDplrk5X0=,tag:cKv0dVJGQcluscNspIrPgg==,type:bool]
-    - ENC[AES256_GCM,data:PvghSeY=,iv:xPlMb1LMsg5gAWsCXT3UnMyOfQmSKDKdDrjt+n9+Nqs=,tag:B2aROAGdcupDmoOHAiXeTg==,type:bool]
-sops:
-    age:
-        - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
-          enc: |
-            -----BEGIN AGE ENCRYPTED FILE-----
-            YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBwVElvVXluZCsxK1BiT3c2
-            Zm9kaURNdnZ2Nk9EM0dld2tjdFhrZlFiSEVnCk8zZVpWWlFXS3JYS0Q2WHExLzFU
-            WkFwcDFmR3VrdHFmS2JmVC95TnZIMjQKLS0tIGsyVmp1Sm1uL3FKVWlERUZHdmVw
-            TG9HYXdUdlZNYXJUZng2ejBwbjJoNVkK0ER6mqLdz0hEaovWME4p56tjuYbPIuhb
-            X1smwLmHxgcRboeFU5dyp3wZKBg7ccRPneQKsgJvYb929BesynHr6g==
-            -----END AGE ENCRYPTED FILE-----
-    lastmodified: "2025-06-03T16:03:32Z"
-    mac: ENC[AES256_GCM,data:mLCtH1EPm1cD7KD/fCVO0hrIfG6AOl396kcwdahyr326IRvTneT+6lr+f0XAHSkPXtRsmSCiD9WNhLYAh/kCfsP7tVPKl4X17OHkK9blUJ5JpuqnZJfOQ3PXNitYFvcSUUi1Y1/vIQmDf52oTPlcZgxmTgsQj4MEJIIni7d0SOc=,iv:MhAJ0QAdyHv8BzHIBQ/lZ7zV/MKjcsicbBOw9kwo7Nc=,tag:qrfTfCPxAMvXOm69BMWJ4g==,type:str]
-    unencrypted_suffix: _unencrypted
-    version: 3.10.2
diff --git a/nixos/secrets/shared-sops.nix b/nixos/secrets/shared-sops.nix
new file mode 100644
index 0000000..dd47aec
--- /dev/null
+++ b/nixos/secrets/shared-sops.nix
@@ -0,0 +1,11 @@
+{
+  sops = {
+    age.keyFile = "/etc/sops/age.key";  # age key file sops-nix reads on the host
+    defaultSopsFile = ./shared/secrets.yml;  # used by secrets that do not set their own sopsFile
+    secrets = {  # no shared secrets are enabled yet; the entries below are commented-out examples
+      # "monitoring-token".owner = "prometheus";
+      # "ssh-private-key".owner = "root";
+      # "ssh-public-key".owner = "root";
+    };
+  };
+}
diff --git a/nixos/secrets/shared/secrets.yml b/nixos/secrets/shared/secrets.yml
new file mode 100644
index 0000000..53b1990
--- /dev/null
+++ b/nixos/secrets/shared/secrets.yml
@@ -0,0 +1,16 @@
+test: ENC[AES256_GCM,data:a+pmog==,iv:3Ledge90oTzTM8uNFWWIgLafa7/Hhx9WzXRAS3flUZo=,tag:mfWiEWxkZVihuX3S3SY12w==,type:str]
+sops:
+    age:
+        - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
+          enc: |
+            -----BEGIN AGE ENCRYPTED FILE-----
+            YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBudWI1RXZBOGFoaGIwWWc2
+            cXdpems4SmY1QjNhOWxHTVBhR1ZPYjdSZUcwClhHb1IzSTBmcFllbWl2UVpabGFo
+            Sm1neTlUVVUwalZUSWVLaVNBUWhUSTAKLS0tIG5yVHBiWGMwMG1OTnBXckh4eXdv
+            MU5UQ29lMmw0ZEJnV09IWEpWc2NYT2MKfl+3cZvgunrgGr7KhjGZOlJ0EKRAiAED
+            pDGr25OcjQgpsg4/LCPPOMsi9Yyi/RICJGyDDINaTLMEQDhIsoOyUQ==
+            -----END AGE ENCRYPTED FILE-----
+    lastmodified: "2025-06-06T17:29:59Z"
+    mac: ENC[AES256_GCM,data:nvb3Wc3578e45ob2FyyWlsadVOdErTfJ2Ni5jb06f/WbzDkyJd3lCBRTUIAdyXijT4ErtogHImBjYXzRuCi9xP68mTtaoQb6l8bULKJLdY/yDcMzMyKGZLDxTVW80nLvDrqs5piKBYFWtyFaAEio8fVlA4RIUsyFx/mgcbI3ChA=,iv:9/DmD48MKzBNGSODUr4jqDv17r2o4xgH7TVbpQeuyCU=,tag:1uVui9sVI9SfTlgtqPCLMA==,type:str]
+    unencrypted_suffix: _unencrypted
+    version: 3.10.2
diff --git a/nixos/secrets/sops.nix b/nixos/secrets/sops.nix
deleted file mode 100644
index 05c4290..0000000
--- a/nixos/secrets/sops.nix
+++ /dev/null
@@ -1,8 +0,0 @@
-{ config, lib, ... }:
-{
-  sops = {
-    defaultSopsFile = ./secrets.yaml;
-    age.keyFile = "/etc/sops/age.key";
-    #secrets."forgejo-admin-password".owner = "forgejo";
-  };
-}
diff --git a/nixos/secrets/telegram/secrets.yml b/nixos/secrets/telegram/secrets.yml
new file mode 100644
index 0000000..e5d637e
--- /dev/null
+++ b/nixos/secrets/telegram/secrets.yml
@@ -0,0 +1,16 @@
+telegram-alert-bot-token: ENC[AES256_GCM,data:mM1aYhpcCecRUdwkdlBKA+dWOHZEwUvP+m4MIg4n89SzgY8GWw0z1OaIpxfR0w==,iv:tzmCjiYntDYpkO4S0a/tMQkfGQpZjLBiBu4Rs/5RHbc=,tag:5cZDEK474WzXwIW5Jc7S5w==,type:str]
+sops:
+    age:
+        - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
+          enc: |
+            -----BEGIN AGE ENCRYPTED FILE-----
+            YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSA5RE82S05lbHV6aE9qZFhL
+            MTZ4Y1R6cXFSZUFhZHdHbzZ5OWdrOXhwdEZ3CkxzM1NtSjNzeXM5byt1VzVtRHpH
+            Tk9ORUtEZ05FMTgrYVNhU3dKRkFKaHMKLS0tIFlLeWJ1dmJsRWc5SkhDbjdEb0or
+            UHl6emN0My8wcFZWYlZEaElrb2NidjgKlZols9SJQxgaoOdJJxghqlACBcwuFs94
+            IGAOoQVUSFhMCWzyXqAQ/1/VkbWqfiUmvqDa3ulEK2Ri+1F+u3mB1Q==
+            -----END AGE ENCRYPTED FILE-----
+    lastmodified: "2025-06-06T21:21:32Z"
+    mac: ENC[AES256_GCM,data:YS7BLFXkQ/A5PVLVOyMaqRHGavY0YttFps3njzSiYgBUa4VfPHqMcl2fW5vMec5MwM3GKPFGtrSEZKK1NVqLxUWZrfIF6ugAZ4vhRCyWe1Kze2Zs2S0ia2C3mUdhQR2wb7M7YzohI/e7PDZo0UcrcG3YeEzS5NL7qb0hzFsrGLY=,iv:kqzD06q5X0ZkZ1sIoUQz05b6QRDWQVsPqQYxPP2OAl8=,tag:eexvJspUxpDpwJqU1zEMnA==,type:str]
+    unencrypted_suffix: _unencrypted
+    version: 3.10.2