colmena initial implementation for sandbox and monitor

All checks were successful: Hello World / test (push), successful in 4s

parent a90630ecb6
commit 5feb74d56d

40 changed files with 27629 additions and 141 deletions
@@ -1,61 +0,0 @@
name: "Build NixOS Image"

on:
  workflow_dispatch:

jobs:
  build:
    runs-on: native
    steps:
      - name: Install nodejs
        run: nix-env -iA nixpkgs.nodejs

      - name: Checkout repo
        uses: actions/checkout@v4

      # - name: Install Nix
      #   uses: cachix/install-nix-action@v31
      #   with:
      #     nix_path: nixpkgs=channel:nixos-unstable
      #     extra_nix_config: |
      #       experimental-features = nix-command flakes

      # - name: Enable experimental features
      #   run: |
      #     mkdir -p ~/.config/nix
      #     echo "experimental-features = nix-command flakes" >> ~/.config/nix/nix.conf

      # - name: Update Channel
      #   run: nix-channel --update

      - name: Build NixOS image
        working-directory: nixos
        run: nix build .#proxmoxTemplate

      # - name: Upload & Restore to Proxmox
      #   working-directory: nixos
      #   env:
      #     PROXMOX_SSH_KEY: ${{ secrets.PROXMOX_SSH_KEY }}
      #     PROXMOX_HOST: 192.168.1.205
      #     PROXMOX_USER: root
      #   run: |
      #     set -e

      #     IMAGE_NAME="vm-image.vma.zst"
      #     REMOTE_PATH="/var/lib/vz/template/$IMAGE_NAME"
      #     VM_ID="9000"

      #     echo "Starting ssh-agent and uploading..."
      #     eval "$(ssh-agent -s)"
      #     ssh-add <(echo "$PROXMOX_SSH_KEY")

      #     echo "Uploading image..."
      #     scp -o StrictHostKeyChecking=no ./result/$IMAGE_NAME $PROXMOX_USER@$PROXMOX_HOST:$REMOTE_PATH

      #     echo "Restoring VM $VM_ID..."
      #     ssh -o StrictHostKeyChecking=no $PROXMOX_USER@$PROXMOX_HOST "
      #       qm stop $VM_ID || true
      #       qm destroy $VM_ID || true
      #       qmrestore --unique $REMOTE_PATH $VM_ID
      #       qm template $VM_ID
      #     "
53 .forgejo/workflows/colmena-apply.yml Normal file

@@ -0,0 +1,53 @@
name: "Colmena apply"
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "v*" # triggers on v1.0.0, v1.2.3, etc.
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
apply:
|
||||
name: Apply flake configurations to colmena hive
|
||||
# Ensure 'nixos-latest' runner has Docker, SSH client, and basic Nix tools installed.
|
||||
# It seems it already does.
|
||||
runs-on: nixos-latest
|
||||
env:
|
||||
NIXOS_BUILER_HOST: nixos-builder.lab
|
||||
NIXOS_BUILER_USER: runner
|
||||
|
||||
steps:
|
||||
# Use nix-env for setup (as you prefer and it works well for ephemeral environments)
|
||||
- name: Install dependencies via nix-env
|
||||
run: |
|
||||
nix-env -iA nixpkgs.nodejs
|
||||
nix-env -iA nixpkgs.openssh
|
||||
nix-env -if https://github.com/zhaofengli/colmena/tarball/main
|
||||
nix-env -iA cachix -f https://cachix.org/api/v1/install
|
||||
cachix use plasmagoat
|
||||
cachix authtoken ${{ secrets.CACHIX_AUTH_TOKEN }}
|
||||
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Enable experimental features
|
||||
run: |
|
||||
mkdir -p ~/.config/nix
|
||||
echo "experimental-features = nix-command flakes" >> ~/.config/nix/nix.conf
|
||||
|
||||
- name: Prepare SSH keys and known_hosts for builder and Proxmox
|
||||
run: |
|
||||
mkdir -p ~/.ssh
|
||||
echo "${{ secrets.RUNNER_SSH_KEY }}" > ~/.ssh/id_rsa
|
||||
chmod 600 ~/.ssh/id_rsa
|
||||
ssh-keyscan -H "$NIXOS_BUILER_HOST" >> ~/.ssh/known_hosts
|
||||
chmod 600 ~/.ssh/known_hosts
|
||||
|
||||
- name: Test SSH connection to NixOS Builder
|
||||
run: |
|
||||
echo "Testing SSH connection to $NIXOS_BUILER_HOST..."
|
||||
ssh -o StrictHostKeyChecking=yes "$NIXOS_BUILER_USER"@"$NIXOS_BUILER_HOST" "echo 'SSH success. Hostname:' && hostname"
|
||||
|
||||
- name: Apply Colmena
|
||||
id: apply
|
||||
run: colmena apply
|
||||
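Before pushing a `v*` tag, the same deployment can be exercised locally with the plain colmena CLI — a sketch (commands are standard colmena; the tag comes from hive.nix added in this commit):

    colmena build                # evaluate and build every node, no activation
    colmena apply --on @sandbox  # limit activation to the sandbox node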
@@ -1,39 +0,0 @@
name: Deploy NixOS VM

on:
  workflow_dispatch:

jobs:
  deploy:
    runs-on: docker
    container:
      image: nixos/nix
    steps:
      - name: Checkout repo
        uses: actions/checkout@v4

      - name: Install Terraform
        run: nix-env -iA nixpkgs.terraform

      - name: Setup SSH key
        run: |
          mkdir -p ~/.ssh
          echo "$SSH_PRIVATE_KEY" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
        env:
          SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Terraform Init & Apply
        run: |
          terraform init
          terraform apply -auto-approve
        working-directory: ./terraform
        env:
          PROXMOX_PASSWORD: ${{ secrets.PROXMOX_PASSWORD }}

      - name: Deploy NixOS via nixos-anywhere
        run: |
          nix run github:numtide/nixos-anywhere -- \
            --build-on-remote \
            --flake .#new-vm \
            root@<new-vm-ip>
@@ -1,34 +0,0 @@
name: Terraform Proxmox NixOS VM Deploy

on:
  workflow_dispatch:

jobs:
  deploy-nixos-vm:
    runs-on: nixos-latest
    steps:
      - name: Install nodejs
        run: nix-env -iA nixpkgs.nodejs

      - name: Install terraform
        run: nix-env -iA nixpkgs.terraform

      - name: Install sops
        run: nix-env -iA nixpkgs.sops

      - name: Checkout repo
        uses: actions/checkout@v3

      - name: Decrypt secrets
        env:
          SOPS_AGE_KEY_FILE: ${{ secrets.AGE_KEY_FILE }}
        run: |
          sops --decrypt secrets.yaml.enc > secrets.yaml

      - name: Terraform Init
        run: terraform init

      - name: Terraform Apply
        env:
          PROXMOX_PASSWORD: ${{ secrets.PROXMOX_PASSWORD }}
        run: terraform apply -auto-approve
3 .sops.yaml Normal file

@@ -0,0 +1,3 @@
creation_rules:
  - age: >-
      age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
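With this creation rule, sops encrypts matching files to the listed age recipient automatically. A sketch of the intended flow (paths follow the sops-nix settings later in this commit):

    # create or edit the encrypted secrets file; sops reads .sops.yaml for recipients
    sops secrets/secrets.yaml

    # each node decrypts at activation time with the age key at /etc/sops/age.key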
@@ -55,7 +55,7 @@
       register: qga_json
       failed_when: qga_json.rc != 0

-    - name: Parse out eth0’s IPv4 address
+    - name: Parse out eth0's IPv4 address
       ansible.builtin.set_fact:
         vm_ipv4: >-
           {{

@@ -71,11 +71,11 @@
           )
         }}

-    - name: Show the VM’s IP
+    - name: Show the VM's IP
       ansible.builtin.debug:
         msg: "VM {{ new_vmid }} ({{ new_name }}) reports IPv4: {{ vm_ipv4 }}"

-    - name: Add new VM’s IP to in-memory inventory (for later tasks)
+    - name: Add new VM's IP to in-memory inventory (for later tasks)
       ansible.builtin.add_host:
         name: "nixos-{{ new_vmid }}"
         ansible_host: "{{ vm_ipv4 }}"
154 flake.lock generated Normal file

@@ -0,0 +1,154 @@
{
  "nodes": {
    "colmena": {
      "inputs": {
        "flake-compat": "flake-compat",
        "flake-utils": "flake-utils",
        "nix-github-actions": "nix-github-actions",
        "nixpkgs": "nixpkgs",
        "stable": "stable"
      },
      "locked": {
        "lastModified": 1751144689,
        "narHash": "sha256-cgIntaqhcm62V1KU6GmrAGpHpahT4UExEWW2ryS02ZU=",
        "owner": "zhaofengli",
        "repo": "colmena",
        "rev": "3ceec72cfb396a8a8de5fe96a9d75a9ce88cc18e",
        "type": "github"
      },
      "original": {
        "owner": "zhaofengli",
        "repo": "colmena",
        "type": "github"
      }
    },
    "flake-compat": {
      "flake": false,
      "locked": {
        "lastModified": 1650374568,
        "narHash": "sha256-Z+s0J8/r907g149rllvwhb4pKi8Wam5ij0st8PwAh+E=",
        "owner": "edolstra",
        "repo": "flake-compat",
        "rev": "b4a34015c698c7793d592d66adbab377907a2be8",
        "type": "github"
      },
      "original": {
        "owner": "edolstra",
        "repo": "flake-compat",
        "type": "github"
      }
    },
    "flake-utils": {
      "locked": {
        "lastModified": 1659877975,
        "narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "nix-github-actions": {
      "inputs": {
        "nixpkgs": [
          "colmena",
          "nixpkgs"
        ]
      },
      "locked": {
        "lastModified": 1729742964,
        "narHash": "sha256-B4mzTcQ0FZHdpeWcpDYPERtyjJd/NIuaQ9+BV1h+MpA=",
        "owner": "nix-community",
        "repo": "nix-github-actions",
        "rev": "e04df33f62cdcf93d73e9a04142464753a16db67",
        "type": "github"
      },
      "original": {
        "owner": "nix-community",
        "repo": "nix-github-actions",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1750134718,
        "narHash": "sha256-v263g4GbxXv87hMXMCpjkIxd/viIF7p3JpJrwgKdNiI=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "9e83b64f727c88a7711a2c463a7b16eedb69a84c",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs_2": {
      "locked": {
        "lastModified": 1751801514,
        "narHash": "sha256-Ve3ZTzcXEGt4IoXLsWqk35w3w4cH5G1MJb+gLdj/jtE=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "4e3e6431fd60d653bb7f4fa5487e2c500d50f49f",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "colmena": "colmena",
        "nixpkgs": "nixpkgs_2",
        "sops-nix": "sops-nix"
      }
    },
    "sops-nix": {
      "inputs": {
        "nixpkgs": [
          "nixpkgs"
        ]
      },
      "locked": {
        "lastModified": 1751606940,
        "narHash": "sha256-KrDPXobG7DFKTOteqdSVeL1bMVitDcy7otpVZWDE6MA=",
        "owner": "Mic92",
        "repo": "sops-nix",
        "rev": "3633fc4acf03f43b260244d94c71e9e14a2f6e0d",
        "type": "github"
      },
      "original": {
        "owner": "Mic92",
        "repo": "sops-nix",
        "type": "github"
      }
    },
    "stable": {
      "locked": {
        "lastModified": 1750133334,
        "narHash": "sha256-urV51uWH7fVnhIvsZIELIYalMYsyr2FCalvlRTzqWRw=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "36ab78dab7da2e4e27911007033713bab534187b",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-25.05",
        "repo": "nixpkgs",
        "type": "github"
      }
    }
  },
  "root": "root",
  "version": 7
}
34 flake.nix Normal file

@@ -0,0 +1,34 @@
{
  description = "Declarative NixOS HomeLab";

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs";
    # systems.url = "github:nix-systems/default";
    sops-nix = {
      url = "github:Mic92/sops-nix";
      inputs.nixpkgs.follows = "nixpkgs";
    };
    # home-manager = {
    #   url = "home-manager";
    #   inputs.nixpkgs.follows = "nixpkgs";
    # };
    colmena.url = "github:zhaofengli/colmena";
  };

  outputs = {
    self,
    nixpkgs,
    # systems,
    sops-nix,
    # home-manager,
    colmena,
    ...
  } @ inputs: let
    overlays = [
      colmena.overlays.default
    ];
  in {
    colmenaHive = colmena.lib.makeHive self.outputs.colmena;
    colmena = (import ./hive.nix) (inputs // {inherit overlays;});
  };
}
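A note on the two outputs: `colmena` is the raw hive attrset, and `colmenaHive` is the pre-evaluated form that recent colmena CLI versions look for first. A sketch of standard invocations from the repo root (nothing repo-specific beyond the output names above):

    colmena build   # evaluate the hive and build every node's closure
    colmena apply   # build and activate on all reachable nodes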
48 hive.nix Normal file

@@ -0,0 +1,48 @@
inputs @ {
  self,
  nixpkgs,
  sops-nix,
  # home-manager,
  overlays,
  ...
}: {
  meta = {
    nixpkgs = import nixpkgs {
      system = "x86_64-linux";
    };
    specialArgs.flakeInputs = inputs;
  };

  defaults = {
    pkgs,
    lib,
    name,
    nodes,
    meta,
    config,
    ...
  }: {
    imports = [
      ./machines/_default
      ./machines/modules
      sops-nix.nixosModules.sops
      # home-manager.nixosModules.home-manager
    ];
    nixpkgs = {
      inherit overlays;
      system = lib.mkDefault "x86_64-linux";
      config.allowUnfree = true;
    };
    deployment.tags = [config.nixpkgs.system name];
  };

  sandbox = {name, ...}: {
    imports = [./machines/${name}/definition.nix];
    deployment.tags = ["sandbox"];
  };

  monitor = {name, ...}: {
    imports = [./machines/${name}/definition.nix];
    deployment.tags = ["grafana" "prometheus"];
  };
}
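Every node gets `deployment.tags` from `defaults` (its system and name) plus the per-node tags above, so deploys can be scoped with colmena's standard `@tag` selector — a sketch:

    colmena apply --on @sandbox      # only the sandbox node
    colmena apply --on @prometheus   # the monitor node, via one of its tags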
89 machines/_default/common_config.nix Normal file

@@ -0,0 +1,89 @@
{
  pkgs,
  lib,
  modulesPath,
  ...
}: {
  imports = [
    # Enables QEMU Guest Agent support in the VM
    (modulesPath + "/profiles/qemu-guest.nix")
  ];

  services.qemuGuest.enable = lib.mkDefault true;

  boot.loader.grub.enable = lib.mkDefault true;
  boot.loader.grub.devices = ["nodev"];

  boot.growPartition = lib.mkDefault true;

  boot.tmp.cleanOnBoot = true;

  fileSystems."/" = lib.mkDefault {
    device = "/dev/disk/by-label/nixos";
    autoResize = true; # grow on first boot
    fsType = "ext4";
  };

  nix = {
    gc.automatic = true;
    gc.options = "--delete-older-than 15d";
    gc.dates = "daily";
    optimise.automatic = true;
    settings = {
      experimental-features = ["nix-command" "flakes"];
      auto-optimise-store = true;
      allowed-users = ["@wheel"];
      trusted-users = ["root" "@wheel"];
    };
    extraOptions = ''
      keep-outputs = true
      keep-derivations = true
    '';
  };

  security.sudo.wheelNeedsPassword = false;

  users.users.plasmagoat = {
    isNormalUser = true;
    description = "plasmagoat";
    extraGroups = ["wheel" "docker"];
    # shell = pkgs.zsh;
    # shell = pkgs.fish;
  };

  services.openssh.enable = true;
  services.openssh.openFirewall = true;
  services.openssh.settings.PasswordAuthentication = false;
  services.openssh.settings.PermitRootLogin = "prohibit-password";
  services.openssh.settings.KbdInteractiveAuthentication = false;

  services.sshguard.enable = true;

  programs.ssh.startAgent = true;

  users.users.plasmagoat.openssh.authorizedKeys.keys = [
    "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCeg/n/vst9KME8byhxX2FhA+FZNQ60W38kkNt45eNzK5zFqBYuwo1nDXVanJSh9unRvB13b+ygpZhrb4sHvkETGWiEioc49MiWr8czEhu6Wpo0vv5MAJkiYvGZUYPdUW52jUzWcYdw8PukG2rowrxL5G0CmsqLwHMPU2FyeCe5aByFI/JZb8R80LoEacgjUiipJcoLWUVgG2koMomHClqGu+16kB8nL5Ja3Kc9lgLfDK7L0A5R8JXhCjrlEsmXbxZmwDKuxvjDAZdE9Sl1VZmMDfWkyrRlenrt01eR3t3Fec6ziRm5ZJk9e2Iu1DPoz+PoHH9aZGVwmlvvnr/gMF3OILxcqb0qx+AYlCCnb6D6pJ9zufhZkKcPRS1Q187F6fz+v2oD1xLZWFHJ92+7ItM0WmbDOHOC29s5EA6wNm3iXZCq86OI3n6T34njDtPqh6Z7Pk2sdK4GBwnFj4KwEWXvdKZKSX1qb2EVlEBE9QI4Gf3eg4SiBu2cAFt3nOSzs8c= asol\\dbs@ALPHA-DBS-P14sG2"
    "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC+U3DWOrklcA8n8wdbLBGyli5LsJI3dpL2Zod8mx8eOdC4H127ZT1hzuk2uSmkic4c73BykPyQv8rcqwaRGW94xdMRanKmHYxnbHXo5FBiGrCkNlNNZuahthAGO49c6sUhJMq0eLhYOoFWjtf15sr5Zu7Ug2YTUL3HXB1o9PZ3c9sqYHo2rC/Il1x2j3jNAMKST/qUZYySvdfNJEeQhMbQcdoKJsShcE3oGRL6DFBoV/mjJAJ+wuDhGLDnqi79nQjYfbYja1xKcrKX+D3MfkFxFl6ZIzomR1t75AnZ+09oaWcv1J7ehZ3h9PpDBFNXvzyLwDBMNS+UYcH6SyFjkUbF David@NZXT"
    "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICUP7m8jZJiclZGfSje8CeBYFhX10SrdtjYziuChmj1X plasmagoat@macbook-air"
  ];

  users.users.root.openssh.authorizedKeys.keys = [
    "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCeg/n/vst9KME8byhxX2FhA+FZNQ60W38kkNt45eNzK5zFqBYuwo1nDXVanJSh9unRvB13b+ygpZhrb4sHvkETGWiEioc49MiWr8czEhu6Wpo0vv5MAJkiYvGZUYPdUW52jUzWcYdw8PukG2rowrxL5G0CmsqLwHMPU2FyeCe5aByFI/JZb8R80LoEacgjUiipJcoLWUVgG2koMomHClqGu+16kB8nL5Ja3Kc9lgLfDK7L0A5R8JXhCjrlEsmXbxZmwDKuxvjDAZdE9Sl1VZmMDfWkyrRlenrt01eR3t3Fec6ziRm5ZJk9e2Iu1DPoz+PoHH9aZGVwmlvvnr/gMF3OILxcqb0qx+AYlCCnb6D6pJ9zufhZkKcPRS1Q187F6fz+v2oD1xLZWFHJ92+7ItM0WmbDOHOC29s5EA6wNm3iXZCq86OI3n6T34njDtPqh6Z7Pk2sdK4GBwnFj4KwEWXvdKZKSX1qb2EVlEBE9QI4Gf3eg4SiBu2cAFt3nOSzs8c= asol\\dbs@ALPHA-DBS-P14sG2"
    "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC+U3DWOrklcA8n8wdbLBGyli5LsJI3dpL2Zod8mx8eOdC4H127ZT1hzuk2uSmkic4c73BykPyQv8rcqwaRGW94xdMRanKmHYxnbHXo5FBiGrCkNlNNZuahthAGO49c6sUhJMq0eLhYOoFWjtf15sr5Zu7Ug2YTUL3HXB1o9PZ3c9sqYHo2rC/Il1x2j3jNAMKST/qUZYySvdfNJEeQhMbQcdoKJsShcE3oGRL6DFBoV/mjJAJ+wuDhGLDnqi79nQjYfbYja1xKcrKX+D3MfkFxFl6ZIzomR1t75AnZ+09oaWcv1J7ehZ3h9PpDBFNXvzyLwDBMNS+UYcH6SyFjkUbF David@NZXT"
    "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICUP7m8jZJiclZGfSje8CeBYFhX10SrdtjYziuChmj1X plasmagoat@macbook-air"
  ];

  environment.systemPackages = with pkgs; [
    dig
    nmap
    traceroute
    vim
    git
    curl
    python3
  ];

  time.timeZone = "Europe/Copenhagen";

  console.keyMap = "dk-latin1";
}
40 machines/_default/default.nix Normal file

@@ -0,0 +1,40 @@
{
  lib,
  name,
  ...
}: {
  imports = [
    ./common_config.nix
  ];

  networking.hostName = name;

  deployment = {
    replaceUnknownProfiles = lib.mkDefault true;
    buildOnTarget = lib.mkDefault false;
    targetHost = lib.mkDefault "${name}.lab";
    tags = lib.mkDefault ["homelab"];
  };

  sops = {
    age.keyFile = "/etc/sops/age.key";
    defaultSopsFile = ../../secrets/secrets.yml;
  };

  # home-manager = {
  #   useGlobalPkgs = true;
  #   useUserPackages = true;
  #   users.cottand = {
  #     imports = with flakeInputs.cottand.homeManagerModules; [cli];
  #     home.stateVersion = "22.11";
  #   };
  #   users.root = {
  #     imports = with flakeInputs.cottand.homeManagerModules; [cli];
  #     home.stateVersion = "22.11";
  #   };
  # };

  # consulNode.enable = lib.mkDefault true;
  nodeExporter.enable = lib.mkDefault true;
  journalLog.enable = lib.mkDefault true;
}
11 machines/modules/default.nix Normal file

@@ -0,0 +1,11 @@
{
  imports = [
    ./node-exporter.nix
    ./journal-log.nix
    # ./wireguard.nix
    # ./nomad.nix
    # ./vault.nix
    # ./vaultSecret.nix
    # ./consul.nix
  ];
}
95 machines/modules/journal-log.nix Normal file

@@ -0,0 +1,95 @@
{
  lib,
  config,
  nodes,
  # name,
  # meta,
  ...
}:
with lib; let
  cfg = config.journalLog;
in {
  options.journalLog = {
    enable = mkOption {
      type = types.bool;
      default = false;
    };

    port = mkOption {
      type = types.number;
      default = 9080;
    };

    clientUrl = mkOption {
      type = types.str;
      default = "http://monitor.lab:3100/loki/api/v1/push";
    };

    extraConfig = mkOption {
      type = types.attrs;
      default = {};
    };
  };

  config = mkIf cfg.enable {
    networking.firewall.allowedTCPPorts = [cfg.port];

    systemd.tmpfiles.rules = [
      "d /var/lib/promtail 0755 promtail promtail -"
    ];

    services.promtail = {
      enable = true;
      configuration = {
        server = {
          http_listen_port = cfg.port;
          grpc_listen_port = 0;
        };
        positions = {
          filename = "/var/lib/promtail/positions.yaml";
        };
        clients = [
          {
            url = cfg.clientUrl;
          }
        ];
        scrape_configs = [
          {
            job_name = "journal";
            journal = {
              path = "/var/log/journal";
              labels = {
                job = "promtail";
                host = config.networking.hostName;
                env = "proxmox";
                instance = "${config.networking.hostName}.lab";
              };
            };
            relabel_configs = [
              {
                source_labels = ["__journal__systemd_unit"];
                target_label = "unit";
              }
              {
                source_labels = ["__journal__hostname"];
                target_label = "host";
              }
              {
                source_labels = ["__journal__systemd_user_unit"];
                target_label = "user_unit";
              }
              {
                source_labels = ["__journal__transport"];
                target_label = "transport";
              }
              {
                source_labels = ["__journal_priority_keyword"];
                target_label = "severity";
              }
            ];
          }
        ];
      };
    };
  };
}
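A sketch of consuming this module from a node definition (the option names are declared above; the values shown are the module defaults):

    journalLog = {
      enable = true;
      port = 9080;
      clientUrl = "http://monitor.lab:3100/loki/api/v1/push";
    };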
40 machines/modules/node-exporter.nix Normal file

@@ -0,0 +1,40 @@
{
  lib,
  config,
  # name,
  # meta,
  ...
}:
with lib; let
  cfg = config.nodeExporter;
in {
  options.nodeExporter = {
    enable = mkOption {
      type = types.bool;
      default = false;
    };

    port = mkOption {
      type = types.number;
      default = 9100;
    };

    extraConfig = mkOption {
      type = types.attrs;
      default = {};
    };
  };

  config = mkIf cfg.enable {
    networking.firewall.allowedTCPPorts = [cfg.port];

    services.prometheus.exporters.node =
      {
        enable = true;
        enabledCollectors = ["systemd"];
        port = cfg.port;
        extraFlags = ["--collector.ethtool" "--collector.softirqs" "--collector.tcpstat" "--collector.wifi"];
      }
      // cfg.extraConfig;
  };
}
63 machines/monitor/alertmanager.nix Normal file

@@ -0,0 +1,63 @@
{
  config,
  pkgs,
  ...
}: let
  alertmanagerEnv = config.sops.secrets."alertmanager/env".path;
in {
  sops.secrets."alertmanager/env" = {
    sopsFile = ../../secrets/secrets.yaml;
    mode = "0440";
  };

  services.prometheus.alertmanager = {
    enable = true;
    openFirewall = true;
    environmentFile = alertmanagerEnv;

    webExternalUrl = "http://monitor.lab:9093"; # optional but helpful
    configuration = {
      route = {
        receiver = "null";
        group_by = ["alertname"];
        group_wait = "10s";
        group_interval = "5m";
        repeat_interval = "4h";

        routes = [
          {
            receiver = "telegram";
            matchers = [
              "severity =~ \"warning|critical\""
            ];
            group_wait = "10s";
            continue = true;
          }
        ];
      };

      receivers = [
        {name = "null";}
        {
          name = "telegram";
          telegram_configs = [
            {
              api_url = "https://api.telegram.org";
              bot_token = "$TELEGRAM_BOT_TOKEN";
              chat_id = -1002642560007;
              message_thread_id = 4;
              parse_mode = "HTML";
              send_resolved = true;
              message = "{{ template \"telegram.message\" . }}";
            }
          ];
        }
      ];

      templates = [
        (pkgs.writeText "telegram.tmpl" (builtins.readFile ./provisioning/templates/telegram.tmpl))
        # (pkgs.writeText "telegram.markdown.v2.tmpl" (builtins.readFile ./provisioning/templates/telegram.markdown.v2.tmpl))
      ];
    };
  };
}
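Because `bot_token` is read as `$TELEGRAM_BOT_TOKEN`, the sops-managed `environmentFile` must define that variable; the token itself never enters the Nix store. A sketch of the decrypted secret's shape (the value is a placeholder):

    # alertmanager/env
    TELEGRAM_BOT_TOKEN=123456789:AAE-hypothetical-token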
2353 machines/monitor/dashboards/15356_rev14.json Normal file
File diff suppressed because it is too large

1082 machines/monitor/dashboards/gitea.json Normal file
File diff suppressed because it is too large

692 machines/monitor/dashboards/grafana-traefik.json Normal file

@@ -0,0 +1,692 @@
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "datasource",
          "uid": "grafana"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "description": "Traefik dashboard prometheus",
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
  "id": 2,
  "links": [],
  "panels": [
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 0
      },
      "id": 10,
      "panels": [],
      "title": "$backend stats",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            }
          },
          "decimals": 0,
          "mappings": [],
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 1
      },
      "id": 2,
      "maxDataPoints": 3,
      "options": {
        "displayLabels": [],
        "legend": {
          "calcs": [],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true,
          "values": ["value", "percent"]
        },
        "pieType": "pie",
        "reduceOptions": {
          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
        "text": {},
        "tooltip": {
          "hideZeros": false,
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "exemplar": true,
          "expr": "traefik_service_requests_total{service=\"$service\"}",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{method}} : {{code}}",
          "refId": "A"
        }
      ],
      "title": "$service return code",
      "type": "piechart"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "fixedColor": "rgb(31, 120, 193)",
            "mode": "fixed"
          },
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "ms"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 1
      },
      "id": 4,
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "percentChangeColorMode": "standard",
        "reduceOptions": {
          "calcs": ["mean"],
          "fields": "",
          "values": false
        },
        "showPercentChange": false,
        "textMode": "auto",
        "wideLayout": true
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "exemplar": true,
          "expr": "sum(traefik_service_request_duration_seconds_sum{service=\"$service\"}) / sum(traefik_service_requests_total{service=\"$service\"}) * 1000",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "",
          "refId": "A"
        }
      ],
      "title": "$service response time",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "barWidthFactor": 0.6,
            "drawStyle": "bars",
            "fillOpacity": 100,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 24,
        "x": 0,
        "y": 8
      },
      "id": 3,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": ["mean", "max", "min"],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true
        },
        "tooltip": {
          "hideZeros": false,
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "exemplar": true,
          "expr": "sum(rate(traefik_service_requests_total{service=\"$service\"}[5m]))",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "Total requests $service",
          "refId": "A"
        }
      ],
      "title": "Total requests over 5min $service",
      "type": "timeseries"
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 15
      },
      "id": 12,
      "panels": [],
      "title": "Global stats",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "barWidthFactor": 0.6,
            "drawStyle": "bars",
            "fillOpacity": 100,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 16
      },
      "id": 5,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": ["lastNotNull", "max", "min"],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true
        },
        "tooltip": {
          "hideZeros": false,
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code=\"200\"}[5m])",
          "format": "time_series",
          "intervalFactor": 2,
          "legendFormat": "{{method}} : {{code}}",
          "refId": "A"
        }
      ],
      "title": "Status code 200 over 5min",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "barWidthFactor": 0.6,
            "drawStyle": "bars",
            "fillOpacity": 100,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 16
      },
      "id": 6,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": ["lastNotNull", "max", "min"],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true
        },
        "tooltip": {
          "hideZeros": false,
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code!=\"200\"}[5m])",
          "format": "time_series",
          "intervalFactor": 2,
          "legendFormat": "{{ method }} : {{code}}",
          "refId": "A"
        }
      ],
      "title": "Others status code over 5min",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            }
          },
          "decimals": 0,
          "mappings": [],
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 23
      },
      "id": 7,
      "maxDataPoints": 3,
      "options": {
        "displayLabels": [],
        "legend": {
          "calcs": [],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true,
          "values": ["value"]
        },
        "pieType": "pie",
        "reduceOptions": {
          "calcs": ["sum"],
          "fields": "",
          "values": false
        },
        "text": {},
        "tooltip": {
          "hideZeros": false,
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "exemplar": true,
          "expr": "sum(rate(traefik_service_requests_total[5m])) by (service) ",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ service }}",
          "refId": "A"
        }
      ],
      "title": "Requests by service",
      "type": "piechart"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            }
          },
          "decimals": 0,
          "mappings": [],
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 23
      },
      "id": 8,
      "maxDataPoints": 3,
      "options": {
        "displayLabels": [],
        "legend": {
          "calcs": [],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true,
          "values": ["value"]
        },
        "pieType": "pie",
        "reduceOptions": {
          "calcs": ["sum"],
          "fields": "",
          "values": false
        },
        "text": {},
        "tooltip": {
          "hideZeros": false,
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "exemplar": true,
          "expr": "sum(rate(traefik_entrypoint_requests_total{entrypoint =~ \"$entrypoint\"}[5m])) by (entrypoint) ",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ entrypoint }}",
          "refId": "A"
        }
      ],
      "title": "Requests by protocol",
      "type": "piechart"
    }
  ],
  "preload": false,
  "schemaVersion": 41,
  "tags": ["traefik", "prometheus"],
  "templating": {
    "list": [
      {
        "current": {},
        "datasource": "Prometheus",
        "definition": "label_values({job=\"traefik\"},service)",
        "includeAll": false,
        "name": "service",
        "options": [],
        "query": {
          "qryType": 1,
          "query": "label_values({job=\"traefik\"},service)",
          "refId": "PrometheusVariableQueryEditor-VariableQuery"
        },
        "refresh": 1,
        "regex": "",
        "type": "query"
      },
      {
        "current": {},
        "datasource": "Prometheus",
        "definition": "",
        "includeAll": true,
        "multi": true,
        "name": "entrypoint",
        "options": [],
        "query": {
          "query": "label_values(entrypoint)",
          "refId": "Prometheus-entrypoint-Variable-Query"
        },
        "refresh": 1,
        "regex": "",
        "type": "query"
      }
    ]
  },
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "timepicker": {},
  "timezone": "",
  "title": "Traefik",
  "uid": "qPdAviJmz",
  "version": 1
}
13554 machines/monitor/dashboards/node-exporter.json Normal file
File diff suppressed because it is too large

3096 machines/monitor/dashboards/postgres.json Normal file
File diff suppressed because it is too large

2043 machines/monitor/dashboards/promtail.json Normal file
File diff suppressed because it is too large

1087 machines/monitor/dashboards/traefik-access.json Normal file
File diff suppressed because it is too large

1619 machines/monitor/dashboards/traefik.json Normal file
File diff suppressed because it is too large

13 machines/monitor/definition.nix Normal file

@@ -0,0 +1,13 @@
{
  imports = [
    ./alertmanager.nix
    ./prometheus.nix
    ./influxdb.nix
    ./loki.nix
    ./grafana.nix

    ./jellyfin-exporter.nix
  ];

  system.stateVersion = "25.05";
}
126 machines/monitor/grafana.nix Normal file

@@ -0,0 +1,126 @@
{
  config,
  pkgs,
  modulesPath,
  lib,
  ...
}: {
  services.grafana.enable = true;
  services.grafana.settings = {
    server = {
      http_port = 3000;
      http_addr = "0.0.0.0";
      # Grafana needs to know on which domain and URL it's running
      domain = "grafana.procopius.dk";
      root_url = "https://grafana.procopius.dk"; # Not needed if it is `https://your.domain/`
      # serve_from_sub_path = true;
      oauth_auto_login = false;
    };
    "auth.generic_oauth" = {
      enabled = false;
    };
    "auth" = {
      disable_login_form = false;
    };
  };

  networking.firewall.allowedTCPPorts = [3000];

  services.grafana = {
    # declarativePlugins = with pkgs.grafanaPlugins; [ ... ];

    provision = {
      enable = true;

      datasources.settings.datasources = [
        # "Built-in" datasources can be provisioned - c.f. https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources
        {
          uid = "prometheus";
          name = "Prometheus";
          type = "prometheus";
          url = "http://127.0.0.1:${toString config.services.prometheus.port}";
        }
        {
          uid = "loki";
          name = "Loki";
          type = "loki";
          url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}";
        }
        {
          uid = "influxdb";
          name = "InfluxDB";
          type = "influxdb";
          url = "http://127.0.0.1:8086";
          access = "proxy";
          jsonData = {
            dbName = "proxmox";
            httpHeaderName1 = "Authorization";
          };
          secureJsonData = {
            httpHeaderValue1 = "Token iY4MTuqUAVJbBkDUiMde";
          };
        }
      ];

      # Note: removing attributes from the above `datasources.settings.datasources` is not enough for them to be deleted on `grafana`;
      # one needs to use the following option:
      # datasources.settings.deleteDatasources = [ { name = "prometheus"; orgId = 1; } { name = "loki"; orgId = 1; } ];

      dashboards.settings.providers = [
        {
          name = "my dashboards";
          options.path = "/etc/grafana-dashboards";
        }
      ];
    };
  };

  environment.etc."grafana-dashboards/traefik.json" = {
    source = ./dashboards/traefik.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };

  environment.etc."grafana-dashboards/traefik-access.json" = {
    source = ./dashboards/traefik-access.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };

  environment.etc."grafana-dashboards/grafana-traefik.json" = {
    source = ./dashboards/grafana-traefik.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };

  environment.etc."grafana-dashboards/node-exporter.json" = {
    source = ./dashboards/node-exporter.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };

  environment.etc."grafana-dashboards/promtail.json" = {
    source = ./dashboards/promtail.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };

  environment.etc."grafana-dashboards/gitea.json" = {
    source = ./dashboards/gitea.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };

  environment.etc."grafana-dashboards/postgres.json" = {
    source = ./dashboards/postgres.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };
}
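The seven `environment.etc` blocks above differ only in the dashboard name, so they could be generated in one expression — a sketch, not part of this commit, over the same file list:

    environment.etc = lib.listToAttrs (map (name: {
      name = "grafana-dashboards/${name}.json";
      value = {
        source = ./dashboards + "/${name}.json";
        user = "grafana";
        group = "grafana";
        mode = "0644";
      };
    }) ["traefik" "traefik-access" "grafana-traefik" "node-exporter" "promtail" "gitea" "postgres"]);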
35 machines/monitor/influxdb.nix Normal file

@@ -0,0 +1,35 @@
{
  config,
  pkgs,
  ...
}: let
  influxdbPassword = config.sops.secrets."influxdb/password".path;
  influxdbToken = config.sops.secrets."influxdb/token".path;
in {
  sops.secrets."influxdb/password" = {
    sopsFile = ../../secrets/secrets.yaml;
    owner = "influxdb2";
  };
  sops.secrets."influxdb/token" = {
    sopsFile = ../../secrets/secrets.yaml;
    owner = "influxdb2";
  };

  networking.firewall.allowedTCPPorts = [8086];

  services.influxdb2 = {
    enable = true;
    settings = {
    };
    provision = {
      enable = true;
      initialSetup = {
        username = "plasmagoat";
        passwordFile = influxdbPassword;
        tokenFile = influxdbToken;
        organization = "procopius";
        bucket = "proxmox";
      };
    };
  };
}
14 machines/monitor/jellyfin-exporter.nix Normal file

@@ -0,0 +1,14 @@
{
  virtualisation.oci-containers.containers = {
    jellyfin_exporter = {
      image = "rebelcore/jellyfin-exporter:latest";
      ports = [
        "9594:9594"
      ];
      cmd = [
        "--jellyfin.address=http://media.lab:8096"
        "--jellyfin.token=f7c89e5aa307434c9b3ecb329e896335"
      ];
    };
  };
}
37 machines/monitor/loki.nix Normal file

@@ -0,0 +1,37 @@
{
  networking.firewall.allowedTCPPorts = [ 3100 ];

  services.loki = {
    enable = true;
    configuration = {
      server.http_listen_port = 3100;
      auth_enabled = false;
      analytics.reporting_enabled = false;

      common = {
        ring = {
          instance_addr = "127.0.0.1";
          kvstore.store = "inmemory";
        };
        replication_factor = 1;
        path_prefix = "/tmp/loki";
      };

      schema_config = {
        configs = [
          {
            from = "2020-05-15";
            store = "tsdb";
            object_store = "filesystem";
            schema = "v13";
            index = {
              prefix = "index_";
              period = "24h";
            };
          }
        ];
      };
      storage_config.filesystem.directory = "/var/lib/loki/chunk";
    };
  };
}
185 machines/monitor/prometheus.nix Normal file

@@ -0,0 +1,185 @@
{
  config,
  pkgs,
  modulesPath,
  lib,
  ...
}: let
  monitor_hostname = "monitor.lab";
  traefik_hostname = "traefik.lab";
  sandbox_hostname = "sandbox.lab";
  forgejo_hostname = "forgejo.lab";
  runner01_hostname = "forgejo-runner-01.lab";
  dnsmasq_hostname = "dns.lab";
  media_hostname = "media.lab";
  mail_hostname = "mail.lab";
  keycloak_hostname = "keycloak.lab";

  monitored_hosts = [
    monitor_hostname
    traefik_hostname
    sandbox_hostname
    forgejo_hostname
    runner01_hostname
    dnsmasq_hostname
    media_hostname
    mail_hostname
    keycloak_hostname
  ];

  # integrate colmena names and targetHost to generate node exporters
  generateTargets = port:
    map (host: "${host}:${toString port}") monitored_hosts;

  instance_relabel_config = [
    {
      source_labels = ["__address__"];
      regex = "([^:]+):\\d+"; # Captures everything before the last colon
      target_label = "instance";
      replacement = "$1";
    }
  ];

  node_exporter_port = 9100;
  node_exporter_job = {
    job_name = "node";
    static_configs = [{targets = generateTargets node_exporter_port;}];
    relabel_configs = instance_relabel_config;
  };

  promtail_port = 9080;
  promtail_job = {
    job_name = "promtail";
    static_configs = [{targets = generateTargets promtail_port;}];
    relabel_configs = instance_relabel_config;
  };

  prometheus_target = "${monitor_hostname}:9090";
  prometheus_job = {
    job_name = "prometheus";
    static_configs = [{targets = [prometheus_target];}];
    relabel_configs = instance_relabel_config;
  };

  alertmanager_target = "${monitor_hostname}:9093";
  alertmanager_job = {
    job_name = "alertmanager";
    static_configs = [{targets = [alertmanager_target];}];
    relabel_configs = instance_relabel_config;
  };

  grafana_target = "${monitor_hostname}:3000";
  grafana_job = {
    job_name = "grafana";
    static_configs = [{targets = [grafana_target];}];
    relabel_configs = instance_relabel_config;
  };

  traefik_monitor_port = 8082;
  traefik_job = {
    job_name = "traefik";
    static_configs = [{targets = ["${traefik_hostname}:${toString traefik_monitor_port}"];}];
    relabel_configs = instance_relabel_config;
  };

  forgejo_monitor_port = 3000;
  forgejo_job = {
    job_name = "forgejo";
    static_configs = [{targets = ["${forgejo_hostname}:${toString forgejo_monitor_port}"];}];
    relabel_configs = instance_relabel_config;
  };

  postgres_exporter_port = 9187;
  postgres_job = {
    job_name = "postgres";
    static_configs = [{targets = ["${forgejo_hostname}:${toString postgres_exporter_port}"];}];
    relabel_configs = instance_relabel_config;
  };

  dnsmasq_exporter_port = 9153;
  dnsmasq_job = {
    job_name = "dnsmasq";
    static_configs = [{targets = ["${dnsmasq_hostname}:${toString dnsmasq_exporter_port}"];}];
    relabel_configs = instance_relabel_config;
  };

  # --- Media Stack Scrape Job ---
  media_stack_job = {
    job_name = "media_stack";
    static_configs = [
      {
        targets = [
          "${media_hostname}:9707" # sonarr
          "${media_hostname}:9708" # readarr
          "${media_hostname}:9709" # radarr
          "${media_hostname}:9710" # prowlarr
          "${media_hostname}:9711" # lidarr
          "${media_hostname}:9712" # bazarr
        ];
      }
    ];
    relabel_configs = instance_relabel_config;
  };

  jellyfin_port = 8096;
  jellyfin_exporter_port = 9594;
  jellyfin_job = {
    job_name = "jellyfin";
    static_configs = [
      {
        targets = [
          "${media_hostname}:${toString jellyfin_port}"
          "${monitor_hostname}:${toString jellyfin_exporter_port}"
        ];
      }
    ];
    relabel_configs = instance_relabel_config;
  };
in {
  networking.firewall.allowedTCPPorts = [9090];

  services.prometheus = {
    enable = true;
    retentionTime = "7d";
    globalConfig = {
      scrape_timeout = "10s";
      scrape_interval = "30s";
      # A short evaluation_interval will check alerting rules very often.
      # It can be costly if you run Prometheus with 100+ alerts.
      evaluation_interval = "20s";
    };
    extraFlags = [
      "--web.enable-admin-api"
    ];

    scrapeConfigs = [
      node_exporter_job
      promtail_job
      prometheus_job
      alertmanager_job
      grafana_job
      traefik_job
      forgejo_job
      postgres_job
      dnsmasq_job
      media_stack_job
      jellyfin_job
    ];

    alertmanagers = [
      {
        scheme = "http";
        static_configs = [{targets = [alertmanager_target];}];
      }
    ];

    ruleFiles = [
      (pkgs.writeText "prometheus-alerts.yml" (builtins.readFile ./provisioning/alerts/prometheus-alerts.yml))
      (pkgs.writeText "loki-alerts.yml" (builtins.readFile ./provisioning/alerts/loki-alerts.yml))
      (pkgs.writeText "promtail-alerts.yml" (builtins.readFile ./provisioning/alerts/promtail-alerts.yml))
      (pkgs.writeText "postgres-alerts.yml" (builtins.readFile ./provisioning/alerts/postgres-alerts.yml))
      (pkgs.writeText "traefik-alerts.yml" (builtins.readFile ./provisioning/alerts/traefik-alerts.yml))
      (pkgs.writeText "node-exporter-alerts.yml" (builtins.readFile ./provisioning/alerts/node-exporter-alerts.yml))
    ];
  };
}
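For clarity on `generateTargets`: it appends the given port to every entry of `monitored_hosts`, so the node job above scrapes exactly this list (derived directly from the definitions in the file):

    [ "monitor.lab:9100" "traefik.lab:9100" "sandbox.lab:9100"
      "forgejo.lab:9100" "forgejo-runner-01.lab:9100" "dns.lab:9100"
      "media.lab:9100" "mail.lab:9100" "keycloak.lab:9100" ]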
39 machines/monitor/provisioning/alerts/loki-alerts.yml Normal file

@@ -0,0 +1,39 @@
groups:
  - name: Loki

    rules:
      - alert: LokiProcessTooManyRestarts
        expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Loki process too many restarts (instance {{ $labels.instance }})
          description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: LokiRequestErrors
        expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: Loki request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: LokiRequestPanic
        expr: "sum(increase(loki_panic_total[10m])) by (namespace, job) > 0"
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request panic (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: LokiRequestLatency
        expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
318 machines/monitor/provisioning/alerts/node-exporter-alerts.yml Normal file

@@ -0,0 +1,318 @@
groups:
  - name: NodeExporter

    rules:
      - alert: HostOutOfMemory
        expr: "(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory (instance {{ $labels.instance }})
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostMemoryUnderMemoryPressure
        expr: "(rate(node_vmstat_pgmajfault[5m]) > 1000)"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostMemoryIsUnderutilized
        expr: "min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8"
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host Memory is underutilized (instance {{ $labels.instance }})
          description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualNetworkThroughputIn
        expr: "((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualNetworkThroughputOut
        expr: "((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
          description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskReadRate
        expr: "(rate(node_disk_io_time_seconds_total[5m]) > .80)"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
          description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostOutOfDiskSpace
        expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostDiskMayFillIn24Hours
        expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostOutOfInodes
        expr: "(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)"
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host out of inodes (instance {{ $labels.instance }})
          description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostFilesystemDeviceError
        expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host filesystem device error (instance {{ $labels.instance }})
          description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostInodesMayFillIn24Hours
        expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadLatency
|
||||
expr: "(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteLatency
|
||||
expr: "(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostHighCpuLoad
|
||||
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuIsUnderutilized
|
||||
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.95'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuHighIowait
|
||||
expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
||||
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskIo
|
||||
expr: "rate(node_disk_io_time_seconds_total[5m]) > 0.8"
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostContextSwitchingHigh
|
||||
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host context switching high (instance {{ $labels.instance }})
|
||||
description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: "((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSystemdServiceCrashed
|
||||
expr: '(node_systemd_unit_state{state="failed"} == 1)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host systemd service crashed (instance {{ $labels.instance }})
|
||||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: "node_hwmon_temp_celsius > node_hwmon_temp_max_celsius"
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: "((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSoftwareRaidInsufficientDrives
|
||||
expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
|
||||
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSoftwareRaidDiskFailure
|
||||
expr: '(node_md_disks{state="failed"} > 0)'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host software RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostKernelVersionDeviations
|
||||
expr: "changes(node_uname_info[1h]) > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host kernel version deviations (instance {{ $labels.instance }})
|
||||
description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: "(increase(node_vmstat_oom_kill[1m]) > 0)"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacCorrectableErrorsDetected
|
||||
expr: "(increase(node_edac_correctable_errors_total[1m]) > 0)"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacUncorrectableErrorsDetected
|
||||
expr: "(node_edac_uncorrectable_errors_total > 0)"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: "(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Receive Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: "(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkBondDegraded
|
||||
expr: "((node_bonding_active - node_bonding_slaves) != 0)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostConntrackLimit
|
||||
expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)"
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockSkew
|
||||
expr: "((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock skew (instance {{ $labels.instance }})
|
||||
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: "(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
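Several of these rules (HostSystemdServiceCrashed, the hwmon temperature alerts, the MD RAID alerts) depend on specific node_exporter collectors being available on every target host. A hedged sketch of the per-host exporter configuration; the systemd collector is not in node_exporter's default set:

# Sketch: node_exporter config each target host would need for the
# rules above; the systemd collector is not enabled by default.
{
  services.prometheus.exporters.node = {
    enable = true;
    enabledCollectors = [ "systemd" ];  # exposes node_systemd_unit_state
  };
}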
201
machines/monitor/provisioning/alerts/postgres-alerts.yml
Normal file
@ -0,0 +1,201 @@
groups:
  - name: Postgres

    rules:
      - alert: PostgresqlDown
        expr: "pg_up == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql down (instance {{ $labels.instance }})
          description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlRestarted
        expr: "time() - pg_postmaster_start_time_seconds < 60"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql restarted (instance {{ $labels.instance }})
          description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlExporterError
        expr: "pg_exporter_last_scrape_error > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql exporter error (instance {{ $labels.instance }})
          description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTableNotAutoVacuumed
        expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
          description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTableNotAutoAnalyzed
        expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
          description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyConnections
        expr: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql too many connections (instance {{ $labels.instance }})
          description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlNotEnoughConnections
        expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 2'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql not enough connections (instance {{ $labels.instance }})
          description: "PostgreSQL instance should have more connections (> 2)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlDeadLocks
        expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql dead locks (instance {{ $labels.instance }})
          description: "PostgreSQL has dead locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRollbackRate
        expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql high rollback rate (instance {{ $labels.instance }})
          description: "The ratio of aborted to committed transactions is > 2%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlCommitRateLow
        expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Postgresql commit rate low (instance {{ $labels.instance }})
          description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlLowXidConsumption
        expr: "rate(pg_txid_current[1m]) < 5"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql low XID consumption (instance {{ $labels.instance }})
          description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRateStatementTimeout
        expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
          description: "Postgres transactions are showing a high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRateDeadlock
        expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
          description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlUnusedReplicationSlot
        expr: "pg_replication_slots_active == 0"
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Postgresql unused replication slot (instance {{ $labels.instance }})
          description: "Unused replication slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyDeadTuples
        expr: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
          description: "The PostgreSQL dead tuple ratio is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlConfigurationChanged
        expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Postgresql configuration changed (instance {{ $labels.instance }})
          description: "Postgres database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlSslCompressionActive
        expr: "sum(pg_stat_ssl_compression) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql SSL compression active (instance {{ $labels.instance }})
          description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyLocksAcquired
        expr: "((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20"
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
          description: "Too many locks acquired on the database. If this alert happens frequently, you may need to increase the Postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlBloatIndexHigh
        expr: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
          description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlBloatTableHigh
        expr: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
          description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlInvalidIndex
        expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: Postgresql invalid index (instance {{ $labels.instance }})
          description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlReplicationLag
        expr: "pg_replication_lag_seconds > 5"
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: Postgresql replication lag (instance {{ $labels.instance }})
          description: "PostgreSQL replication lag is high (> 5s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
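Note that pg_bloat_*, pg_replication_lag_seconds and postgresql_errors_total are not default postgres_exporter metrics; they require custom queries on the exporter side. A minimal sketch of the baseline exporter on the database host, with the custom-query part left as an assumption:

# Sketch: baseline postgres_exporter; the pg_bloat_* and
# postgresql_errors_total series used above additionally require
# custom queries (not shown, deployment-specific).
{
  services.prometheus.exporters.postgres = {
    enable = true;
    runAsLocalSuperUser = true;  # lets the exporter reach all databases
  };
}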
255
machines/monitor/provisioning/alerts/prometheus-alerts.yml
Normal file
@ -0,0 +1,255 @@
groups:
  - name: Prometheus

    rules:
      - alert: PrometheusJobMissing
        expr: 'absent(up{job="prometheus"})'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus job missing (instance {{ $labels.instance }})
          description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetMissing
        expr: "up == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing (instance {{ $labels.instance }})
          description: "A Prometheus target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAllTargetsMissing
        expr: "sum by (job) (up) == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus all targets missing (instance {{ $labels.instance }})
          description: "A Prometheus job no longer has any living targets.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetMissingWithWarmupTime
        expr: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
          description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusConfigurationReloadFailure
        expr: "prometheus_config_last_reload_successful != 1"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
          description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTooManyRestarts
        expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus too many restarts (instance {{ $labels.instance }})
          description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerJobMissing
        expr: 'absent(up{job="alertmanager"})'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
          description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerConfigurationReloadFailure
        expr: "alertmanager_config_last_reload_successful != 1"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
          description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerConfigNotSynced
        expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
          description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerE2eDeadManSwitch
        expr: "vector(1)"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
          description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusNotConnectedToAlertmanager
        expr: "prometheus_notifications_alertmanagers_discovered < 1"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
          description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusRuleEvaluationFailures
        expr: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTemplateTextExpansionFailures
        expr: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusRuleEvaluationSlow
        expr: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
          description: "Prometheus rule evaluation took more time than the scheduled interval. This indicates slow storage backend access or an overly complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusNotificationsBacklog
        expr: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus notifications backlog (instance {{ $labels.instance }})
          description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerNotificationFailing
        expr: "rate(alertmanager_notifications_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
          description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetEmpty
        expr: "prometheus_sd_discovered_targets == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target empty (instance {{ $labels.instance }})
          description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetScrapingSlow
        expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
          description: "Prometheus is scraping exporters more slowly than the requested interval; the server may be under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusLargeScrape
        expr: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus large scrape (instance {{ $labels.instance }})
          description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetScrapeDuplicate
        expr: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
          description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCheckpointCreationFailures
        expr: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCheckpointDeletionFailures
        expr: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCompactionsFailed
        expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB compaction failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbHeadTruncationsFailed
        expr: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbReloadFailures
        expr: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbWalCorruptions
        expr: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbWalTruncationsFailed
        expr: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTimeseriesCardinality
        expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
          description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
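PrometheusAlertmanagerE2eDeadManSwitch fires permanently by design, so it only makes sense if Alertmanager routes it to a dedicated heartbeat receiver instead of the normal notification channel. A hedged sketch of that route; the receiver name is a placeholder and its actual endpoint is not shown:

# Sketch: route the always-firing dead man's switch to a heartbeat
# receiver ("deadman" is a placeholder; its webhook is not shown).
{
  services.prometheus.alertmanager.configuration.route.routes = [
    {
      receiver = "deadman";
      matchers = [ ''alertname="PrometheusAlertmanagerE2eDeadManSwitch"'' ];
      repeat_interval = "1m";
    }
  ];
}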
21
machines/monitor/provisioning/alerts/promtail-alerts.yml
Normal file
@ -0,0 +1,21 @@
groups:
  - name: Promtail

    rules:
      - alert: PromtailRequestErrors
        expr: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Promtail request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PromtailRequestLatency
        expr: "histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1"
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Promtail request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
30
machines/monitor/provisioning/alerts/traefik-alerts.yml
Normal file
@ -0,0 +1,30 @@
groups:
  - name: Traefik

    rules:
      - alert: TraefikServiceDown
        expr: "count(traefik_service_server_up) by (service) == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Traefik service down (instance {{ $labels.instance }})
          description: "All Traefik services are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: TraefikHighHttp4xxErrorRateService
        expr: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Traefik high HTTP 4xx error rate service (instance {{ $labels.instance }})
          description: "Traefik service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: TraefikHighHttp5xxErrorRateService
        expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
          description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
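All six rule files above can be checked before deployment. A sketch of a build-time validation derivation using promtool; the derivation name and the relative path are assumptions about where this expression would live:

# Sketch: fail the build if any provisioned rule file is invalid.
{ pkgs, ... }:
pkgs.runCommand "check-alert-rules"
  { nativeBuildInputs = [ pkgs.prometheus ]; } ''
  # promtool ships in the prometheus package; check every rule file.
  promtool check rules ${./machines/monitor/provisioning/alerts}/*.yml
  touch $out
''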
29
machines/monitor/provisioning/templates/telegram.tmpl
Normal file
@ -0,0 +1,29 @@
{{ define "alert_list" }}{{ range . }}
---
🪪 <b>{{ .Labels.alertname | html }}</b>
{{- if eq .Labels.severity "critical" }}
🚨 CRITICAL 🚨 {{ end }}
{{- if eq .Labels.severity "warning" }}
⚠️ WARNING ⚠️{{ end }}
{{- if .Annotations.summary }}
📝 {{ .Annotations.summary | html }}{{ end }}
{{- if .Annotations.description }}
📖 {{ .Annotations.description | html }}{{ end }}

🏷 Labels:
{{ range .Labels.SortedPairs }} <i>{{ .Name | html }}</i>: <code>{{ .Value | html }}</code>
{{ end }}{{ end }}
🛠 <a href="https://grafana.procopius.dk">Grafana</a>
💊 <a href="https://alertmanager.procopius.dk">Alertmanager</a>
{{ end }}

{{ define "telegram.message" }}
{{ if gt (len .Alerts.Firing) 0 }}
🔥 Alerts Firing 🔥
{{ template "alert_list" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
✅ Alerts Resolved ✅
{{ template "alert_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
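The template defines telegram.message, which Alertmanager only uses if a Telegram receiver references it. A minimal sketch of that wiring, assuming the NixOS Alertmanager module; the chat ID is a placeholder and the bot token is expected from the sops-managed secret added below in secrets/secrets.yaml:

# Sketch: Telegram receiver using the template above. parse_mode must
# be HTML to match the <b>/<i>/<code> markup; chat_id is a placeholder,
# and bot_token_file assumes a reasonably recent Alertmanager.
{
  services.prometheus.alertmanager.configuration = {
    templates = [ "/etc/alertmanager/telegram.tmpl" ];
    receivers = [{
      name = "telegram";
      telegram_configs = [{
        bot_token_file = "/run/secrets/telegram-alert-bot-token";
        chat_id = -1001234567890;  # placeholder
        parse_mode = "HTML";
        message = ''{{ template "telegram.message" . }}'';
      }];
    }];
  };
}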
3
machines/sandbox/definition.nix
Normal file
@ -0,0 +1,3 @@
{
  system.stateVersion = "25.05";
}
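The sandbox machine is only a stateVersion stub so far; to be deployable it still has to appear as a node in the hive defined in nixos/flake.nix. A hedged sketch, mirroring the host-b example there (the hostname and user are assumptions):

# Sketch: registering the sandbox machine as a colmena node.
sandbox = { name, nodes, ... }: {
  imports = [ ../machines/sandbox/definition.nix ];
  deployment = {
    targetHost = "sandbox.lab";  # assumption
    targetUser = "root";         # assumption
  };
};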
113
nixos/flake.lock
generated
@ -1,8 +1,82 @@
{
  "nodes": {
    "colmena": {
      "inputs": {
        "flake-compat": "flake-compat",
        "flake-utils": "flake-utils",
        "nix-github-actions": "nix-github-actions",
        "nixpkgs": "nixpkgs",
        "stable": "stable"
      },
      "locked": {
        "lastModified": 1751144689,
        "narHash": "sha256-cgIntaqhcm62V1KU6GmrAGpHpahT4UExEWW2ryS02ZU=",
        "owner": "zhaofengli",
        "repo": "colmena",
        "rev": "3ceec72cfb396a8a8de5fe96a9d75a9ce88cc18e",
        "type": "github"
      },
      "original": {
        "owner": "zhaofengli",
        "repo": "colmena",
        "type": "github"
      }
    },
    "flake-compat": {
      "flake": false,
      "locked": {
        "lastModified": 1650374568,
        "narHash": "sha256-Z+s0J8/r907g149rllvwhb4pKi8Wam5ij0st8PwAh+E=",
        "owner": "edolstra",
        "repo": "flake-compat",
        "rev": "b4a34015c698c7793d592d66adbab377907a2be8",
        "type": "github"
      },
      "original": {
        "owner": "edolstra",
        "repo": "flake-compat",
        "type": "github"
      }
    },
    "flake-utils": {
      "locked": {
        "lastModified": 1659877975,
        "narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "nix-github-actions": {
      "inputs": {
        "nixpkgs": [
          "colmena",
          "nixpkgs"
        ]
      },
      "locked": {
        "lastModified": 1729742964,
        "narHash": "sha256-B4mzTcQ0FZHdpeWcpDYPERtyjJd/NIuaQ9+BV1h+MpA=",
        "owner": "nix-community",
        "repo": "nix-github-actions",
        "rev": "e04df33f62cdcf93d73e9a04142464753a16db67",
        "type": "github"
      },
      "original": {
        "owner": "nix-community",
        "repo": "nix-github-actions",
        "type": "github"
      }
    },
    "nixarr": {
      "inputs": {
-       "nixpkgs": "nixpkgs",
+       "nixpkgs": "nixpkgs_2",
        "vpnconfinement": "vpnconfinement",
        "website-builder": "website-builder"
      },

@ -21,6 +95,22 @@
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1750134718,
        "narHash": "sha256-v263g4GbxXv87hMXMCpjkIxd/viIF7p3JpJrwgKdNiI=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "9e83b64f727c88a7711a2c463a7b16eedb69a84c",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs_2": {
      "locked": {
        "lastModified": 1748662220,
        "narHash": "sha256-7gGa49iB9nCnFk4h/g9zwjlQAyjtpgcFkODjcOQS0Es=",

@ -36,7 +126,7 @@
        "type": "github"
      }
    },
-   "nixpkgs_2": {
+   "nixpkgs_3": {
      "locked": {
        "lastModified": 1748809735,
        "narHash": "sha256-UR5vKj8rwKQmE8wxKFHgoJKbod05DMoH5phTje4L1l8=",

@ -53,8 +143,9 @@
    },
    "root": {
      "inputs": {
+       "colmena": "colmena",
        "nixarr": "nixarr",
-       "nixpkgs": "nixpkgs_2",
+       "nixpkgs": "nixpkgs_3",
        "sops-nix": "sops-nix"
      }
    },

@ -78,6 +169,22 @@
        "type": "github"
      }
    },
    "stable": {
      "locked": {
        "lastModified": 1750133334,
        "narHash": "sha256-urV51uWH7fVnhIvsZIELIYalMYsyr2FCalvlRTzqWRw=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "36ab78dab7da2e4e27911007033713bab534187b",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-25.05",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "vpnconfinement": {
      "locked": {
        "lastModified": 1743810720,

nixos/flake.nix
@ -3,6 +3,7 @@

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs";
+   colmena.url = "github:zhaofengli/colmena";
    sops-nix = {
      url = "github:Mic92/sops-nix";
      inputs.nixpkgs.follows = "nixpkgs";

@ -14,7 +15,11 @@
  #   };
  };

- outputs = inputs @ {...}: let
+ outputs = inputs @ {
+   nixpkgs,
+   colmena,
+   ...
+ }: let
    system = "x86_64-linux";

    liveVMs = {

@ -102,5 +107,32 @@
    };
  in {
    nixosConfigurations = liveVMs;

    colmenaHive = colmena.lib.makeHive {
      meta = {
        nixpkgs = import nixpkgs {
          system = "x86_64-linux";
          overlays = [];
        };
      };

      defaults = {pkgs, ...}: {
      };

      host-b = {
        name,
        nodes,
        pkgs,
        ...
      }: {
        deployment = {
          targetHost = "somehost.tld";
          targetPort = 1234;
          targetUser = "luser";
        };
        boot.isContainer = true;
        time.timeZone = "America/Los_Angeles";
      };
    };
  };
}
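The empty defaults block is where configuration shared by every hive node would go. One plausible use, given the sops-nix input already declared above, is importing its module hive-wide; a sketch, not part of this commit:

# Sketch: shared per-node configuration via the hive's defaults block.
defaults = { pkgs, ... }: {
  imports = [ inputs.sops-nix.nixosModules.sops ];
  environment.systemPackages = [ pkgs.git ];
};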
21
secrets/secrets.yaml
Normal file
@ -0,0 +1,21 @@
telegram-alert-bot-token: ENC[AES256_GCM,data:7Bvhtrkaqc06xmSeOsw730cAfHAw4qBvz1ontPXO/6j4Hy6AAKbb8LYGIONVGA==,iv:2xdhmAM2anEH7flV72BlfVeXjStu6sEUqT97PW+dY2w=,tag:h12zGj8J0ftKuGuKZuCEmw==,type:str]
alertmanager:
  env: ENC[AES256_GCM,data:lMZVLGY4JNeEa1OhiQsAyBqArDttpMjAILjtFQfi7933RfckJMike9cWOV8pSVMJjKUBCtmnir/KDhbyYCZ3oYQh,iv:dLGqXsvJ8x32bqtcaq66O85HcbF/I78HSmo3o/Sx76o=,tag:SHv07/J8O+JPkpTu5rRCzA==,type:str]
influxdb:
  password: ENC[AES256_GCM,data:OP+4vK6ulZs7jVM4lgnpUatr+Qs=,iv:MEmD6yyy+Z7beVOdR1xNDn0c27DYDIDTYdnaNiaVHks=,tag:dyG7VPPV40JqSE4UAeVbtQ==,type:str]
  token: ENC[AES256_GCM,data:QraVWLW1uCSF0YvbkHCKYtPvqs0=,iv:pzkfEyLksjRFVj7wZS8LxO0idQTpEk7OTMpQSsuIRvQ=,tag:d6U6vMqEYbu3CaTpnc0gGw==,type:str]
sops:
  age:
    - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
      enc: |
        -----BEGIN AGE ENCRYPTED FILE-----
        YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBVa29TZnF4K2dtcEc2N1Qy
        cmVTeWJpZXlYRWV6SG84M2NjVHhiRER4TFdFCjNCb1owUU5lL1BDOGt4ZXhad242
        cFI2YWd0SnBCV2RXVUNlY2dKYWpUK1kKLS0tIHh6UmRkdlJqeWZHaTFYQ0M4L2xo
        QzNYRk5ERmR4aGtLQ3dwQ1lPeDZyaEkKJMLXqv6tBBql7VVnWDIwAh24SfQ2O6Ca
        CEOQTGEonbqr5doWqTsXUXrdQAS0amL45UdT6ITFtfNAjaHwCMfhZg==
        -----END AGE ENCRYPTED FILE-----
  lastmodified: "2025-07-06T17:10:59Z"
  mac: ENC[AES256_GCM,data:dXLWT5fmSs2ddpFPXA1yOtwaej7b3lPesFxN7aEZ/bV6YRr+Ht5dHFQcXO0TfJArzhFRRtAumdcdVsorMkR4tao4XCcimACcWrZgXlXGM6XgT3hdPJ4006QLePXU+uyzpqyEuOouaxF7fyuSTL68uDr+E/NAHgmP2dnqpWnebpY=,iv:oUkmH/ngp8wvbuXay+2X6YBqhesNdtOPZOV4lvsc/s4=,tag:GErA9zgdkTarUD6fWiMupg==,type:str]
  unencrypted_suffix: _unencrypted
  version: 3.10.2
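These entries stay encrypted in the repository; nodes decrypt them at activation time via sops-nix. A minimal sketch of how the monitor host could consume them (the option names are real sops-nix options; the relative path is an assumption about module location):

# Sketch: consuming the sops secrets on a node; decrypted files land
# under /run/secrets/<name> by default.
{
  sops.defaultSopsFile = ../../secrets/secrets.yaml;
  sops.secrets."telegram-alert-bot-token" = { };
  sops.secrets."alertmanager/env" = { };  # e.g. an EnvironmentFile for Alertmanager
}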