colmena initial implementation for sandbox and monitor

plasmagoat 2025-07-06 21:25:57 +02:00
parent a90630ecb6
commit 5feb74d56d
40 changed files with 27629 additions and 141 deletions

@@ -1,61 +0,0 @@
name: "Build NixOS Image"
on:
workflow_dispatch:
jobs:
build:
runs-on: native
steps:
- name: Install nodejs
run: nix-env -iA nixpkgs.nodejs
- name: Checkout repo
uses: actions/checkout@v4
# - name: Install Nix
# uses: cachix/install-nix-action@v31
# with:
# nix_path: nixpkgs=channel:nixos-unstable
# extra_nix_config: |
# experimental-features = nix-command flakes
# - name: Enable experimental features
# run: |
# mkdir -p ~/.config/nix
# echo "experimental-features = nix-command flakes" >> ~/.config/nix/nix.conf
# - name: Update Channel
# run: nix-channel --update
- name: Build NixOS image
working-directory: nixos
run: nix build .#proxmoxTemplate
# - name: Upload & Restore to Proxmox
# working-directory: nixos
# env:
# PROXMOX_SSH_KEY: ${{ secrets.PROXMOX_SSH_KEY }}
# PROXMOX_HOST: 192.168.1.205
# PROXMOX_USER: root
# run: |
# set -e
# IMAGE_NAME="vm-image.vma.zst"
# REMOTE_PATH="/var/lib/vz/template/$IMAGE_NAME"
# VM_ID="9000"
# echo "Starting ssh-agent and uploading..."
# eval "$(ssh-agent -s)"
# ssh-add <(echo "$PROXMOX_SSH_KEY")
# echo "Uploading image..."
# scp -o StrictHostKeyChecking=no ./result/$IMAGE_NAME $PROXMOX_USER@$PROXMOX_HOST:$REMOTE_PATH
# echo "Restoring VM $VM_ID..."
# ssh -o StrictHostKeyChecking=no $PROXMOX_USER@$PROXMOX_HOST "
# qm stop $VM_ID || true
# qm destroy $VM_ID || true
# qmrestore --unique $REMOTE_PATH $VM_ID
# qm template $VM_ID
# "

@@ -0,0 +1,53 @@
name: "Colmena apply"
on:
push:
tags:
- "v*" # triggers on v1.0.0, v1.2.3, etc.
workflow_dispatch:
jobs:
apply:
name: Apply flake configurations to colmena hive
# The 'nixos-latest' runner appears to already provide Docker, an SSH client, and basic Nix tooling.
runs-on: nixos-latest
env:
NIXOS_BUILDER_HOST: nixos-builder.lab
NIXOS_BUILDER_USER: runner
steps:
# Use nix-env for setup; it works well in ephemeral runner environments
- name: Install dependencies via nix-env
run: |
nix-env -iA nixpkgs.nodejs
nix-env -iA nixpkgs.openssh
nix-env -if https://github.com/zhaofengli/colmena/tarball/main
nix-env -iA cachix -f https://cachix.org/api/v1/install
cachix use plasmagoat
cachix authtoken ${{ secrets.CACHIX_AUTH_TOKEN }}
- name: Checkout repo
uses: actions/checkout@v4
- name: Enable experimental features
run: |
mkdir -p ~/.config/nix
echo "experimental-features = nix-command flakes" >> ~/.config/nix/nix.conf
- name: Prepare SSH keys and known_hosts for builder and Proxmox
run: |
mkdir -p ~/.ssh
echo "${{ secrets.RUNNER_SSH_KEY }}" > ~/.ssh/id_rsa
chmod 600 ~/.ssh/id_rsa
ssh-keyscan -H "$NIXOS_BUILDER_HOST" >> ~/.ssh/known_hosts
chmod 600 ~/.ssh/known_hosts
- name: Test SSH connection to NixOS Builder
run: |
echo "Testing SSH connection to $NIXOS_BUILER_HOST..."
ssh -o StrictHostKeyChecking=yes "$NIXOS_BUILER_USER"@"$NIXOS_BUILER_HOST" "echo 'SSH success. Hostname:' && hostname"
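# Colmena evaluates the hive defined in flake.nix/hive.nix from the checked-out repo and deploys each node over SSH.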
- name: Apply Colmena
id: apply
run: colmena apply

@@ -1,39 +0,0 @@
name: Deploy NixOS VM
on:
workflow_dispatch:
jobs:
deploy:
runs-on: docker
container:
image: nixos/nix
steps:
- name: Checkout repo
uses: actions/checkout@v4
- name: Install Terraform
run: nix-env -iA nixpkgs.terraform
- name: Setup SSH key
run: |
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
env:
SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Terraform Init & Apply
run: |
terraform init
terraform apply -auto-approve
working-directory: ./terraform
env:
PROXMOX_PASSWORD: ${{ secrets.PROXMOX_PASSWORD }}
- name: Deploy NixOS via nixos-anywhere
run: |
nix run github:numtide/nixos-anywhere -- \
--build-on-remote \
--flake .#new-vm \
root@<new-vm-ip>

@@ -1,34 +0,0 @@
name: Terraform Proxmox NixOS VM Deploy
on:
workflow_dispatch:
jobs:
deploy-nixos-vm:
runs-on: nixos-latest
steps:
- name: Install nodejs
run: nix-env -iA nixpkgs.nodejs
- name: Install terraform
run: nix-env -iA nixpkgs.terraform
- name: Install sops
run: nix-env -iA nixpkgs.sops
- name: Checkout repo
uses: actions/checkout@v3
- name: Decrypt secrets
env:
SOPS_AGE_KEY_FILE: ${{ secrets.AGE_KEY_FILE }}
run: |
sops --decrypt secrets.yaml.enc > secrets.yaml
- name: Terraform Init
run: terraform init
- name: Terraform Apply
env:
PROXMOX_PASSWORD: ${{ secrets.PROXMOX_PASSWORD }}
run: terraform apply -auto-approve

.sops.yaml Normal file

@@ -0,0 +1,3 @@
creation_rules:
- age: >-
age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y

@@ -55,7 +55,7 @@
register: qga_json
failed_when: qga_json.rc != 0
- name: Parse out eth0s IPv4 address
- name: Parse out eth0's IPv4 address
ansible.builtin.set_fact:
vm_ipv4: >-
{{
@@ -71,11 +71,11 @@
)
}}
- name: Show the VMs IP
- name: Show the VM's IP
ansible.builtin.debug:
msg: "VM {{ new_vmid }} ({{ new_name }}) reports IPv4: {{ vm_ipv4 }}"
- name: Add new VMs IP to in-memory inventory (for later tasks)
- name: Add new VM's IP to in-memory inventory (for later tasks)
ansible.builtin.add_host:
name: "nixos-{{ new_vmid }}"
ansible_host: "{{ vm_ipv4 }}"

flake.lock generated Normal file

@@ -0,0 +1,154 @@
{
"nodes": {
"colmena": {
"inputs": {
"flake-compat": "flake-compat",
"flake-utils": "flake-utils",
"nix-github-actions": "nix-github-actions",
"nixpkgs": "nixpkgs",
"stable": "stable"
},
"locked": {
"lastModified": 1751144689,
"narHash": "sha256-cgIntaqhcm62V1KU6GmrAGpHpahT4UExEWW2ryS02ZU=",
"owner": "zhaofengli",
"repo": "colmena",
"rev": "3ceec72cfb396a8a8de5fe96a9d75a9ce88cc18e",
"type": "github"
},
"original": {
"owner": "zhaofengli",
"repo": "colmena",
"type": "github"
}
},
"flake-compat": {
"flake": false,
"locked": {
"lastModified": 1650374568,
"narHash": "sha256-Z+s0J8/r907g149rllvwhb4pKi8Wam5ij0st8PwAh+E=",
"owner": "edolstra",
"repo": "flake-compat",
"rev": "b4a34015c698c7793d592d66adbab377907a2be8",
"type": "github"
},
"original": {
"owner": "edolstra",
"repo": "flake-compat",
"type": "github"
}
},
"flake-utils": {
"locked": {
"lastModified": 1659877975,
"narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nix-github-actions": {
"inputs": {
"nixpkgs": [
"colmena",
"nixpkgs"
]
},
"locked": {
"lastModified": 1729742964,
"narHash": "sha256-B4mzTcQ0FZHdpeWcpDYPERtyjJd/NIuaQ9+BV1h+MpA=",
"owner": "nix-community",
"repo": "nix-github-actions",
"rev": "e04df33f62cdcf93d73e9a04142464753a16db67",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "nix-github-actions",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1750134718,
"narHash": "sha256-v263g4GbxXv87hMXMCpjkIxd/viIF7p3JpJrwgKdNiI=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "9e83b64f727c88a7711a2c463a7b16eedb69a84c",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs_2": {
"locked": {
"lastModified": 1751801514,
"narHash": "sha256-Ve3ZTzcXEGt4IoXLsWqk35w3w4cH5G1MJb+gLdj/jtE=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "4e3e6431fd60d653bb7f4fa5487e2c500d50f49f",
"type": "github"
},
"original": {
"owner": "nixos",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"colmena": "colmena",
"nixpkgs": "nixpkgs_2",
"sops-nix": "sops-nix"
}
},
"sops-nix": {
"inputs": {
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1751606940,
"narHash": "sha256-KrDPXobG7DFKTOteqdSVeL1bMVitDcy7otpVZWDE6MA=",
"owner": "Mic92",
"repo": "sops-nix",
"rev": "3633fc4acf03f43b260244d94c71e9e14a2f6e0d",
"type": "github"
},
"original": {
"owner": "Mic92",
"repo": "sops-nix",
"type": "github"
}
},
"stable": {
"locked": {
"lastModified": 1750133334,
"narHash": "sha256-urV51uWH7fVnhIvsZIELIYalMYsyr2FCalvlRTzqWRw=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "36ab78dab7da2e4e27911007033713bab534187b",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-25.05",
"repo": "nixpkgs",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

flake.nix Normal file

@@ -0,0 +1,34 @@
{
description = "Declarative NixOS HomeLab";
inputs = {
nixpkgs.url = "github:nixos/nixpkgs";
# systems.url = "github:nix-systems/default";
sops-nix = {
url = "github:Mic92/sops-nix";
inputs.nixpkgs.follows = "nixpkgs";
};
# home-manager = {
# url = "home-manager";
# inputs.nixpkgs.follows = "nixpkgs";
# };
colmena.url = "github:zhaofengli/colmena";
};
outputs = {
self,
nixpkgs,
# systems,
sops-nix,
# home-manager,
colmena,
...
} @ inputs: let
overlays = [
colmena.overlays.default
];
in {
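# colmenaHive is the evaluated hive consumed by the colmena CLI's flake evaluation;
# colmena is the raw hive definition imported from hive.nix.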
colmenaHive = colmena.lib.makeHive self.outputs.colmena;
colmena = (import ./hive.nix) (inputs // {inherit overlays;});
};
}

hive.nix Normal file

@@ -0,0 +1,48 @@
inputs @ {
self,
nixpkgs,
sops-nix,
# home-manager,
overlays,
...
}: {
meta = {
nixpkgs = import nixpkgs {
system = "x86_64-linux";
};
specialArgs.flakeInputs = inputs;
};
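# Defaults applied to every node in the hive; per-machine definitions below can override them.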
defaults = {
pkgs,
lib,
name,
nodes,
meta,
config,
...
}: {
imports = [
./machines/_default
./machines/modules
sops-nix.nixosModules.sops
# home-manager.nixosModules.home-manager
];
nixpkgs = {
inherit overlays;
system = lib.mkDefault "x86_64-linux";
config.allowUnfree = true;
};
deployment.tags = [config.nixpkgs.system name];
};
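# Per-node definitions: each node imports machines/<name>/definition.nix and adds its own deployment tags.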
sandbox = {name, ...}: {
imports = [./machines/${name}/definition.nix];
deployment.tags = ["sandbox"];
};
monitor = {name, ...}: {
imports = [./machines/${name}/definition.nix];
deployment.tags = ["grafana" "prometheus"];
};
}

@@ -0,0 +1,89 @@
{
pkgs,
lib,
modulesPath,
...
}: {
imports = [
# Enables QEMU Guest Agent support in the VM
(modulesPath + "/profiles/qemu-guest.nix")
];
services.qemuGuest.enable = lib.mkDefault true;
boot.loader.grub.enable = lib.mkDefault true;
boot.loader.grub.devices = ["nodev"];
boot.growPartition = lib.mkDefault true;
boot.tmp.cleanOnBoot = true;
fileSystems."/" = lib.mkDefault {
device = "/dev/disk/by-label/nixos";
autoResize = true; # grow on first boot
fsType = "ext4";
};
nix = {
settings.experimental-features = ["nix-command" "flakes"];
gc.automatic = true;
gc.options = "--delete-older-than 15d";
gc.dates = "daily";
optimise.automatic = true;
settings = {
auto-optimise-store = true;
allowed-users = ["@wheel"];
trusted-users = ["root" "@wheel"];
};
extraOptions = ''
keep-outputs = true
keep-derivations = true
'';
};
security.sudo.wheelNeedsPassword = false;
users.users.plasmagoat = {
isNormalUser = true;
description = "plasmagoat";
extraGroups = ["wheel" "docker"];
# shell = pkgs.zsh;
# shell = pkgs.fish;
};
services.openssh.enable = true;
services.openssh.openFirewall = true;
services.openssh.settings.PasswordAuthentication = false;
services.openssh.settings.PermitRootLogin = "prohibit-password";
services.openssh.settings.KbdInteractiveAuthentication = false;
services.sshguard.enable = true;
programs.ssh.startAgent = true;
users.users.plasmagoat.openssh.authorizedKeys.keys = [
"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCeg/n/vst9KME8byhxX2FhA+FZNQ60W38kkNt45eNzK5zFqBYuwo1nDXVanJSh9unRvB13b+ygpZhrb4sHvkETGWiEioc49MiWr8czEhu6Wpo0vv5MAJkiYvGZUYPdUW52jUzWcYdw8PukG2rowrxL5G0CmsqLwHMPU2FyeCe5aByFI/JZb8R80LoEacgjUiipJcoLWUVgG2koMomHClqGu+16kB8nL5Ja3Kc9lgLfDK7L0A5R8JXhCjrlEsmXbxZmwDKuxvjDAZdE9Sl1VZmMDfWkyrRlenrt01eR3t3Fec6ziRm5ZJk9e2Iu1DPoz+PoHH9aZGVwmlvvnr/gMF3OILxcqb0qx+AYlCCnb6D6pJ9zufhZkKcPRS1Q187F6fz+v2oD1xLZWFHJ92+7ItM0WmbDOHOC29s5EA6wNm3iXZCq86OI3n6T34njDtPqh6Z7Pk2sdK4GBwnFj4KwEWXvdKZKSX1qb2EVlEBE9QI4Gf3eg4SiBu2cAFt3nOSzs8c= asol\dbs@ALPHA-DBS-P14sG2"
"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC+U3DWOrklcA8n8wdbLBGyli5LsJI3dpL2Zod8mx8eOdC4H127ZT1hzuk2uSmkic4c73BykPyQv8rcqwaRGW94xdMRanKmHYxnbHXo5FBiGrCkNlNNZuahthAGO49c6sUhJMq0eLhYOoFWjtf15sr5Zu7Ug2YTUL3HXB1o9PZ3c9sqYHo2rC/Il1x2j3jNAMKST/qUZYySvdfNJEeQhMbQcdoKJsShcE3oGRL6DFBoV/mjJAJ+wuDhGLDnqi79nQjYfbYja1xKcrKX+D3MfkFxFl6ZIzomR1t75AnZ+09oaWcv1J7ehZ3h9PpDBFNXvzyLwDBMNS+UYcH6SyFjkUbF David@NZXT"
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICUP7m8jZJiclZGfSje8CeBYFhX10SrdtjYziuChmj1X plasmagoat@macbook-air"
];
users.users.root.openssh.authorizedKeys.keys = [
"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCeg/n/vst9KME8byhxX2FhA+FZNQ60W38kkNt45eNzK5zFqBYuwo1nDXVanJSh9unRvB13b+ygpZhrb4sHvkETGWiEioc49MiWr8czEhu6Wpo0vv5MAJkiYvGZUYPdUW52jUzWcYdw8PukG2rowrxL5G0CmsqLwHMPU2FyeCe5aByFI/JZb8R80LoEacgjUiipJcoLWUVgG2koMomHClqGu+16kB8nL5Ja3Kc9lgLfDK7L0A5R8JXhCjrlEsmXbxZmwDKuxvjDAZdE9Sl1VZmMDfWkyrRlenrt01eR3t3Fec6ziRm5ZJk9e2Iu1DPoz+PoHH9aZGVwmlvvnr/gMF3OILxcqb0qx+AYlCCnb6D6pJ9zufhZkKcPRS1Q187F6fz+v2oD1xLZWFHJ92+7ItM0WmbDOHOC29s5EA6wNm3iXZCq86OI3n6T34njDtPqh6Z7Pk2sdK4GBwnFj4KwEWXvdKZKSX1qb2EVlEBE9QI4Gf3eg4SiBu2cAFt3nOSzs8c= asol\\dbs@ALPHA-DBS-P14sG2"
"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC+U3DWOrklcA8n8wdbLBGyli5LsJI3dpL2Zod8mx8eOdC4H127ZT1hzuk2uSmkic4c73BykPyQv8rcqwaRGW94xdMRanKmHYxnbHXo5FBiGrCkNlNNZuahthAGO49c6sUhJMq0eLhYOoFWjtf15sr5Zu7Ug2YTUL3HXB1o9PZ3c9sqYHo2rC/Il1x2j3jNAMKST/qUZYySvdfNJEeQhMbQcdoKJsShcE3oGRL6DFBoV/mjJAJ+wuDhGLDnqi79nQjYfbYja1xKcrKX+D3MfkFxFl6ZIzomR1t75AnZ+09oaWcv1J7ehZ3h9PpDBFNXvzyLwDBMNS+UYcH6SyFjkUbF David@NZXT"
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICUP7m8jZJiclZGfSje8CeBYFhX10SrdtjYziuChmj1X plasmagoat@macbook-air"
];
environment.systemPackages = with pkgs; [
dig
nmap
traceroute
vim
git
curl
python3
];
time.timeZone = "Europe/Copenhagen";
console.keyMap = "dk-latin1";
}

@@ -0,0 +1,40 @@
{
lib,
name,
...
}: {
imports = [
./common_config.nix
];
networking.hostName = name;
deployment = {
replaceUnknownProfiles = lib.mkDefault true;
buildOnTarget = lib.mkDefault false;
targetHost = lib.mkDefault "${name}.lab";
tags = lib.mkDefault ["homelab"];
};
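# Every node decrypts its secrets with the host-local age key at /etc/sops/age.key.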
sops = {
age.keyFile = "/etc/sops/age.key";
defaultSopsFile = ../../secrets/secrets.yml;
};
# home-manager = {
# useGlobalPkgs = true;
# useUserPackages = true;
# users.cottand = {
# imports = with flakeInputs.cottand.homeManagerModules; [cli];
# home.stateVersion = "22.11";
# };
# users.root = {
# imports = with flakeInputs.cottand.homeManagerModules; [cli];
# home.stateVersion = "22.11";
# };
# };
# consulNode.enable = lib.mkDefault true;
nodeExporter.enable = lib.mkDefault true;
journalLog.enable = lib.mkDefault true;
}

@@ -0,0 +1,11 @@
{
imports = [
./node-exporter.nix
./journal-log.nix
# ./wireguard.nix
# ./nomad.nix
# ./vault.nix
# ./vaultSecret.nix
# ./consul.nix
];
}

@@ -0,0 +1,95 @@
{
lib,
config,
nodes,
# name,
# meta,
...
}:
with lib; let
cfg = config.journalLog;
in {
options.journalLog = {
enable = mkOption {
type = types.bool;
default = false;
};
port = mkOption {
type = types.port;
default = 9080;
};
clientUrl = mkOption {
type = types.str;
default = "http://monitor.lab:3100/loki/api/v1/push";
};
extraConfig = mkOption {
type = types.attrs;
default = {};
};
};
config = mkIf cfg.enable {
networking.firewall.allowedTCPPorts = [cfg.port];
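# Promtail records how far it has read the journal in /var/lib/promtail/positions.yaml,
# so the directory must exist with the right ownership before the service starts.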
systemd.tmpfiles.rules = [
"d /var/lib/promtail 0755 promtail promtail -"
];
services.promtail = {
enable = true;
configuration = {
server = {
http_listen_port = cfg.port;
grpc_listen_port = 0;
};
positions = {
filename = "/var/lib/promtail/positions.yaml";
};
clients = [
{
url = cfg.clientUrl;
}
];
scrape_configs = [
{
job_name = "journal";
journal = {
path = "/var/log/journal";
labels = {
job = "promtail";
host = config.networking.hostName;
env = "proxmox";
instance = "${config.networking.hostName}.lab";
};
};
relabel_configs = [
{
source_labels = ["__journal__systemd_unit"];
target_label = "unit";
}
{
source_labels = ["__journal__hostname"];
target_label = "host";
}
{
source_labels = ["__journal__systemd_user_unit"];
target_label = "user_unit";
}
{
source_labels = ["__journal__transport"];
target_label = "transport";
}
{
source_labels = ["__journal_priority_keyword"];
target_label = "severity";
}
];
}
];
};
};
};
}

@@ -0,0 +1,40 @@
{
lib,
config,
# name,
# meta,
...
}:
with lib; let
cfg = config.nodeExporter;
in {
options.nodeExporter = {
enable = mkOption {
type = types.bool;
default = false;
};
port = mkOption {
type = types.port;
default = 9100;
};
extraConfig = mkOption {
type = types.attrs;
default = {};
};
};
config = mkIf cfg.enable {
networking.firewall.allowedTCPPorts = [cfg.port];
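# Attributes from cfg.extraConfig are merged on top of these defaults via the // update below.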
services.prometheus.exporters.node =
{
enable = true;
enabledCollectors = ["systemd"];
port = cfg.port;
extraFlags = ["--collector.ethtool" "--collector.softirqs" "--collector.tcpstat" "--collector.wifi"];
}
// cfg.extraConfig;
};
}

@@ -0,0 +1,63 @@
{
config,
pkgs,
...
}: let
alertmanagerEnv = config.sops.secrets."alertmanager/env".path;
in {
sops.secrets."alertmanager/env" = {
sopsFile = ../../secrets/secrets.yaml;
mode = "0440";
};
services.prometheus.alertmanager = {
enable = true;
openFirewall = true;
environmentFile = alertmanagerEnv;
webExternalUrl = "http://monitor.lab:9093"; # optional but helpful
configuration = {
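# Alerts with severity warning or critical are routed to the Telegram receiver;
# everything else falls through to the "null" receiver and is dropped.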
route = {
receiver = "null";
group_by = ["alertname"];
group_wait = "10s";
group_interval = "5m";
repeat_interval = "4h";
routes = [
{
receiver = "telegram";
matchers = [
"severity =~ \"warning|critical\""
];
group_wait = "10s";
continue = true;
}
];
};
receivers = [
{name = "null";}
{
name = "telegram";
telegram_configs = [
{
api_url = "https://api.telegram.org";
bot_token = "$TELEGRAM_BOT_TOKEN";
chat_id = -1002642560007;
message_thread_id = 4;
parse_mode = "HTML";
send_resolved = true;
message = "{{ template \"telegram.message\" . }}";
}
];
}
];
templates = [
(pkgs.writeText "telegram.tmpl" (builtins.readFile ./provisioning/templates/telegram.tmpl))
# (pkgs.writeText "telegram.markdown.v2.tmpl" (builtins.readFile ./provisioning/templates/telegram.markdown.v2.tmpl))
];
};
};
}

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,692 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Traefik dashboard prometheus",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 2,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 10,
"panels": [],
"title": "$backend stats",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
}
},
"decimals": 0,
"mappings": [],
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 1
},
"id": 2,
"maxDataPoints": 3,
"options": {
"displayLabels": [],
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"values": ["value", "percent"]
},
"pieType": "pie",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"text": {},
"tooltip": {
"hideZeros": false,
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "12.0.0+security-01",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"exemplar": true,
"expr": "traefik_service_requests_total{service=\"$service\"}",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{method}} : {{code}}",
"refId": "A"
}
],
"title": "$service return code",
"type": "piechart"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "rgb(31, 120, 193)",
"mode": "fixed"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ms"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 1
},
"id": 4,
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": ["mean"],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.0.0+security-01",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"exemplar": true,
"expr": "sum(traefik_service_request_duration_seconds_sum{service=\"$service\"}) / sum(traefik_service_requests_total{service=\"$service\"}) * 1000",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
}
],
"title": "$service response time",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "bars",
"fillOpacity": 100,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"alertThreshold": true,
"legend": {
"calcs": ["mean", "max", "min"],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.0.0+security-01",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"exemplar": true,
"expr": "sum(rate(traefik_service_requests_total{service=\"$service\"}[5m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "Total requests $service",
"refId": "A"
}
],
"title": "Total requests over 5min $service",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 15
},
"id": 12,
"panels": [],
"title": "Global stats",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "bars",
"fillOpacity": 100,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 16
},
"id": 5,
"options": {
"alertThreshold": true,
"legend": {
"calcs": ["lastNotNull", "max", "min"],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.0.0+security-01",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code=\"200\"}[5m])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{method}} : {{code}}",
"refId": "A"
}
],
"title": "Status code 200 over 5min",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "bars",
"fillOpacity": 100,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 16
},
"id": 6,
"options": {
"alertThreshold": true,
"legend": {
"calcs": ["lastNotNull", "max", "min"],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.0.0+security-01",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code!=\"200\"}[5m])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ method }} : {{code}}",
"refId": "A"
}
],
"title": "Others status code over 5min",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
}
},
"decimals": 0,
"mappings": [],
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 23
},
"id": 7,
"maxDataPoints": 3,
"options": {
"displayLabels": [],
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"values": ["value"]
},
"pieType": "pie",
"reduceOptions": {
"calcs": ["sum"],
"fields": "",
"values": false
},
"text": {},
"tooltip": {
"hideZeros": false,
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "12.0.0+security-01",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"exemplar": true,
"expr": "sum(rate(traefik_service_requests_total[5m])) by (service) ",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{ service }}",
"refId": "A"
}
],
"title": "Requests by service",
"type": "piechart"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
}
},
"decimals": 0,
"mappings": [],
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 23
},
"id": 8,
"maxDataPoints": 3,
"options": {
"displayLabels": [],
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"values": ["value"]
},
"pieType": "pie",
"reduceOptions": {
"calcs": ["sum"],
"fields": "",
"values": false
},
"text": {},
"tooltip": {
"hideZeros": false,
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "12.0.0+security-01",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"exemplar": true,
"expr": "sum(rate(traefik_entrypoint_requests_total{entrypoint =~ \"$entrypoint\"}[5m])) by (entrypoint) ",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{ entrypoint }}",
"refId": "A"
}
],
"title": "Requests by protocol",
"type": "piechart"
}
],
"preload": false,
"schemaVersion": 41,
"tags": ["traefik", "prometheus"],
"templating": {
"list": [
{
"current": {},
"datasource": "Prometheus",
"definition": "label_values({job=\"traefik\"},service)",
"includeAll": false,
"name": "service",
"options": [],
"query": {
"qryType": 1,
"query": "label_values({job=\"traefik\"},service)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
"regex": "",
"type": "query"
},
{
"current": {},
"datasource": "Prometheus",
"definition": "",
"includeAll": true,
"multi": true,
"name": "entrypoint",
"options": [],
"query": {
"query": "label_values(entrypoint)",
"refId": "Prometheus-entrypoint-Variable-Query"
},
"refresh": 1,
"regex": "",
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Traefik",
"uid": "qPdAviJmz",
"version": 1
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,13 @@
{
imports = [
./alertmanager.nix
./prometheus.nix
./influxdb.nix
./loki.nix
./grafana.nix
./jellyfin-exporter.nix
];
system.stateVersion = "25.05";
}

@@ -0,0 +1,126 @@
{
config,
pkgs,
modulesPath,
lib,
...
}: {
services.grafana.enable = true;
services.grafana.settings = {
server = {
http_port = 3000;
http_addr = "0.0.0.0";
# Grafana needs to know on which domain and URL it's running
domain = "grafana.procopius.dk";
root_url = "https://grafana.procopius.dk"; # Not needed if it is `https://your.domain/`
# serve_from_sub_path = true;
oauth_auto_login = false;
};
"auth.generic_oauth" = {
enabled = false;
};
"auth" = {
disable_login_form = false;
};
};
networking.firewall.allowedTCPPorts = [3000];
services.grafana = {
# declarativePlugins = with pkgs.grafanaPlugins; [ ... ];
provision = {
enable = true;
datasources.settings.datasources = [
# "Built-in" datasources can be provisioned - c.f. https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources
{
uid = "prometheus";
name = "Prometheus";
type = "prometheus";
url = "http://127.0.0.1:${toString config.services.prometheus.port}";
}
{
uid = "loki";
name = "Loki";
type = "loki";
url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}";
}
{
uid = "influxdb";
name = "InfluxDB";
type = "influxdb";
url = "http://127.0.0.1:8086";
access = "proxy";
jsonData = {
dbName = "proxmox";
httpHeaderName1 = "Authorization";
};
secureJsonData = {
httpHeaderValue1 = "Token iY4MTuqUAVJbBkDUiMde";
};
}
];
# Note: removing attributes from the above `datasources.settings.datasources` is not enough for them to be deleted on `grafana`;
# One needs to use the following option:
# datasources.settings.deleteDatasources = [ { name = "prometheus"; orgId = 1; } { name = "loki"; orgId = 1; } ];
dashboards.settings.providers = [
{
name = "my dashboards";
options.path = "/etc/grafana-dashboards";
}
];
};
};
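# The dashboard provider above scans /etc/grafana-dashboards; the environment.etc entries
# below install the provisioned JSON dashboards into that directory.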
environment.etc."grafana-dashboards/traefik.json" = {
source = ./dashboards/traefik.json;
user = "grafana";
group = "grafana";
mode = "0644";
};
environment.etc."grafana-dashboards/traefik-access.json" = {
source = ./dashboards/traefik-access.json;
user = "grafana";
group = "grafana";
mode = "0644";
};
environment.etc."grafana-dashboards/grafana-traefik.json" = {
source = ./dashboards/grafana-traefik.json;
user = "grafana";
group = "grafana";
mode = "0644";
};
environment.etc."grafana-dashboards/node-exporter.json" = {
source = ./dashboards/node-exporter.json;
user = "grafana";
group = "grafana";
mode = "0644";
};
environment.etc."grafana-dashboards/promtail.json" = {
source = ./dashboards/promtail.json;
user = "grafana";
group = "grafana";
mode = "0644";
};
environment.etc."grafana-dashboards/gitea.json" = {
source = ./dashboards/gitea.json;
user = "grafana";
group = "grafana";
mode = "0644";
};
environment.etc."grafana-dashboards/postgres.json" = {
source = ./dashboards/postgres.json;
user = "grafana";
group = "grafana";
mode = "0644";
};
}

@@ -0,0 +1,35 @@
{
config,
pkgs,
...
}: let
influxdbPassword = config.sops.secrets."influxdb/password".path;
influxdbToken = config.sops.secrets."influxdb/token".path;
in {
sops.secrets."influxdb/password" = {
sopsFile = ../../secrets/secrets.yaml;
owner = "influxdb2";
};
sops.secrets."influxdb/token" = {
sopsFile = ../../secrets/secrets.yaml;
owner = "influxdb2";
};
networking.firewall.allowedTCPPorts = [8086];
services.influxdb2 = {
enable = true;
settings = {
};
provision = {
enable = true;
initialSetup = {
username = "plasmagoat";
passwordFile = influxdbPassword;
tokenFile = influxdbToken;
organization = "procopius";
bucket = "proxmox";
};
};
};
}

@@ -0,0 +1,14 @@
{
virtualisation.oci-containers.containers = {
jellyfin_exporter = {
image = "rebelcore/jellyfin-exporter:latest";
ports = [
"9594:9594"
];
cmd = [
"--jellyfin.address=http://media.lab:8096"
"--jellyfin.token=f7c89e5aa307434c9b3ecb329e896335"
];
};
};
}

machines/monitor/loki.nix Normal file

@@ -0,0 +1,37 @@
{
networking.firewall.allowedTCPPorts = [ 3100 ];
services.loki = {
enable = true;
configuration = {
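# Single-node setup: in-memory ring with replication_factor 1 and chunks on the local filesystem.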
server.http_listen_port = 3100;
auth_enabled = false;
analytics.reporting_enabled = false;
common = {
ring = {
instance_addr = "127.0.0.1";
kvstore.store = "inmemory";
};
replication_factor = 1;
path_prefix = "/tmp/loki";
};
schema_config = {
configs = [
{
from = "2020-05-15";
store = "tsdb";
object_store = "filesystem";
schema = "v13";
index = {
prefix = "index_";
period = "24h";
};
}
];
};
storage_config.filesystem.directory = "/var/lib/loki/chunk";
};
};
}

@@ -0,0 +1,185 @@
{
config,
pkgs,
modulesPath,
lib,
...
}: let
monitor_hostname = "monitor.lab";
traefik_hostname = "traefik.lab";
sandbox_hostname = "sandbox.lab";
forgejo_hostname = "forgejo.lab";
runner01_hostname = "forgejo-runner-01.lab";
dnsmasq_hostname = "dns.lab";
media_hostname = "media.lab";
mail_hostname = "mail.lab";
keycloak_hostname = "keycloak.lab";
monitored_hosts = [
monitor_hostname
traefik_hostname
sandbox_hostname
forgejo_hostname
runner01_hostname
dnsmasq_hostname
media_hostname
mail_hostname
keycloak_hostname
];
# integrate colmena names and targetHost to generate nodeexporters
generateTargets = port:
map (host: "${host}:${toString port}") monitored_hosts;
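# Rewrite the instance label to the bare hostname so targets show up as "host.lab" rather than "host.lab:port".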
instance_relabel_config = [
{
source_labels = ["__address__"];
regex = "([^:]+):\\d+"; # Captures everything before the last colon
target_label = "instance";
replacement = "$1";
}
];
node_exporter_port = 9100;
node_exporter_job = {
job_name = "node";
static_configs = [{targets = generateTargets node_exporter_port;}];
relabel_configs = instance_relabel_config;
};
promtail_port = 9080;
promtail_job = {
job_name = "promtail";
static_configs = [{targets = generateTargets promtail_port;}];
relabel_configs = instance_relabel_config;
};
prometheus_target = "${monitor_hostname}:9090";
prometheus_job = {
job_name = "prometheus";
static_configs = [{targets = [prometheus_target];}];
relabel_configs = instance_relabel_config;
};
alertmanager_target = "${monitor_hostname}:9093";
alertmanager_job = {
job_name = "alertmanager";
static_configs = [{targets = [alertmanager_target];}];
relabel_configs = instance_relabel_config;
};
grafana_target = "${monitor_hostname}:3000";
grafana_job = {
job_name = "grafana";
static_configs = [{targets = [grafana_target];}];
relabel_configs = instance_relabel_config;
};
traefik_monitor_port = 8082;
traefik_job = {
job_name = "traefik";
static_configs = [{targets = ["${traefik_hostname}:${toString traefik_monitor_port}"];}];
relabel_configs = instance_relabel_config;
};
forgejo_monitor_port = 3000;
forgejo_job = {
job_name = "forgejo";
static_configs = [{targets = ["${forgejo_hostname}:${toString forgejo_monitor_port}"];}];
relabel_configs = instance_relabel_config;
};
postgres_exporter_port = 9187;
postgres_job = {
job_name = "postgres";
static_configs = [{targets = ["${forgejo_hostname}:${toString postgres_exporter_port}"];}];
relabel_configs = instance_relabel_config;
};
dnsmasq_exporter_port = 9153;
dnsmasq_job = {
job_name = "dnsmasq";
static_configs = [{targets = ["${dnsmasq_hostname}:${toString dnsmasq_exporter_port}"];}];
relabel_configs = instance_relabel_config;
};
# --- Media Stack Scrape Job ---
media_stack_job = {
job_name = "media_stack";
static_configs = [
{
targets = [
"${media_hostname}:9707" # sonarr
"${media_hostname}:9708" # readarr
"${media_hostname}:9709" # radarr
"${media_hostname}:9710" # prowlarr
"${media_hostname}:9711" # lidarr
"${media_hostname}:9712" # bazarr
];
}
];
relabel_configs = instance_relabel_config;
};
jellyfin_port = 8096;
jellyfin_exporter_port = 9594;
jellyfin_job = {
job_name = "jellyfin";
static_configs = [
{
targets = [
"${media_hostname}:${toString jellyfin_port}"
"${monitor_hostname}:${toString jellyfin_exporter_port}"
];
}
];
relabel_configs = instance_relabel_config;
};
in {
networking.firewall.allowedTCPPorts = [9090];
services.prometheus = {
enable = true;
retentionTime = "7d";
globalConfig = {
scrape_timeout = "10s";
scrape_interval = "30s";
# A short evaluation_interval will check alerting rules very often.
# It can be costly if you run Prometheus with 100+ alerts.
evaluation_interval = "20s";
};
extraFlags = [
"--web.enable-admin-api"
];
scrapeConfigs = [
node_exporter_job
promtail_job
prometheus_job
alertmanager_job
grafana_job
traefik_job
forgejo_job
postgres_job
dnsmasq_job
media_stack_job
jellyfin_job
];
alertmanagers = [
{
scheme = "http";
static_configs = [{targets = [alertmanager_target];}];
}
];
ruleFiles = [
(pkgs.writeText "prometheus-alerts.yml" (builtins.readFile ./provisioning/alerts/prometheus-alerts.yml))
(pkgs.writeText "loki-alerts.yml" (builtins.readFile ./provisioning/alerts/loki-alerts.yml))
(pkgs.writeText "promtail-alerts.yml" (builtins.readFile ./provisioning/alerts/promtail-alerts.yml))
(pkgs.writeText "postgres-alerts.yml" (builtins.readFile ./provisioning/alerts/postgres-alerts.yml))
(pkgs.writeText "traefik-alerts.yml" (builtins.readFile ./provisioning/alerts/traefik-alerts.yml))
(pkgs.writeText "node-exporter-alerts.yml" (builtins.readFile ./provisioning/alerts/node-exporter-alerts.yml))
];
};
}

@@ -0,0 +1,39 @@
groups:
- name: Loki
rules:
- alert: LokiProcessTooManyRestarts
expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
for: 0m
labels:
severity: warning
annotations:
summary: Loki process too many restarts (instance {{ $labels.instance }})
description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestErrors
expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
for: 15m
labels:
severity: critical
annotations:
summary: Loki request errors (instance {{ $labels.instance }})
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestPanic
expr: "sum(increase(loki_panic_total[10m])) by (namespace, job) > 0"
for: 5m
labels:
severity: critical
annotations:
summary: Loki request panic (instance {{ $labels.instance }})
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestLatency
expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request latency (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -0,0 +1,318 @@
groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: "(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)"
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: "(rate(node_vmstat_pgmajfault[5m]) > 1000)"
for: 0m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized
expr: "min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8"
for: 0m
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: "((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)"
for: 0m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: "((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)"
for: 0m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
expr: "(rate(node_disk_io_time_seconds_total[5m]) > .80)"
for: 0m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
for: 2m
labels:
severity: critical
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskMayFillIn24Hours
expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: "(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)"
for: 2m
labels:
severity: critical
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesMayFillIn24Hours
expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: "(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)"
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: "(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)"
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80'
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuIsUnderutilized
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.95'
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }})
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: "rate(node_disk_io_time_seconds_total[5m]) > 0.8"
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostContextSwitchingHigh
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching high (instance {{ $labels.instance }})
description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: "((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)"
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1)'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: "node_hwmon_temp_celsius > node_hwmon_temp_max_celsius"
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: "((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))"
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSoftwareRaidInsufficientDrives
expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
for: 0m
labels:
severity: critical
annotations:
summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSoftwareRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0)'
for: 2m
labels:
severity: warning
annotations:
summary: Host software RAID disk failure (instance {{ $labels.instance }})
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: "changes(node_uname_info[1h]) > 0"
for: 0m
labels:
severity: info
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: "(increase(node_vmstat_oom_kill[1m]) > 0)"
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: "(increase(node_edac_correctable_errors_total[1m]) > 0)"
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: "(node_edac_uncorrectable_errors_total > 0)"
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: "(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)"
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: "(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)"
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: "((node_bonding_active - node_bonding_slaves) != 0)"
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)"
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: "((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))"
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: "(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)"
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -0,0 +1,201 @@
groups:
- name: Postgres
rules:
- alert: PostgresqlDown
expr: "pg_up == 0"
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql down (instance {{ $labels.instance }})
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlRestarted
expr: "time() - pg_postmaster_start_time_seconds < 60"
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql restarted (instance {{ $labels.instance }})
description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlExporterError
expr: "pg_exporter_last_scrape_error > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql exporter error (instance {{ $labels.instance }})
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTableNotAutoVacuumed
expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTableNotAutoAnalyzed
expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyConnections
expr: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql too many connections (instance {{ $labels.instance }})
description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlNotEnoughConnections
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 2'
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql not enough connections (instance {{ $labels.instance }})
description: "PostgreSQL instance should have more connections (> 2)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlDeadLocks
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql dead locks (instance {{ $labels.instance }})
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRollbackRate
expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql high rollback rate (instance {{ $labels.instance }})
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlCommitRateLow
expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql commit rate low (instance {{ $labels.instance }})
description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlLowXidConsumption
expr: "rate(pg_txid_current[1m]) < 5"
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql low XID consumption (instance {{ $labels.instance }})
description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRateStatementTimeout
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRateDeadlock
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlUnusedReplicationSlot
expr: "pg_replication_slots_active == 0"
for: 1m
labels:
severity: warning
annotations:
summary: Postgresql unused replication slot (instance {{ $labels.instance }})
description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyDeadTuples
expr: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1"
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlConfigurationChanged
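        # The long character-class chain is an RE2-compatible way to exclude pg_settings_transaction_read_only
        # (RE2 has no negative lookahead); that setting changes on replica promotion and would otherwise cause
        # false positives. Every other pg_settings_* metric is compared against its own value 5 minutes earlier.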
expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
for: 0m
labels:
severity: info
annotations:
summary: Postgresql configuration changed (instance {{ $labels.instance }})
description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlSslCompressionActive
expr: "sum(pg_stat_ssl_compression) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql SSL compression active (instance {{ $labels.instance }})
description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyLocksAcquired
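        # Compares currently held locks with the approximate lock-table capacity (max_locks_per_transaction * max_connections); sustained usage above 20% usually means max_locks_per_transaction needs to be raised.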
expr: "((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20"
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlBloatIndexHigh(>80%)
expr: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
for: 1h
labels:
severity: warning
annotations:
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlBloatTableHigh(>80%)
expr: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
for: 1h
labels:
severity: warning
annotations:
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlInvalidIndex
expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
for: 6h
labels:
severity: warning
annotations:
summary: Postgresql invalid index (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlReplicationLag
expr: "pg_replication_lag_seconds > 5"
for: 30s
labels:
severity: warning
annotations:
summary: Postgresql replication lag (instance {{ $labels.instance }})
description: "The PostgreSQL replication lag is high (> 5s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -0,0 +1,255 @@
groups:
- name: Prometheus
rules:
- alert: PrometheusJobMissing
expr: 'absent(up{job="prometheus"})'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus job missing (instance {{ $labels.instance }})
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetMissing
expr: "up == 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus target missing (instance {{ $labels.instance }})
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAllTargetsMissing
expr: "sum by (job) (up) == 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus all targets missing (instance {{ $labels.instance }})
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetMissingWithWarmupTime
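        # Joins "up == 0" with a per-instance uptime check (node_time_seconds - node_boot_time_seconds > 600), so a down target only alerts once its host has been booted for more than 10 minutes.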
expr: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusConfigurationReloadFailure
expr: "prometheus_config_last_reload_successful != 1"
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTooManyRestarts
expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus too many restarts (instance {{ $labels.instance }})
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAlertmanagerJobMissing
expr: 'absent(up{job="alertmanager"})'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAlertmanagerConfigurationReloadFailure
expr: "alertmanager_config_last_reload_successful != 1"
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAlertmanagerConfigNotSynced
expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAlertmanagerE2eDeadManSwitch
expr: "vector(1)"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusNotConnectedToAlertmanager
expr: "prometheus_notifications_alertmanagers_discovered < 1"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusRuleEvaluationFailures
expr: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTemplateTextExpansionFailures
expr: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusRuleEvaluationSlow
expr: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusNotificationsBacklog
expr: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: "rate(alertmanager_notifications_failed_total[1m]) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetEmpty
expr: "prometheus_sd_discovered_targets == 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus target empty (instance {{ $labels.instance }})
description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetScrapingSlow
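        # prometheus_target_interval_length_seconds measures the actual time between scrapes; a 90th percentile more than 5% above the median means scrapes are regularly running late.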
expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus target scraping slow (instance {{ $labels.instance }})
description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusLargeScrape
expr: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus large scrape (instance {{ $labels.instance }})
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetScrapeDuplicate
expr: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0"
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbCheckpointCreationFailures
expr: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbCheckpointDeletionFailures
expr: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbCompactionsFailed
expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbHeadTruncationsFailed
expr: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbReloadFailures
expr: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbWalCorruptions
expr: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTsdbWalTruncationsFailed
expr: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTimeseriesCardinality
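        # label_replace copies the metric name from __name__ into a regular "name" label so it can be referenced as {{ $labels.name }} in the annotations below.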
expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -0,0 +1,21 @@
groups:
- name: Promtail
rules:
- alert: PromtailRequestErrors
expr: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10'
for: 5m
labels:
severity: critical
annotations:
summary: Promtail request errors (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PromtailRequestLatency
expr: "histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1"
for: 5m
labels:
severity: critical
annotations:
summary: Promtail request latency (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -0,0 +1,30 @@
groups:
- name: Traefik
rules:
- alert: TraefikServiceDown
expr: "count(traefik_service_server_up) by (service) == 0"
for: 0m
labels:
severity: critical
annotations:
summary: Traefik service down (instance {{ $labels.instance }})
description: "All Traefik services are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TraefikHighHttp4xxErrorRateService
expr: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
for: 1m
labels:
severity: critical
annotations:
summary: Traefik high HTTP 4xx error rate service (instance {{ $labels.instance }})
description: "Traefik service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TraefikHighHttp5xxErrorRateService
expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
for: 1m
labels:
severity: critical
annotations:
summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -0,0 +1,29 @@
{{ define "alert_list" }}{{ range . }}
---
🪪 <b>{{ .Labels.alertname | html }}</b>
{{- if eq .Labels.severity "critical" }}
🚨 CRITICAL 🚨 {{ end }}
{{- if eq .Labels.severity "warning" }}
⚠️ WARNING ⚠️{{ end }}
{{- if .Annotations.summary }}
📝 {{ .Annotations.summary | html }}{{ end }}
{{- if .Annotations.description }}
📖 {{ .Annotations.description | html }}{{ end }}
🏷 Labels:
{{ range .Labels.SortedPairs }} <i>{{ .Name | html }}</i>: <code>{{ .Value | html }}</code>
{{ end }}{{ end }}
🛠 <a href="https://grafana.procopius.dk">Grafana</a>
💊 <a href="https://alertmanager.procopius.dk">Alertmanager</a>
{{ end }}
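{{/* "telegram.message" below is the entry point this file is expected to be called with (for example via the Alertmanager telegram_configs "message" option); "alert_list" is a helper that renders both firing and resolved alerts. */}}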
{{ define "telegram.message" }}
{{ if gt (len .Alerts.Firing) 0 }}
🔥 Alerts Firing 🔥
{{ template "alert_list" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
✅ Alerts Resolved ✅
{{ template "alert_list" .Alerts.Resolved }}
{{ end }}
{{ end }}

View file

@ -0,0 +1,3 @@
{
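  # stateVersion records the NixOS release this machine was first installed with; it pins stateful defaults and should stay at 25.05 across upgrades.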
system.stateVersion = "25.05";
}

nixos/flake.lock generated
View file

@ -1,8 +1,82 @@
{
"nodes": {
"colmena": {
"inputs": {
"flake-compat": "flake-compat",
"flake-utils": "flake-utils",
"nix-github-actions": "nix-github-actions",
"nixpkgs": "nixpkgs",
"stable": "stable"
},
"locked": {
"lastModified": 1751144689,
"narHash": "sha256-cgIntaqhcm62V1KU6GmrAGpHpahT4UExEWW2ryS02ZU=",
"owner": "zhaofengli",
"repo": "colmena",
"rev": "3ceec72cfb396a8a8de5fe96a9d75a9ce88cc18e",
"type": "github"
},
"original": {
"owner": "zhaofengli",
"repo": "colmena",
"type": "github"
}
},
"flake-compat": {
"flake": false,
"locked": {
"lastModified": 1650374568,
"narHash": "sha256-Z+s0J8/r907g149rllvwhb4pKi8Wam5ij0st8PwAh+E=",
"owner": "edolstra",
"repo": "flake-compat",
"rev": "b4a34015c698c7793d592d66adbab377907a2be8",
"type": "github"
},
"original": {
"owner": "edolstra",
"repo": "flake-compat",
"type": "github"
}
},
"flake-utils": {
"locked": {
"lastModified": 1659877975,
"narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nix-github-actions": {
"inputs": {
"nixpkgs": [
"colmena",
"nixpkgs"
]
},
"locked": {
"lastModified": 1729742964,
"narHash": "sha256-B4mzTcQ0FZHdpeWcpDYPERtyjJd/NIuaQ9+BV1h+MpA=",
"owner": "nix-community",
"repo": "nix-github-actions",
"rev": "e04df33f62cdcf93d73e9a04142464753a16db67",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "nix-github-actions",
"type": "github"
}
},
"nixarr": {
"inputs": {
"nixpkgs": "nixpkgs",
"nixpkgs": "nixpkgs_2",
"vpnconfinement": "vpnconfinement",
"website-builder": "website-builder"
},
@ -21,6 +95,22 @@
}
},
"nixpkgs": {
"locked": {
"lastModified": 1750134718,
"narHash": "sha256-v263g4GbxXv87hMXMCpjkIxd/viIF7p3JpJrwgKdNiI=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "9e83b64f727c88a7711a2c463a7b16eedb69a84c",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs_2": {
"locked": {
"lastModified": 1748662220,
"narHash": "sha256-7gGa49iB9nCnFk4h/g9zwjlQAyjtpgcFkODjcOQS0Es=",
@ -36,7 +126,7 @@
"type": "github"
}
},
"nixpkgs_2": {
"nixpkgs_3": {
"locked": {
"lastModified": 1748809735,
"narHash": "sha256-UR5vKj8rwKQmE8wxKFHgoJKbod05DMoH5phTje4L1l8=",
@ -53,8 +143,9 @@
},
"root": {
"inputs": {
"colmena": "colmena",
"nixarr": "nixarr",
"nixpkgs": "nixpkgs_2",
"nixpkgs": "nixpkgs_3",
"sops-nix": "sops-nix"
}
},
@ -78,6 +169,22 @@
"type": "github"
}
},
"stable": {
"locked": {
"lastModified": 1750133334,
"narHash": "sha256-urV51uWH7fVnhIvsZIELIYalMYsyr2FCalvlRTzqWRw=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "36ab78dab7da2e4e27911007033713bab534187b",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-25.05",
"repo": "nixpkgs",
"type": "github"
}
},
"vpnconfinement": {
"locked": {
"lastModified": 1743810720,

View file

@ -3,6 +3,7 @@
inputs = {
nixpkgs.url = "github:nixos/nixpkgs";
colmena.url = "github:zhaofengli/colmena";
sops-nix = {
url = "github:Mic92/sops-nix";
inputs.nixpkgs.follows = "nixpkgs";
@ -14,7 +15,11 @@
# };
};
-  outputs = inputs @ {...}: let
+  outputs = inputs @ {
+    nixpkgs,
+    colmena,
+    ...
+  }: let
system = "x86_64-linux";
liveVMs = {
@ -102,5 +107,32 @@
};
in {
nixosConfigurations = liveVMs;
colmenaHive = colmena.lib.makeHive {
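      # makeHive evaluates this attribute set as a Colmena hive: meta.nixpkgs pins the
      # package set used for every node, and each remaining attribute defines a node.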
meta = {
nixpkgs = import nixpkgs {
system = "x86_64-linux";
overlays = [];
};
      };
      defaults = {pkgs, ...}: {
      };
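      # host-b mirrors the example node from the Colmena documentation; targetHost,
      # targetPort and targetUser are placeholders and should point at a real machine
      # reachable over SSH before `colmena apply` is run against it.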
host-b = {
name,
nodes,
pkgs,
...
}: {
deployment = {
targetHost = "somehost.tld";
targetPort = 1234;
targetUser = "luser";
};
boot.isContainer = true;
time.timeZone = "America/Los_Angeles";
};
};
};
}

secrets/secrets.yaml Normal file
View file

@ -0,0 +1,21 @@
telegram-alert-bot-token: ENC[AES256_GCM,data:7Bvhtrkaqc06xmSeOsw730cAfHAw4qBvz1ontPXO/6j4Hy6AAKbb8LYGIONVGA==,iv:2xdhmAM2anEH7flV72BlfVeXjStu6sEUqT97PW+dY2w=,tag:h12zGj8J0ftKuGuKZuCEmw==,type:str]
alertmanager:
env: ENC[AES256_GCM,data:lMZVLGY4JNeEa1OhiQsAyBqArDttpMjAILjtFQfi7933RfckJMike9cWOV8pSVMJjKUBCtmnir/KDhbyYCZ3oYQh,iv:dLGqXsvJ8x32bqtcaq66O85HcbF/I78HSmo3o/Sx76o=,tag:SHv07/J8O+JPkpTu5rRCzA==,type:str]
influxdb:
password: ENC[AES256_GCM,data:OP+4vK6ulZs7jVM4lgnpUatr+Qs=,iv:MEmD6yyy+Z7beVOdR1xNDn0c27DYDIDTYdnaNiaVHks=,tag:dyG7VPPV40JqSE4UAeVbtQ==,type:str]
token: ENC[AES256_GCM,data:QraVWLW1uCSF0YvbkHCKYtPvqs0=,iv:pzkfEyLksjRFVj7wZS8LxO0idQTpEk7OTMpQSsuIRvQ=,tag:d6U6vMqEYbu3CaTpnc0gGw==,type:str]
sops:
age:
- recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
enc: |
-----BEGIN AGE ENCRYPTED FILE-----
YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBVa29TZnF4K2dtcEc2N1Qy
cmVTeWJpZXlYRWV6SG84M2NjVHhiRER4TFdFCjNCb1owUU5lL1BDOGt4ZXhad242
cFI2YWd0SnBCV2RXVUNlY2dKYWpUK1kKLS0tIHh6UmRkdlJqeWZHaTFYQ0M4L2xo
QzNYRk5ERmR4aGtLQ3dwQ1lPeDZyaEkKJMLXqv6tBBql7VVnWDIwAh24SfQ2O6Ca
CEOQTGEonbqr5doWqTsXUXrdQAS0amL45UdT6ITFtfNAjaHwCMfhZg==
-----END AGE ENCRYPTED FILE-----
lastmodified: "2025-07-06T17:10:59Z"
mac: ENC[AES256_GCM,data:dXLWT5fmSs2ddpFPXA1yOtwaej7b3lPesFxN7aEZ/bV6YRr+Ht5dHFQcXO0TfJArzhFRRtAumdcdVsorMkR4tao4XCcimACcWrZgXlXGM6XgT3hdPJ4006QLePXU+uyzpqyEuOouaxF7fyuSTL68uDr+E/NAHgmP2dnqpWnebpY=,iv:oUkmH/ngp8wvbuXay+2X6YBqhesNdtOPZOV4lvsc/s4=,tag:GErA9zgdkTarUD6fWiMupg==,type:str]
unencrypted_suffix: _unencrypted
version: 3.10.2