colmena initial implementation for sandbox and monitor

All checks were successful: Hello World / test (push), successful in 4s

parent a90630ecb6
commit 5feb74d56d

40 changed files with 27629 additions and 141 deletions
@@ -1,61 +0,0 @@
name: "Build NixOS Image"

on:
  workflow_dispatch:

jobs:
  build:
    runs-on: native
    steps:
      - name: Install nodejs
        run: nix-env -iA nixpkgs.nodejs

      - name: Checkout repo
        uses: actions/checkout@v4

      # - name: Install Nix
      #   uses: cachix/install-nix-action@v31
      #   with:
      #     nix_path: nixpkgs=channel:nixos-unstable
      #     extra_nix_config: |
      #       experimental-features = nix-command flakes

      # - name: Enable experimental features
      #   run: |
      #     mkdir -p ~/.config/nix
      #     echo "experimental-features = nix-command flakes" >> ~/.config/nix/nix.conf

      # - name: Update Channel
      #   run: nix-channel --update

      - name: Build NixOS image
        working-directory: nixos
        run: nix build .#proxmoxTemplate

      # - name: Upload & Restore to Proxmox
      #   working-directory: nixos
      #   env:
      #     PROXMOX_SSH_KEY: ${{ secrets.PROXMOX_SSH_KEY }}
      #     PROXMOX_HOST: 192.168.1.205
      #     PROXMOX_USER: root
      #   run: |
      #     set -e

      #     IMAGE_NAME="vm-image.vma.zst"
      #     REMOTE_PATH="/var/lib/vz/template/$IMAGE_NAME"
      #     VM_ID="9000"

      #     echo "Starting ssh-agent and uploading..."
      #     eval "$(ssh-agent -s)"
      #     ssh-add <(echo "$PROXMOX_SSH_KEY")

      #     echo "Uploading image..."
      #     scp -o StrictHostKeyChecking=no ./result/$IMAGE_NAME $PROXMOX_USER@$PROXMOX_HOST:$REMOTE_PATH

      #     echo "Restoring VM $VM_ID..."
      #     ssh -o StrictHostKeyChecking=no $PROXMOX_USER@$PROXMOX_HOST "
      #       qm stop $VM_ID || true
      #       qm destroy $VM_ID || true
      #       qmrestore --unique $REMOTE_PATH $VM_ID
      #       qm template $VM_ID
      #     "
53 .forgejo/workflows/colmena-apply.yml Normal file

@@ -0,0 +1,53 @@
name: "Colmena apply"
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "v*" # triggers on v1.0.0, v1.2.3, etc.
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
apply:
|
||||
name: Apply flake configurations to colmena hive
|
||||
# Ensure 'nixos-latest' runner has Docker, SSH client, and basic Nix tools installed.
|
||||
# It seems it already does.
|
||||
runs-on: nixos-latest
|
||||
env:
|
||||
NIXOS_BUILER_HOST: nixos-builder.lab
|
||||
NIXOS_BUILER_USER: runner
|
||||
|
||||
steps:
|
||||
# Use nix-env for setup (as you prefer and it works well for ephemeral environments)
|
||||
- name: Install dependencies via nix-env
|
||||
run: |
|
||||
nix-env -iA nixpkgs.nodejs
|
||||
nix-env -iA nixpkgs.openssh
|
||||
nix-env -if https://github.com/zhaofengli/colmena/tarball/main
|
||||
nix-env -iA cachix -f https://cachix.org/api/v1/install
|
||||
cachix use plasmagoat
|
||||
cachix authtoken ${{ secrets.CACHIX_AUTH_TOKEN }}
|
||||
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Enable experimental features
|
||||
run: |
|
||||
mkdir -p ~/.config/nix
|
||||
echo "experimental-features = nix-command flakes" >> ~/.config/nix/nix.conf
|
||||
|
||||
- name: Prepare SSH keys and known_hosts for builder and Proxmox
|
||||
run: |
|
||||
mkdir -p ~/.ssh
|
||||
echo "${{ secrets.RUNNER_SSH_KEY }}" > ~/.ssh/id_rsa
|
||||
chmod 600 ~/.ssh/id_rsa
|
||||
ssh-keyscan -H "$NIXOS_BUILER_HOST" >> ~/.ssh/known_hosts
|
||||
chmod 600 ~/.ssh/known_hosts
|
||||
|
||||
- name: Test SSH connection to NixOS Builder
|
||||
run: |
|
||||
echo "Testing SSH connection to $NIXOS_BUILER_HOST..."
|
||||
ssh -o StrictHostKeyChecking=yes "$NIXOS_BUILER_USER"@"$NIXOS_BUILER_HOST" "echo 'SSH success. Hostname:' && hostname"
|
||||
|
||||
- name: Apply Colmena
|
||||
id: apply
|
||||
run: colmena apply
|
||||
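Before pushing a `v*` tag, the same deployment can be exercised locally with the plain colmena CLI — a sketch (commands are standard colmena; the tag comes from hive.nix added in this commit):

    colmena build                # evaluate and build every node, no activation
    colmena apply --on @sandbox  # limit activation to the sandbox node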
@@ -1,39 +0,0 @@
name: Deploy NixOS VM

on:
  workflow_dispatch:

jobs:
  deploy:
    runs-on: docker
    container:
      image: nixos/nix
    steps:
      - name: Checkout repo
        uses: actions/checkout@v4

      - name: Install Terraform
        run: nix-env -iA nixpkgs.terraform

      - name: Setup SSH key
        run: |
          mkdir -p ~/.ssh
          echo "$SSH_PRIVATE_KEY" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
        env:
          SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Terraform Init & Apply
        run: |
          terraform init
          terraform apply -auto-approve
        working-directory: ./terraform
        env:
          PROXMOX_PASSWORD: ${{ secrets.PROXMOX_PASSWORD }}

      - name: Deploy NixOS via nixos-anywhere
        run: |
          nix run github:numtide/nixos-anywhere -- \
            --build-on-remote \
            --flake .#new-vm \
            root@<new-vm-ip>
@@ -1,34 +0,0 @@
name: Terraform Proxmox NixOS VM Deploy

on:
  workflow_dispatch:

jobs:
  deploy-nixos-vm:
    runs-on: nixos-latest
    steps:
      - name: Install nodejs
        run: nix-env -iA nixpkgs.nodejs

      - name: Install terraform
        run: nix-env -iA nixpkgs.terraform

      - name: Install sops
        run: nix-env -iA nixpkgs.sops

      - name: Checkout repo
        uses: actions/checkout@v3

      - name: Decrypt secrets
        env:
          SOPS_AGE_KEY_FILE: ${{ secrets.AGE_KEY_FILE }}
        run: |
          sops --decrypt secrets.yaml.enc > secrets.yaml

      - name: Terraform Init
        run: terraform init

      - name: Terraform Apply
        env:
          PROXMOX_PASSWORD: ${{ secrets.PROXMOX_PASSWORD }}
        run: terraform apply -auto-approve
3 .sops.yaml Normal file

@@ -0,0 +1,3 @@
creation_rules:
  - age: >-
      age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
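With this creation rule, sops encrypts matching files to the listed age recipient automatically. A sketch of the intended flow (paths follow the sops-nix settings later in this commit):

    # create or edit the encrypted secrets file; sops reads .sops.yaml for recipients
    sops secrets/secrets.yaml

    # each node decrypts at activation time with the age key at /etc/sops/age.key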
@@ -55,7 +55,7 @@
       register: qga_json
       failed_when: qga_json.rc != 0

-    - name: Parse out eth0’s IPv4 address
+    - name: Parse out eth0's IPv4 address
       ansible.builtin.set_fact:
         vm_ipv4: >-
           {{

@@ -71,11 +71,11 @@
           )
         }}

-    - name: Show the VM’s IP
+    - name: Show the VM's IP
       ansible.builtin.debug:
         msg: "VM {{ new_vmid }} ({{ new_name }}) reports IPv4: {{ vm_ipv4 }}"

-    - name: Add new VM’s IP to in-memory inventory (for later tasks)
+    - name: Add new VM's IP to in-memory inventory (for later tasks)
       ansible.builtin.add_host:
         name: "nixos-{{ new_vmid }}"
         ansible_host: "{{ vm_ipv4 }}"
154 flake.lock generated Normal file

@@ -0,0 +1,154 @@
{
  "nodes": {
    "colmena": {
      "inputs": {
        "flake-compat": "flake-compat",
        "flake-utils": "flake-utils",
        "nix-github-actions": "nix-github-actions",
        "nixpkgs": "nixpkgs",
        "stable": "stable"
      },
      "locked": {
        "lastModified": 1751144689,
        "narHash": "sha256-cgIntaqhcm62V1KU6GmrAGpHpahT4UExEWW2ryS02ZU=",
        "owner": "zhaofengli",
        "repo": "colmena",
        "rev": "3ceec72cfb396a8a8de5fe96a9d75a9ce88cc18e",
        "type": "github"
      },
      "original": {
        "owner": "zhaofengli",
        "repo": "colmena",
        "type": "github"
      }
    },
    "flake-compat": {
      "flake": false,
      "locked": {
        "lastModified": 1650374568,
        "narHash": "sha256-Z+s0J8/r907g149rllvwhb4pKi8Wam5ij0st8PwAh+E=",
        "owner": "edolstra",
        "repo": "flake-compat",
        "rev": "b4a34015c698c7793d592d66adbab377907a2be8",
        "type": "github"
      },
      "original": {
        "owner": "edolstra",
        "repo": "flake-compat",
        "type": "github"
      }
    },
    "flake-utils": {
      "locked": {
        "lastModified": 1659877975,
        "narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "nix-github-actions": {
      "inputs": {
        "nixpkgs": [
          "colmena",
          "nixpkgs"
        ]
      },
      "locked": {
        "lastModified": 1729742964,
        "narHash": "sha256-B4mzTcQ0FZHdpeWcpDYPERtyjJd/NIuaQ9+BV1h+MpA=",
        "owner": "nix-community",
        "repo": "nix-github-actions",
        "rev": "e04df33f62cdcf93d73e9a04142464753a16db67",
        "type": "github"
      },
      "original": {
        "owner": "nix-community",
        "repo": "nix-github-actions",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1750134718,
        "narHash": "sha256-v263g4GbxXv87hMXMCpjkIxd/viIF7p3JpJrwgKdNiI=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "9e83b64f727c88a7711a2c463a7b16eedb69a84c",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs_2": {
      "locked": {
        "lastModified": 1751801514,
        "narHash": "sha256-Ve3ZTzcXEGt4IoXLsWqk35w3w4cH5G1MJb+gLdj/jtE=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "4e3e6431fd60d653bb7f4fa5487e2c500d50f49f",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "colmena": "colmena",
        "nixpkgs": "nixpkgs_2",
        "sops-nix": "sops-nix"
      }
    },
    "sops-nix": {
      "inputs": {
        "nixpkgs": [
          "nixpkgs"
        ]
      },
      "locked": {
        "lastModified": 1751606940,
        "narHash": "sha256-KrDPXobG7DFKTOteqdSVeL1bMVitDcy7otpVZWDE6MA=",
        "owner": "Mic92",
        "repo": "sops-nix",
        "rev": "3633fc4acf03f43b260244d94c71e9e14a2f6e0d",
        "type": "github"
      },
      "original": {
        "owner": "Mic92",
        "repo": "sops-nix",
        "type": "github"
      }
    },
    "stable": {
      "locked": {
        "lastModified": 1750133334,
        "narHash": "sha256-urV51uWH7fVnhIvsZIELIYalMYsyr2FCalvlRTzqWRw=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "36ab78dab7da2e4e27911007033713bab534187b",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-25.05",
        "repo": "nixpkgs",
        "type": "github"
      }
    }
  },
  "root": "root",
  "version": 7
}
34 flake.nix Normal file

@@ -0,0 +1,34 @@
{
  description = "Declarative NixOS HomeLab";

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs";
    # systems.url = "github:nix-systems/default";
    sops-nix = {
      url = "github:Mic92/sops-nix";
      inputs.nixpkgs.follows = "nixpkgs";
    };
    # home-manager = {
    #   url = "home-manager";
    #   inputs.nixpkgs.follows = "nixpkgs";
    # };
    colmena.url = "github:zhaofengli/colmena";
  };

  outputs = {
    self,
    nixpkgs,
    # systems,
    sops-nix,
    # home-manager,
    colmena,
    ...
  } @ inputs: let
    overlays = [
      colmena.overlays.default
    ];
  in {
    colmenaHive = colmena.lib.makeHive self.outputs.colmena;
    colmena = (import ./hive.nix) (inputs // {inherit overlays;});
  };
}
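A note on the two outputs: `colmena` is the raw hive attrset, and `colmenaHive` is the pre-evaluated form that recent colmena CLI versions look for first. A sketch of standard invocations from the repo root (nothing repo-specific beyond the output names above):

    colmena build   # evaluate the hive and build every node's closure
    colmena apply   # build and activate on all reachable nodes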
48 hive.nix Normal file

@@ -0,0 +1,48 @@
inputs @ {
  self,
  nixpkgs,
  sops-nix,
  # home-manager,
  overlays,
  ...
}: {
  meta = {
    nixpkgs = import nixpkgs {
      system = "x86_64-linux";
    };
    specialArgs.flakeInputs = inputs;
  };

  defaults = {
    pkgs,
    lib,
    name,
    nodes,
    meta,
    config,
    ...
  }: {
    imports = [
      ./machines/_default
      ./machines/modules
      sops-nix.nixosModules.sops
      # home-manager.nixosModules.home-manager
    ];
    nixpkgs = {
      inherit overlays;
      system = lib.mkDefault "x86_64-linux";
      config.allowUnfree = true;
    };
    deployment.tags = [config.nixpkgs.system name];
  };

  sandbox = {name, ...}: {
    imports = [./machines/${name}/definition.nix];
    deployment.tags = ["sandbox"];
  };

  monitor = {name, ...}: {
    imports = [./machines/${name}/definition.nix];
    deployment.tags = ["grafana" "prometheus"];
  };
}
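Every node gets `deployment.tags` from `defaults` (its system and name) plus the per-node tags above, so deploys can be scoped with colmena's standard `@tag` selector — a sketch:

    colmena apply --on @sandbox      # only the sandbox node
    colmena apply --on @prometheus   # the monitor node, via one of its tags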
89 machines/_default/common_config.nix Normal file

@@ -0,0 +1,89 @@
{
  pkgs,
  lib,
  modulesPath,
  ...
}: {
  imports = [
    # Enables QEMU Guest Agent support in the VM
    (modulesPath + "/profiles/qemu-guest.nix")
  ];

  services.qemuGuest.enable = lib.mkDefault true;

  boot.loader.grub.enable = lib.mkDefault true;
  boot.loader.grub.devices = ["nodev"];

  boot.growPartition = lib.mkDefault true;

  boot.tmp.cleanOnBoot = true;

  fileSystems."/" = lib.mkDefault {
    device = "/dev/disk/by-label/nixos";
    autoResize = true; # grow on first boot
    fsType = "ext4";
  };

  nix = {
    gc.automatic = true;
    gc.options = "--delete-older-than 15d";
    gc.dates = "daily";
    optimise.automatic = true;
    settings = {
      experimental-features = ["nix-command" "flakes"];
      auto-optimise-store = true;
      allowed-users = ["@wheel"];
      trusted-users = ["root" "@wheel"];
    };
    extraOptions = ''
      keep-outputs = true
      keep-derivations = true
    '';
  };

  security.sudo.wheelNeedsPassword = false;

  users.users.plasmagoat = {
    isNormalUser = true;
    description = "plasmagoat";
    extraGroups = ["wheel" "docker"];
    # shell = pkgs.zsh;
    # shell = pkgs.fish;
  };

  services.openssh.enable = true;
  services.openssh.openFirewall = true;
  services.openssh.settings.PasswordAuthentication = false;
  services.openssh.settings.PermitRootLogin = "prohibit-password";
  services.openssh.settings.KbdInteractiveAuthentication = false;

  services.sshguard.enable = true;

  programs.ssh.startAgent = true;

  users.users.plasmagoat.openssh.authorizedKeys.keys = [
    "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCeg/n/vst9KME8byhxX2FhA+FZNQ60W38kkNt45eNzK5zFqBYuwo1nDXVanJSh9unRvB13b+ygpZhrb4sHvkETGWiEioc49MiWr8czEhu6Wpo0vv5MAJkiYvGZUYPdUW52jUzWcYdw8PukG2rowrxL5G0CmsqLwHMPU2FyeCe5aByFI/JZb8R80LoEacgjUiipJcoLWUVgG2koMomHClqGu+16kB8nL5Ja3Kc9lgLfDK7L0A5R8JXhCjrlEsmXbxZmwDKuxvjDAZdE9Sl1VZmMDfWkyrRlenrt01eR3t3Fec6ziRm5ZJk9e2Iu1DPoz+PoHH9aZGVwmlvvnr/gMF3OILxcqb0qx+AYlCCnb6D6pJ9zufhZkKcPRS1Q187F6fz+v2oD1xLZWFHJ92+7ItM0WmbDOHOC29s5EA6wNm3iXZCq86OI3n6T34njDtPqh6Z7Pk2sdK4GBwnFj4KwEWXvdKZKSX1qb2EVlEBE9QI4Gf3eg4SiBu2cAFt3nOSzs8c= asol\\dbs@ALPHA-DBS-P14sG2"
    "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC+U3DWOrklcA8n8wdbLBGyli5LsJI3dpL2Zod8mx8eOdC4H127ZT1hzuk2uSmkic4c73BykPyQv8rcqwaRGW94xdMRanKmHYxnbHXo5FBiGrCkNlNNZuahthAGO49c6sUhJMq0eLhYOoFWjtf15sr5Zu7Ug2YTUL3HXB1o9PZ3c9sqYHo2rC/Il1x2j3jNAMKST/qUZYySvdfNJEeQhMbQcdoKJsShcE3oGRL6DFBoV/mjJAJ+wuDhGLDnqi79nQjYfbYja1xKcrKX+D3MfkFxFl6ZIzomR1t75AnZ+09oaWcv1J7ehZ3h9PpDBFNXvzyLwDBMNS+UYcH6SyFjkUbF David@NZXT"
    "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICUP7m8jZJiclZGfSje8CeBYFhX10SrdtjYziuChmj1X plasmagoat@macbook-air"
  ];

  users.users.root.openssh.authorizedKeys.keys = [
    "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCeg/n/vst9KME8byhxX2FhA+FZNQ60W38kkNt45eNzK5zFqBYuwo1nDXVanJSh9unRvB13b+ygpZhrb4sHvkETGWiEioc49MiWr8czEhu6Wpo0vv5MAJkiYvGZUYPdUW52jUzWcYdw8PukG2rowrxL5G0CmsqLwHMPU2FyeCe5aByFI/JZb8R80LoEacgjUiipJcoLWUVgG2koMomHClqGu+16kB8nL5Ja3Kc9lgLfDK7L0A5R8JXhCjrlEsmXbxZmwDKuxvjDAZdE9Sl1VZmMDfWkyrRlenrt01eR3t3Fec6ziRm5ZJk9e2Iu1DPoz+PoHH9aZGVwmlvvnr/gMF3OILxcqb0qx+AYlCCnb6D6pJ9zufhZkKcPRS1Q187F6fz+v2oD1xLZWFHJ92+7ItM0WmbDOHOC29s5EA6wNm3iXZCq86OI3n6T34njDtPqh6Z7Pk2sdK4GBwnFj4KwEWXvdKZKSX1qb2EVlEBE9QI4Gf3eg4SiBu2cAFt3nOSzs8c= asol\\dbs@ALPHA-DBS-P14sG2"
    "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC+U3DWOrklcA8n8wdbLBGyli5LsJI3dpL2Zod8mx8eOdC4H127ZT1hzuk2uSmkic4c73BykPyQv8rcqwaRGW94xdMRanKmHYxnbHXo5FBiGrCkNlNNZuahthAGO49c6sUhJMq0eLhYOoFWjtf15sr5Zu7Ug2YTUL3HXB1o9PZ3c9sqYHo2rC/Il1x2j3jNAMKST/qUZYySvdfNJEeQhMbQcdoKJsShcE3oGRL6DFBoV/mjJAJ+wuDhGLDnqi79nQjYfbYja1xKcrKX+D3MfkFxFl6ZIzomR1t75AnZ+09oaWcv1J7ehZ3h9PpDBFNXvzyLwDBMNS+UYcH6SyFjkUbF David@NZXT"
    "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICUP7m8jZJiclZGfSje8CeBYFhX10SrdtjYziuChmj1X plasmagoat@macbook-air"
  ];

  environment.systemPackages = with pkgs; [
    dig
    nmap
    traceroute
    vim
    git
    curl
    python3
  ];

  time.timeZone = "Europe/Copenhagen";

  console.keyMap = "dk-latin1";
}
40 machines/_default/default.nix Normal file

@@ -0,0 +1,40 @@
{
  lib,
  name,
  ...
}: {
  imports = [
    ./common_config.nix
  ];

  networking.hostName = name;

  deployment = {
    replaceUnknownProfiles = lib.mkDefault true;
    buildOnTarget = lib.mkDefault false;
    targetHost = lib.mkDefault "${name}.lab";
    tags = lib.mkDefault ["homelab"];
  };

  sops = {
    age.keyFile = "/etc/sops/age.key";
    defaultSopsFile = ../../secrets/secrets.yml;
  };

  # home-manager = {
  #   useGlobalPkgs = true;
  #   useUserPackages = true;
  #   users.cottand = {
  #     imports = with flakeInputs.cottand.homeManagerModules; [cli];
  #     home.stateVersion = "22.11";
  #   };
  #   users.root = {
  #     imports = with flakeInputs.cottand.homeManagerModules; [cli];
  #     home.stateVersion = "22.11";
  #   };
  # };

  # consulNode.enable = lib.mkDefault true;
  nodeExporter.enable = lib.mkDefault true;
  journalLog.enable = lib.mkDefault true;
}
11 machines/modules/default.nix Normal file

@@ -0,0 +1,11 @@
{
  imports = [
    ./node-exporter.nix
    ./journal-log.nix
    # ./wireguard.nix
    # ./nomad.nix
    # ./vault.nix
    # ./vaultSecret.nix
    # ./consul.nix
  ];
}
95 machines/modules/journal-log.nix Normal file

@@ -0,0 +1,95 @@
{
  lib,
  config,
  nodes,
  # name,
  # meta,
  ...
}:
with lib; let
  cfg = config.journalLog;
in {
  options.journalLog = {
    enable = mkOption {
      type = types.bool;
      default = false;
    };

    port = mkOption {
      type = types.number;
      default = 9080;
    };

    clientUrl = mkOption {
      type = types.str;
      default = "http://monitor.lab:3100/loki/api/v1/push";
    };

    extraConfig = mkOption {
      type = types.attrs;
      default = {};
    };
  };

  config = mkIf cfg.enable {
    networking.firewall.allowedTCPPorts = [cfg.port];

    systemd.tmpfiles.rules = [
      "d /var/lib/promtail 0755 promtail promtail -"
    ];

    services.promtail = {
      enable = true;
      configuration = {
        server = {
          http_listen_port = cfg.port;
          grpc_listen_port = 0;
        };
        positions = {
          filename = "/var/lib/promtail/positions.yaml";
        };
        clients = [
          {
            url = cfg.clientUrl;
          }
        ];
        scrape_configs = [
          {
            job_name = "journal";
            journal = {
              path = "/var/log/journal";
              labels = {
                job = "promtail";
                host = config.networking.hostName;
                env = "proxmox";
                instance = "${config.networking.hostName}.lab";
              };
            };
            relabel_configs = [
              {
                source_labels = ["__journal__systemd_unit"];
                target_label = "unit";
              }
              {
                source_labels = ["__journal__hostname"];
                target_label = "host";
              }
              {
                source_labels = ["__journal__systemd_user_unit"];
                target_label = "user_unit";
              }
              {
                source_labels = ["__journal__transport"];
                target_label = "transport";
              }
              {
                source_labels = ["__journal_priority_keyword"];
                target_label = "severity";
              }
            ];
          }
        ];
      };
    };
  };
}
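A sketch of consuming this module from a node definition (the option names are declared above; the values shown are the module defaults):

    journalLog = {
      enable = true;
      port = 9080;
      clientUrl = "http://monitor.lab:3100/loki/api/v1/push";
    };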
40 machines/modules/node-exporter.nix Normal file

@@ -0,0 +1,40 @@
{
  lib,
  config,
  # name,
  # meta,
  ...
}:
with lib; let
  cfg = config.nodeExporter;
in {
  options.nodeExporter = {
    enable = mkOption {
      type = types.bool;
      default = false;
    };

    port = mkOption {
      type = types.number;
      default = 9100;
    };

    extraConfig = mkOption {
      type = types.attrs;
      default = {};
    };
  };

  config = mkIf cfg.enable {
    networking.firewall.allowedTCPPorts = [cfg.port];

    services.prometheus.exporters.node =
      {
        enable = true;
        enabledCollectors = ["systemd"];
        port = cfg.port;
        extraFlags = ["--collector.ethtool" "--collector.softirqs" "--collector.tcpstat" "--collector.wifi"];
      }
      // cfg.extraConfig;
  };
}
63 machines/monitor/alertmanager.nix Normal file

@@ -0,0 +1,63 @@
{
  config,
  pkgs,
  ...
}: let
  alertmanagerEnv = config.sops.secrets."alertmanager/env".path;
in {
  sops.secrets."alertmanager/env" = {
    sopsFile = ../../secrets/secrets.yaml;
    mode = "0440";
  };

  services.prometheus.alertmanager = {
    enable = true;
    openFirewall = true;
    environmentFile = alertmanagerEnv;

    webExternalUrl = "http://monitor.lab:9093"; # optional but helpful
    configuration = {
      route = {
        receiver = "null";
        group_by = ["alertname"];
        group_wait = "10s";
        group_interval = "5m";
        repeat_interval = "4h";

        routes = [
          {
            receiver = "telegram";
            matchers = [
              "severity =~ \"warning|critical\""
            ];
            group_wait = "10s";
            continue = true;
          }
        ];
      };

      receivers = [
        {name = "null";}
        {
          name = "telegram";
          telegram_configs = [
            {
              api_url = "https://api.telegram.org";
              bot_token = "$TELEGRAM_BOT_TOKEN";
              chat_id = -1002642560007;
              message_thread_id = 4;
              parse_mode = "HTML";
              send_resolved = true;
              message = "{{ template \"telegram.message\" . }}";
            }
          ];
        }
      ];

      templates = [
        (pkgs.writeText "telegram.tmpl" (builtins.readFile ./provisioning/templates/telegram.tmpl))
        # (pkgs.writeText "telegram.markdown.v2.tmpl" (builtins.readFile ./provisioning/templates/telegram.markdown.v2.tmpl))
      ];
    };
  };
}
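Because `bot_token` is read as `$TELEGRAM_BOT_TOKEN`, the sops-managed `environmentFile` must define that variable; the token itself never enters the Nix store. A sketch of the decrypted secret's shape (the value is a placeholder):

    # alertmanager/env
    TELEGRAM_BOT_TOKEN=123456789:AAE-hypothetical-token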
2353 machines/monitor/dashboards/15356_rev14.json Normal file
File diff suppressed because it is too large

1082 machines/monitor/dashboards/gitea.json Normal file
File diff suppressed because it is too large

692 machines/monitor/dashboards/grafana-traefik.json Normal file

@@ -0,0 +1,692 @@
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "datasource",
          "uid": "grafana"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "description": "Traefik dashboard prometheus",
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
  "id": 2,
  "links": [],
  "panels": [
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 0
      },
      "id": 10,
      "panels": [],
      "title": "$backend stats",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            }
          },
          "decimals": 0,
          "mappings": [],
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 1
      },
      "id": 2,
      "maxDataPoints": 3,
      "options": {
        "displayLabels": [],
        "legend": {
          "calcs": [],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true,
          "values": ["value", "percent"]
        },
        "pieType": "pie",
        "reduceOptions": {
          "calcs": ["lastNotNull"],
          "fields": "",
          "values": false
        },
        "text": {},
        "tooltip": {
          "hideZeros": false,
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "exemplar": true,
          "expr": "traefik_service_requests_total{service=\"$service\"}",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{method}} : {{code}}",
          "refId": "A"
        }
      ],
      "title": "$service return code",
      "type": "piechart"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "fixedColor": "rgb(31, 120, 193)",
            "mode": "fixed"
          },
          "mappings": [
            {
              "options": {
                "match": "null",
                "result": {
                  "text": "N/A"
                }
              },
              "type": "special"
            }
          ],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "ms"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 1
      },
      "id": 4,
      "maxDataPoints": 100,
      "options": {
        "colorMode": "none",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "horizontal",
        "percentChangeColorMode": "standard",
        "reduceOptions": {
          "calcs": ["mean"],
          "fields": "",
          "values": false
        },
        "showPercentChange": false,
        "textMode": "auto",
        "wideLayout": true
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "exemplar": true,
          "expr": "sum(traefik_service_request_duration_seconds_sum{service=\"$service\"}) / sum(traefik_service_requests_total{service=\"$service\"}) * 1000",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "",
          "refId": "A"
        }
      ],
      "title": "$service response time",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "barWidthFactor": 0.6,
            "drawStyle": "bars",
            "fillOpacity": 100,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 24,
        "x": 0,
        "y": 8
      },
      "id": 3,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": ["mean", "max", "min"],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true
        },
        "tooltip": {
          "hideZeros": false,
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "exemplar": true,
          "expr": "sum(rate(traefik_service_requests_total{service=\"$service\"}[5m]))",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "Total requests $service",
          "refId": "A"
        }
      ],
      "title": "Total requests over 5min $service",
      "type": "timeseries"
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 15
      },
      "id": 12,
      "panels": [],
      "title": "Global stats",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "barWidthFactor": 0.6,
            "drawStyle": "bars",
            "fillOpacity": 100,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 16
      },
      "id": 5,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": ["lastNotNull", "max", "min"],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true
        },
        "tooltip": {
          "hideZeros": false,
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code=\"200\"}[5m])",
          "format": "time_series",
          "intervalFactor": 2,
          "legendFormat": "{{method}} : {{code}}",
          "refId": "A"
        }
      ],
      "title": "Status code 200 over 5min",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "barWidthFactor": 0.6,
            "drawStyle": "bars",
            "fillOpacity": 100,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 16
      },
      "id": 6,
      "options": {
        "alertThreshold": true,
        "legend": {
          "calcs": ["lastNotNull", "max", "min"],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true
        },
        "tooltip": {
          "hideZeros": false,
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code!=\"200\"}[5m])",
          "format": "time_series",
          "intervalFactor": 2,
          "legendFormat": "{{ method }} : {{code}}",
          "refId": "A"
        }
      ],
      "title": "Others status code over 5min",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            }
          },
          "decimals": 0,
          "mappings": [],
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 23
      },
      "id": 7,
      "maxDataPoints": 3,
      "options": {
        "displayLabels": [],
        "legend": {
          "calcs": [],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true,
          "values": ["value"]
        },
        "pieType": "pie",
        "reduceOptions": {
          "calcs": ["sum"],
          "fields": "",
          "values": false
        },
        "text": {},
        "tooltip": {
          "hideZeros": false,
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "exemplar": true,
          "expr": "sum(rate(traefik_service_requests_total[5m])) by (service) ",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ service }}",
          "refId": "A"
        }
      ],
      "title": "Requests by service",
      "type": "piechart"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "prometheus"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            }
          },
          "decimals": 0,
          "mappings": [],
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 12,
        "y": 23
      },
      "id": 8,
      "maxDataPoints": 3,
      "options": {
        "displayLabels": [],
        "legend": {
          "calcs": [],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true,
          "values": ["value"]
        },
        "pieType": "pie",
        "reduceOptions": {
          "calcs": ["sum"],
          "fields": "",
          "values": false
        },
        "text": {},
        "tooltip": {
          "hideZeros": false,
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "12.0.0+security-01",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "exemplar": true,
          "expr": "sum(rate(traefik_entrypoint_requests_total{entrypoint =~ \"$entrypoint\"}[5m])) by (entrypoint) ",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 2,
          "legendFormat": "{{ entrypoint }}",
          "refId": "A"
        }
      ],
      "title": "Requests by protocol",
      "type": "piechart"
    }
  ],
  "preload": false,
  "schemaVersion": 41,
  "tags": ["traefik", "prometheus"],
  "templating": {
    "list": [
      {
        "current": {},
        "datasource": "Prometheus",
        "definition": "label_values({job=\"traefik\"},service)",
        "includeAll": false,
        "name": "service",
        "options": [],
        "query": {
          "qryType": 1,
          "query": "label_values({job=\"traefik\"},service)",
          "refId": "PrometheusVariableQueryEditor-VariableQuery"
        },
        "refresh": 1,
        "regex": "",
        "type": "query"
      },
      {
        "current": {},
        "datasource": "Prometheus",
        "definition": "",
        "includeAll": true,
        "multi": true,
        "name": "entrypoint",
        "options": [],
        "query": {
          "query": "label_values(entrypoint)",
          "refId": "Prometheus-entrypoint-Variable-Query"
        },
        "refresh": 1,
        "regex": "",
        "type": "query"
      }
    ]
  },
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "timepicker": {},
  "timezone": "",
  "title": "Traefik",
  "uid": "qPdAviJmz",
  "version": 1
}
13554 machines/monitor/dashboards/node-exporter.json Normal file
File diff suppressed because it is too large

3096 machines/monitor/dashboards/postgres.json Normal file
File diff suppressed because it is too large

2043 machines/monitor/dashboards/promtail.json Normal file
File diff suppressed because it is too large

1087 machines/monitor/dashboards/traefik-access.json Normal file
File diff suppressed because it is too large

1619 machines/monitor/dashboards/traefik.json Normal file
File diff suppressed because it is too large

13 machines/monitor/definition.nix Normal file

@@ -0,0 +1,13 @@
{
  imports = [
    ./alertmanager.nix
    ./prometheus.nix
    ./influxdb.nix
    ./loki.nix
    ./grafana.nix

    ./jellyfin-exporter.nix
  ];

  system.stateVersion = "25.05";
}
126 machines/monitor/grafana.nix Normal file

@@ -0,0 +1,126 @@
{
  config,
  pkgs,
  modulesPath,
  lib,
  ...
}: {
  services.grafana.enable = true;
  services.grafana.settings = {
    server = {
      http_port = 3000;
      http_addr = "0.0.0.0";
      # Grafana needs to know on which domain and URL it's running
      domain = "grafana.procopius.dk";
      root_url = "https://grafana.procopius.dk"; # Not needed if it is `https://your.domain/`
      # serve_from_sub_path = true;
      oauth_auto_login = false;
    };
    "auth.generic_oauth" = {
      enabled = false;
    };
    "auth" = {
      disable_login_form = false;
    };
  };

  networking.firewall.allowedTCPPorts = [3000];

  services.grafana = {
    # declarativePlugins = with pkgs.grafanaPlugins; [ ... ];

    provision = {
      enable = true;

      datasources.settings.datasources = [
        # "Built-in" datasources can be provisioned - c.f. https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources
        {
          uid = "prometheus";
          name = "Prometheus";
          type = "prometheus";
          url = "http://127.0.0.1:${toString config.services.prometheus.port}";
        }
        {
          uid = "loki";
          name = "Loki";
          type = "loki";
          url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}";
        }
        {
          uid = "influxdb";
          name = "InfluxDB";
          type = "influxdb";
          url = "http://127.0.0.1:8086";
          access = "proxy";
          jsonData = {
            dbName = "proxmox";
            httpHeaderName1 = "Authorization";
          };
          secureJsonData = {
            httpHeaderValue1 = "Token iY4MTuqUAVJbBkDUiMde";
          };
        }
      ];

      # Note: removing attributes from the above `datasources.settings.datasources` is not enough for them to be deleted on `grafana`;
      # one needs to use the following option:
      # datasources.settings.deleteDatasources = [ { name = "prometheus"; orgId = 1; } { name = "loki"; orgId = 1; } ];

      dashboards.settings.providers = [
        {
          name = "my dashboards";
          options.path = "/etc/grafana-dashboards";
        }
      ];
    };
  };

  environment.etc."grafana-dashboards/traefik.json" = {
    source = ./dashboards/traefik.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };

  environment.etc."grafana-dashboards/traefik-access.json" = {
    source = ./dashboards/traefik-access.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };

  environment.etc."grafana-dashboards/grafana-traefik.json" = {
    source = ./dashboards/grafana-traefik.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };

  environment.etc."grafana-dashboards/node-exporter.json" = {
    source = ./dashboards/node-exporter.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };

  environment.etc."grafana-dashboards/promtail.json" = {
    source = ./dashboards/promtail.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };

  environment.etc."grafana-dashboards/gitea.json" = {
    source = ./dashboards/gitea.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };

  environment.etc."grafana-dashboards/postgres.json" = {
    source = ./dashboards/postgres.json;
    user = "grafana";
    group = "grafana";
    mode = "0644";
  };
}
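The seven `environment.etc` blocks above differ only in the dashboard name, so they could be generated in one expression — a sketch, not part of this commit, over the same file list:

    environment.etc = lib.listToAttrs (map (name: {
      name = "grafana-dashboards/${name}.json";
      value = {
        source = ./dashboards + "/${name}.json";
        user = "grafana";
        group = "grafana";
        mode = "0644";
      };
    }) ["traefik" "traefik-access" "grafana-traefik" "node-exporter" "promtail" "gitea" "postgres"]);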
35 machines/monitor/influxdb.nix Normal file

@@ -0,0 +1,35 @@
{
  config,
  pkgs,
  ...
}: let
  influxdbPassword = config.sops.secrets."influxdb/password".path;
  influxdbToken = config.sops.secrets."influxdb/token".path;
in {
  sops.secrets."influxdb/password" = {
    sopsFile = ../../secrets/secrets.yaml;
    owner = "influxdb2";
  };
  sops.secrets."influxdb/token" = {
    sopsFile = ../../secrets/secrets.yaml;
    owner = "influxdb2";
  };

  networking.firewall.allowedTCPPorts = [8086];

  services.influxdb2 = {
    enable = true;
    settings = {
    };
    provision = {
      enable = true;
      initialSetup = {
        username = "plasmagoat";
        passwordFile = influxdbPassword;
        tokenFile = influxdbToken;
        organization = "procopius";
        bucket = "proxmox";
      };
    };
  };
}
14 machines/monitor/jellyfin-exporter.nix Normal file

@@ -0,0 +1,14 @@
{
  virtualisation.oci-containers.containers = {
    jellyfin_exporter = {
      image = "rebelcore/jellyfin-exporter:latest";
      ports = [
        "9594:9594"
      ];
      cmd = [
        "--jellyfin.address=http://media.lab:8096"
        "--jellyfin.token=f7c89e5aa307434c9b3ecb329e896335"
      ];
    };
  };
}
37 machines/monitor/loki.nix Normal file

@@ -0,0 +1,37 @@
{
  networking.firewall.allowedTCPPorts = [ 3100 ];

  services.loki = {
    enable = true;
    configuration = {
      server.http_listen_port = 3100;
      auth_enabled = false;
      analytics.reporting_enabled = false;

      common = {
        ring = {
          instance_addr = "127.0.0.1";
          kvstore.store = "inmemory";
        };
        replication_factor = 1;
        path_prefix = "/tmp/loki";
      };

      schema_config = {
        configs = [
          {
            from = "2020-05-15";
            store = "tsdb";
            object_store = "filesystem";
            schema = "v13";
            index = {
              prefix = "index_";
              period = "24h";
            };
          }
        ];
      };
      storage_config.filesystem.directory = "/var/lib/loki/chunk";
    };
  };
}
185 machines/monitor/prometheus.nix Normal file

@@ -0,0 +1,185 @@
{
  config,
  pkgs,
  modulesPath,
  lib,
  ...
}: let
  monitor_hostname = "monitor.lab";
  traefik_hostname = "traefik.lab";
  sandbox_hostname = "sandbox.lab";
  forgejo_hostname = "forgejo.lab";
  runner01_hostname = "forgejo-runner-01.lab";
  dnsmasq_hostname = "dns.lab";
  media_hostname = "media.lab";
  mail_hostname = "mail.lab";
  keycloak_hostname = "keycloak.lab";

  monitored_hosts = [
    monitor_hostname
    traefik_hostname
    sandbox_hostname
    forgejo_hostname
    runner01_hostname
    dnsmasq_hostname
    media_hostname
    mail_hostname
    keycloak_hostname
  ];

  # integrate colmena names and targetHost to generate node exporters
  generateTargets = port:
    map (host: "${host}:${toString port}") monitored_hosts;

  instance_relabel_config = [
    {
      source_labels = ["__address__"];
      regex = "([^:]+):\\d+"; # Captures everything before the last colon
      target_label = "instance";
      replacement = "$1";
    }
  ];

  node_exporter_port = 9100;
  node_exporter_job = {
    job_name = "node";
    static_configs = [{targets = generateTargets node_exporter_port;}];
    relabel_configs = instance_relabel_config;
  };

  promtail_port = 9080;
  promtail_job = {
    job_name = "promtail";
    static_configs = [{targets = generateTargets promtail_port;}];
    relabel_configs = instance_relabel_config;
  };

  prometheus_target = "${monitor_hostname}:9090";
  prometheus_job = {
    job_name = "prometheus";
    static_configs = [{targets = [prometheus_target];}];
    relabel_configs = instance_relabel_config;
  };

  alertmanager_target = "${monitor_hostname}:9093";
  alertmanager_job = {
    job_name = "alertmanager";
    static_configs = [{targets = [alertmanager_target];}];
    relabel_configs = instance_relabel_config;
  };

  grafana_target = "${monitor_hostname}:3000";
  grafana_job = {
    job_name = "grafana";
    static_configs = [{targets = [grafana_target];}];
    relabel_configs = instance_relabel_config;
  };

  traefik_monitor_port = 8082;
  traefik_job = {
    job_name = "traefik";
    static_configs = [{targets = ["${traefik_hostname}:${toString traefik_monitor_port}"];}];
    relabel_configs = instance_relabel_config;
  };

  forgejo_monitor_port = 3000;
  forgejo_job = {
    job_name = "forgejo";
    static_configs = [{targets = ["${forgejo_hostname}:${toString forgejo_monitor_port}"];}];
    relabel_configs = instance_relabel_config;
  };

  postgres_exporter_port = 9187;
  postgres_job = {
    job_name = "postgres";
    static_configs = [{targets = ["${forgejo_hostname}:${toString postgres_exporter_port}"];}];
    relabel_configs = instance_relabel_config;
  };

  dnsmasq_exporter_port = 9153;
  dnsmasq_job = {
    job_name = "dnsmasq";
    static_configs = [{targets = ["${dnsmasq_hostname}:${toString dnsmasq_exporter_port}"];}];
    relabel_configs = instance_relabel_config;
  };

  # --- Media Stack Scrape Job ---
  media_stack_job = {
    job_name = "media_stack";
    static_configs = [
      {
        targets = [
          "${media_hostname}:9707" # sonarr
          "${media_hostname}:9708" # readarr
          "${media_hostname}:9709" # radarr
          "${media_hostname}:9710" # prowlarr
          "${media_hostname}:9711" # lidarr
          "${media_hostname}:9712" # bazarr
        ];
      }
    ];
    relabel_configs = instance_relabel_config;
  };

  jellyfin_port = 8096;
  jellyfin_exporter_port = 9594;
  jellyfin_job = {
    job_name = "jellyfin";
    static_configs = [
      {
        targets = [
          "${media_hostname}:${toString jellyfin_port}"
          "${monitor_hostname}:${toString jellyfin_exporter_port}"
        ];
      }
    ];
    relabel_configs = instance_relabel_config;
  };
in {
  networking.firewall.allowedTCPPorts = [9090];

  services.prometheus = {
    enable = true;
    retentionTime = "7d";
    globalConfig = {
      scrape_timeout = "10s";
      scrape_interval = "30s";
      # A short evaluation_interval will check alerting rules very often.
      # It can be costly if you run Prometheus with 100+ alerts.
      evaluation_interval = "20s";
    };
    extraFlags = [
      "--web.enable-admin-api"
    ];

    scrapeConfigs = [
      node_exporter_job
      promtail_job
      prometheus_job
      alertmanager_job
      grafana_job
      traefik_job
      forgejo_job
      postgres_job
      dnsmasq_job
      media_stack_job
      jellyfin_job
    ];

    alertmanagers = [
      {
        scheme = "http";
        static_configs = [{targets = [alertmanager_target];}];
      }
    ];

    ruleFiles = [
      (pkgs.writeText "prometheus-alerts.yml" (builtins.readFile ./provisioning/alerts/prometheus-alerts.yml))
      (pkgs.writeText "loki-alerts.yml" (builtins.readFile ./provisioning/alerts/loki-alerts.yml))
      (pkgs.writeText "promtail-alerts.yml" (builtins.readFile ./provisioning/alerts/promtail-alerts.yml))
      (pkgs.writeText "postgres-alerts.yml" (builtins.readFile ./provisioning/alerts/postgres-alerts.yml))
      (pkgs.writeText "traefik-alerts.yml" (builtins.readFile ./provisioning/alerts/traefik-alerts.yml))
      (pkgs.writeText "node-exporter-alerts.yml" (builtins.readFile ./provisioning/alerts/node-exporter-alerts.yml))
    ];
  };
}
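For clarity on `generateTargets`: it appends the given port to every entry of `monitored_hosts`, so the node job above scrapes exactly this list (derived directly from the definitions in the file):

    [ "monitor.lab:9100" "traefik.lab:9100" "sandbox.lab:9100"
      "forgejo.lab:9100" "forgejo-runner-01.lab:9100" "dns.lab:9100"
      "media.lab:9100" "mail.lab:9100" "keycloak.lab:9100" ]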
39 machines/monitor/provisioning/alerts/loki-alerts.yml Normal file

@@ -0,0 +1,39 @@
groups:
  - name: Loki

    rules:
      - alert: LokiProcessTooManyRestarts
        expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Loki process too many restarts (instance {{ $labels.instance }})
          description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: LokiRequestErrors
        expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: Loki request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: LokiRequestPanic
        expr: "sum(increase(loki_panic_total[10m])) by (namespace, job) > 0"
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request panic (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: LokiRequestLatency
        expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
318 machines/monitor/provisioning/alerts/node-exporter-alerts.yml Normal file

@@ -0,0 +1,318 @@
groups:
  - name: NodeExporter

    rules:
      - alert: HostOutOfMemory
        expr: "(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory (instance {{ $labels.instance }})
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostMemoryUnderMemoryPressure
        expr: "(rate(node_vmstat_pgmajfault[5m]) > 1000)"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostMemoryIsUnderutilized
        expr: "min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8"
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host Memory is underutilized (instance {{ $labels.instance }})
          description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualNetworkThroughputIn
        expr: "((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualNetworkThroughputOut
        expr: "((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
          description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskReadRate
        expr: "(rate(node_disk_io_time_seconds_total[5m]) > .80)"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
          description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostOutOfDiskSpace
        expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostDiskMayFillIn24Hours
        expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostOutOfInodes
        expr: "(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)"
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host out of inodes (instance {{ $labels.instance }})
          description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostFilesystemDeviceError
        expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host filesystem device error (instance {{ $labels.instance }})
          description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostInodesMayFillIn24Hours
        expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadLatency
|
||||
expr: "(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteLatency
|
||||
expr: "(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostHighCpuLoad
|
||||
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuIsUnderutilized
|
||||
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.95'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuHighIowait
|
||||
expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
||||
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskIo
|
||||
expr: "rate(node_disk_io_time_seconds_total[5m]) > 0.8"
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostContextSwitchingHigh
|
||||
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host context switching high (instance {{ $labels.instance }})
|
||||
description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: "((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSystemdServiceCrashed
|
||||
expr: '(node_systemd_unit_state{state="failed"} == 1)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host systemd service crashed (instance {{ $labels.instance }})
|
||||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: "node_hwmon_temp_celsius > node_hwmon_temp_max_celsius"
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: "((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSoftwareRaidInsufficientDrives
|
||||
expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
|
||||
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSoftwareRaidDiskFailure
|
||||
expr: '(node_md_disks{state="failed"} > 0)'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host software RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostKernelVersionDeviations
|
||||
expr: "changes(node_uname_info[1h]) > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host kernel version deviations (instance {{ $labels.instance }})
|
||||
description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: "(increase(node_vmstat_oom_kill[1m]) > 0)"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacCorrectableErrorsDetected
|
||||
expr: "(increase(node_edac_correctable_errors_total[1m]) > 0)"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacUncorrectableErrorsDetected
|
||||
expr: "(node_edac_uncorrectable_errors_total > 0)"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: "(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Receive Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: "(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkBondDegraded
|
||||
expr: "((node_bonding_active - node_bonding_slaves) != 0)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostConntrackLimit
|
||||
expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)"
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockSkew
|
||||
expr: "((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))"
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock skew (instance {{ $labels.instance }})
|
||||
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: "(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)"
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
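Several of these rules (HostSystemdServiceCrashed, the hwmon temperature alerts, the MD RAID alerts) depend on specific node_exporter collectors being available on every target host. A hedged sketch of the per-host exporter configuration; the systemd collector is not in node_exporter's default set:

# Sketch: node_exporter config each target host would need for the
# rules above; the systemd collector is not enabled by default.
{
  services.prometheus.exporters.node = {
    enable = true;
    enabledCollectors = [ "systemd" ];  # exposes node_systemd_unit_state
  };
}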
201
machines/monitor/provisioning/alerts/postgres-alerts.yml
Normal file
@ -0,0 +1,201 @@
groups:
  - name: Postgres

    rules:
      - alert: PostgresqlDown
        expr: "pg_up == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql down (instance {{ $labels.instance }})
          description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlRestarted
        expr: "time() - pg_postmaster_start_time_seconds < 60"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql restarted (instance {{ $labels.instance }})
          description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlExporterError
        expr: "pg_exporter_last_scrape_error > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql exporter error (instance {{ $labels.instance }})
          description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTableNotAutoVacuumed
        expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
          description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTableNotAutoAnalyzed
        expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
          description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyConnections
        expr: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql too many connections (instance {{ $labels.instance }})
          description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlNotEnoughConnections
        expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 2'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql not enough connections (instance {{ $labels.instance }})
          description: "PostgreSQL instance should have more connections (> 2)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlDeadLocks
        expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql dead locks (instance {{ $labels.instance }})
          description: "PostgreSQL has dead locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRollbackRate
        expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql high rollback rate (instance {{ $labels.instance }})
          description: "The ratio of aborted to committed transactions is > 2%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlCommitRateLow
        expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Postgresql commit rate low (instance {{ $labels.instance }})
          description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlLowXidConsumption
        expr: "rate(pg_txid_current[1m]) < 5"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql low XID consumption (instance {{ $labels.instance }})
          description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRateStatementTimeout
        expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
          description: "Postgres transactions are showing a high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRateDeadlock
        expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
          description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlUnusedReplicationSlot
        expr: "pg_replication_slots_active == 0"
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Postgresql unused replication slot (instance {{ $labels.instance }})
          description: "Unused replication slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyDeadTuples
        expr: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
          description: "The PostgreSQL dead tuple ratio is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlConfigurationChanged
        expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Postgresql configuration changed (instance {{ $labels.instance }})
          description: "Postgres database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlSslCompressionActive
        expr: "sum(pg_stat_ssl_compression) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql SSL compression active (instance {{ $labels.instance }})
          description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyLocksAcquired
        expr: "((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20"
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
          description: "Too many locks acquired on the database. If this alert happens frequently, you may need to increase the Postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlBloatIndexHigh
        expr: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
          description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlBloatTableHigh
        expr: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
          description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlInvalidIndex
        expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: Postgresql invalid index (instance {{ $labels.instance }})
          description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlReplicationLag
        expr: "pg_replication_lag_seconds > 5"
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: Postgresql replication lag (instance {{ $labels.instance }})
          description: "PostgreSQL replication lag is high (> 5s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
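Note that pg_bloat_*, pg_replication_lag_seconds and postgresql_errors_total are not default postgres_exporter metrics; they require custom queries on the exporter side. A minimal sketch of the baseline exporter on the database host, with the custom-query part left as an assumption:

# Sketch: baseline postgres_exporter; the pg_bloat_* and
# postgresql_errors_total series used above additionally require
# custom queries (not shown, deployment-specific).
{
  services.prometheus.exporters.postgres = {
    enable = true;
    runAsLocalSuperUser = true;  # lets the exporter reach all databases
  };
}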
255
machines/monitor/provisioning/alerts/prometheus-alerts.yml
Normal file
@ -0,0 +1,255 @@
groups:
  - name: Prometheus

    rules:
      - alert: PrometheusJobMissing
        expr: 'absent(up{job="prometheus"})'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus job missing (instance {{ $labels.instance }})
          description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetMissing
        expr: "up == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing (instance {{ $labels.instance }})
          description: "A Prometheus target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAllTargetsMissing
        expr: "sum by (job) (up) == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus all targets missing (instance {{ $labels.instance }})
          description: "A Prometheus job no longer has any living targets.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetMissingWithWarmupTime
        expr: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
          description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusConfigurationReloadFailure
        expr: "prometheus_config_last_reload_successful != 1"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
          description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTooManyRestarts
        expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus too many restarts (instance {{ $labels.instance }})
          description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerJobMissing
        expr: 'absent(up{job="alertmanager"})'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
          description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerConfigurationReloadFailure
        expr: "alertmanager_config_last_reload_successful != 1"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
          description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerConfigNotSynced
        expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
          description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerE2eDeadManSwitch
        expr: "vector(1)"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
          description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusNotConnectedToAlertmanager
        expr: "prometheus_notifications_alertmanagers_discovered < 1"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
          description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusRuleEvaluationFailures
        expr: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTemplateTextExpansionFailures
        expr: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusRuleEvaluationSlow
        expr: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
          description: "Prometheus rule evaluation took more time than the scheduled interval. This indicates slow storage backend access or an overly complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusNotificationsBacklog
        expr: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus notifications backlog (instance {{ $labels.instance }})
          description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerNotificationFailing
        expr: "rate(alertmanager_notifications_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
          description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetEmpty
        expr: "prometheus_sd_discovered_targets == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target empty (instance {{ $labels.instance }})
          description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetScrapingSlow
        expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
          description: "Prometheus is scraping exporters more slowly than the requested interval; the server may be under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusLargeScrape
        expr: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus large scrape (instance {{ $labels.instance }})
          description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetScrapeDuplicate
        expr: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
          description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCheckpointCreationFailures
        expr: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCheckpointDeletionFailures
        expr: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCompactionsFailed
        expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB compaction failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbHeadTruncationsFailed
        expr: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbReloadFailures
        expr: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbWalCorruptions
        expr: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbWalTruncationsFailed
        expr: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTimeseriesCardinality
        expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
          description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
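PrometheusAlertmanagerE2eDeadManSwitch fires permanently by design, so it only makes sense if Alertmanager routes it to a dedicated heartbeat receiver instead of the normal notification channel. A hedged sketch of that route; the receiver name is a placeholder and its actual endpoint is not shown:

# Sketch: route the always-firing dead man's switch to a heartbeat
# receiver ("deadman" is a placeholder; its webhook is not shown).
{
  services.prometheus.alertmanager.configuration.route.routes = [
    {
      receiver = "deadman";
      matchers = [ ''alertname="PrometheusAlertmanagerE2eDeadManSwitch"'' ];
      repeat_interval = "1m";
    }
  ];
}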
21
machines/monitor/provisioning/alerts/promtail-alerts.yml
Normal file
@ -0,0 +1,21 @@
groups:
  - name: Promtail

    rules:
      - alert: PromtailRequestErrors
        expr: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Promtail request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PromtailRequestLatency
        expr: "histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1"
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Promtail request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
30
machines/monitor/provisioning/alerts/traefik-alerts.yml
Normal file
@ -0,0 +1,30 @@
groups:
  - name: Traefik

    rules:
      - alert: TraefikServiceDown
        expr: "count(traefik_service_server_up) by (service) == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Traefik service down (instance {{ $labels.instance }})
          description: "All Traefik services are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: TraefikHighHttp4xxErrorRateService
        expr: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Traefik high HTTP 4xx error rate service (instance {{ $labels.instance }})
          description: "Traefik service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: TraefikHighHttp5xxErrorRateService
        expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
          description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
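All six rule files above can be checked before deployment. A sketch of a build-time validation derivation using promtool; the derivation name and the relative path are assumptions about where this expression would live:

# Sketch: fail the build if any provisioned rule file is invalid.
{ pkgs, ... }:
pkgs.runCommand "check-alert-rules"
  { nativeBuildInputs = [ pkgs.prometheus ]; } ''
  # promtool ships in the prometheus package; check every rule file.
  promtool check rules ${./machines/monitor/provisioning/alerts}/*.yml
  touch $out
''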
29
machines/monitor/provisioning/templates/telegram.tmpl
Normal file
@ -0,0 +1,29 @@
{{ define "alert_list" }}{{ range . }}
---
🪪 <b>{{ .Labels.alertname | html }}</b>
{{- if eq .Labels.severity "critical" }}
🚨 CRITICAL 🚨 {{ end }}
{{- if eq .Labels.severity "warning" }}
⚠️ WARNING ⚠️{{ end }}
{{- if .Annotations.summary }}
📝 {{ .Annotations.summary | html }}{{ end }}
{{- if .Annotations.description }}
📖 {{ .Annotations.description | html }}{{ end }}

🏷 Labels:
{{ range .Labels.SortedPairs }} <i>{{ .Name | html }}</i>: <code>{{ .Value | html }}</code>
{{ end }}{{ end }}
🛠 <a href="https://grafana.procopius.dk">Grafana</a>
💊 <a href="https://alertmanager.procopius.dk">Alertmanager</a>
{{ end }}

{{ define "telegram.message" }}
{{ if gt (len .Alerts.Firing) 0 }}
🔥 Alerts Firing 🔥
{{ template "alert_list" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
✅ Alerts Resolved ✅
{{ template "alert_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
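The template defines telegram.message, which Alertmanager only uses if a Telegram receiver references it. A minimal sketch of that wiring, assuming the NixOS Alertmanager module; the chat ID is a placeholder and the bot token is expected from the sops-managed secret added below in secrets/secrets.yaml:

# Sketch: Telegram receiver using the template above. parse_mode must
# be HTML to match the <b>/<i>/<code> markup; chat_id is a placeholder,
# and bot_token_file assumes a reasonably recent Alertmanager.
{
  services.prometheus.alertmanager.configuration = {
    templates = [ "/etc/alertmanager/telegram.tmpl" ];
    receivers = [{
      name = "telegram";
      telegram_configs = [{
        bot_token_file = "/run/secrets/telegram-alert-bot-token";
        chat_id = -1001234567890;  # placeholder
        parse_mode = "HTML";
        message = ''{{ template "telegram.message" . }}'';
      }];
    }];
  };
}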
3
machines/sandbox/definition.nix
Normal file
@ -0,0 +1,3 @@
{
  system.stateVersion = "25.05";
}
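The sandbox machine is only a stateVersion stub so far; to be deployable it still has to appear as a node in the hive defined in nixos/flake.nix. A hedged sketch, mirroring the host-b example there (the hostname and user are assumptions):

# Sketch: registering the sandbox machine as a colmena node.
sandbox = { name, nodes, ... }: {
  imports = [ ../machines/sandbox/definition.nix ];
  deployment = {
    targetHost = "sandbox.lab";  # assumption
    targetUser = "root";         # assumption
  };
};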
113
nixos/flake.lock
generated
@ -1,8 +1,82 @@
{
  "nodes": {
    "colmena": {
      "inputs": {
        "flake-compat": "flake-compat",
        "flake-utils": "flake-utils",
        "nix-github-actions": "nix-github-actions",
        "nixpkgs": "nixpkgs",
        "stable": "stable"
      },
      "locked": {
        "lastModified": 1751144689,
        "narHash": "sha256-cgIntaqhcm62V1KU6GmrAGpHpahT4UExEWW2ryS02ZU=",
        "owner": "zhaofengli",
        "repo": "colmena",
        "rev": "3ceec72cfb396a8a8de5fe96a9d75a9ce88cc18e",
        "type": "github"
      },
      "original": {
        "owner": "zhaofengli",
        "repo": "colmena",
        "type": "github"
      }
    },
    "flake-compat": {
      "flake": false,
      "locked": {
        "lastModified": 1650374568,
        "narHash": "sha256-Z+s0J8/r907g149rllvwhb4pKi8Wam5ij0st8PwAh+E=",
        "owner": "edolstra",
        "repo": "flake-compat",
        "rev": "b4a34015c698c7793d592d66adbab377907a2be8",
        "type": "github"
      },
      "original": {
        "owner": "edolstra",
        "repo": "flake-compat",
        "type": "github"
      }
    },
    "flake-utils": {
      "locked": {
        "lastModified": 1659877975,
        "narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "nix-github-actions": {
      "inputs": {
        "nixpkgs": [
          "colmena",
          "nixpkgs"
        ]
      },
      "locked": {
        "lastModified": 1729742964,
        "narHash": "sha256-B4mzTcQ0FZHdpeWcpDYPERtyjJd/NIuaQ9+BV1h+MpA=",
        "owner": "nix-community",
        "repo": "nix-github-actions",
        "rev": "e04df33f62cdcf93d73e9a04142464753a16db67",
        "type": "github"
      },
      "original": {
        "owner": "nix-community",
        "repo": "nix-github-actions",
        "type": "github"
      }
    },
    "nixarr": {
      "inputs": {
-       "nixpkgs": "nixpkgs",
+       "nixpkgs": "nixpkgs_2",
        "vpnconfinement": "vpnconfinement",
        "website-builder": "website-builder"
      },

@ -21,6 +95,22 @@
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1750134718,
        "narHash": "sha256-v263g4GbxXv87hMXMCpjkIxd/viIF7p3JpJrwgKdNiI=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "9e83b64f727c88a7711a2c463a7b16eedb69a84c",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs_2": {
      "locked": {
        "lastModified": 1748662220,
        "narHash": "sha256-7gGa49iB9nCnFk4h/g9zwjlQAyjtpgcFkODjcOQS0Es=",

@ -36,7 +126,7 @@
        "type": "github"
      }
    },
-   "nixpkgs_2": {
+   "nixpkgs_3": {
      "locked": {
        "lastModified": 1748809735,
        "narHash": "sha256-UR5vKj8rwKQmE8wxKFHgoJKbod05DMoH5phTje4L1l8=",

@ -53,8 +143,9 @@
    },
    "root": {
      "inputs": {
+       "colmena": "colmena",
        "nixarr": "nixarr",
-       "nixpkgs": "nixpkgs_2",
+       "nixpkgs": "nixpkgs_3",
        "sops-nix": "sops-nix"
      }
    },

@ -78,6 +169,22 @@
        "type": "github"
      }
    },
    "stable": {
      "locked": {
        "lastModified": 1750133334,
        "narHash": "sha256-urV51uWH7fVnhIvsZIELIYalMYsyr2FCalvlRTzqWRw=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "36ab78dab7da2e4e27911007033713bab534187b",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-25.05",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "vpnconfinement": {
      "locked": {
        "lastModified": 1743810720,

nixos/flake.nix
@ -3,6 +3,7 @@

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs";
+   colmena.url = "github:zhaofengli/colmena";
    sops-nix = {
      url = "github:Mic92/sops-nix";
      inputs.nixpkgs.follows = "nixpkgs";

@ -14,7 +15,11 @@
  #   };
  };

- outputs = inputs @ {...}: let
+ outputs = inputs @ {
+   nixpkgs,
+   colmena,
+   ...
+ }: let
    system = "x86_64-linux";

    liveVMs = {

@ -102,5 +107,32 @@
    };
  in {
    nixosConfigurations = liveVMs;

    colmenaHive = colmena.lib.makeHive {
      meta = {
        nixpkgs = import nixpkgs {
          system = "x86_64-linux";
          overlays = [];
        };
      };

      defaults = {pkgs, ...}: {
      };

      host-b = {
        name,
        nodes,
        pkgs,
        ...
      }: {
        deployment = {
          targetHost = "somehost.tld";
          targetPort = 1234;
          targetUser = "luser";
        };
        boot.isContainer = true;
        time.timeZone = "America/Los_Angeles";
      };
    };
  };
}
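The empty defaults block is where configuration shared by every hive node would go. One plausible use, given the sops-nix input already declared above, is importing its module hive-wide; a sketch, not part of this commit:

# Sketch: shared per-node configuration via the hive's defaults block.
defaults = { pkgs, ... }: {
  imports = [ inputs.sops-nix.nixosModules.sops ];
  environment.systemPackages = [ pkgs.git ];
};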
21
secrets/secrets.yaml
Normal file
@ -0,0 +1,21 @@
telegram-alert-bot-token: ENC[AES256_GCM,data:7Bvhtrkaqc06xmSeOsw730cAfHAw4qBvz1ontPXO/6j4Hy6AAKbb8LYGIONVGA==,iv:2xdhmAM2anEH7flV72BlfVeXjStu6sEUqT97PW+dY2w=,tag:h12zGj8J0ftKuGuKZuCEmw==,type:str]
alertmanager:
  env: ENC[AES256_GCM,data:lMZVLGY4JNeEa1OhiQsAyBqArDttpMjAILjtFQfi7933RfckJMike9cWOV8pSVMJjKUBCtmnir/KDhbyYCZ3oYQh,iv:dLGqXsvJ8x32bqtcaq66O85HcbF/I78HSmo3o/Sx76o=,tag:SHv07/J8O+JPkpTu5rRCzA==,type:str]
influxdb:
  password: ENC[AES256_GCM,data:OP+4vK6ulZs7jVM4lgnpUatr+Qs=,iv:MEmD6yyy+Z7beVOdR1xNDn0c27DYDIDTYdnaNiaVHks=,tag:dyG7VPPV40JqSE4UAeVbtQ==,type:str]
  token: ENC[AES256_GCM,data:QraVWLW1uCSF0YvbkHCKYtPvqs0=,iv:pzkfEyLksjRFVj7wZS8LxO0idQTpEk7OTMpQSsuIRvQ=,tag:d6U6vMqEYbu3CaTpnc0gGw==,type:str]
sops:
  age:
    - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
      enc: |
        -----BEGIN AGE ENCRYPTED FILE-----
        YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBVa29TZnF4K2dtcEc2N1Qy
        cmVTeWJpZXlYRWV6SG84M2NjVHhiRER4TFdFCjNCb1owUU5lL1BDOGt4ZXhad242
        cFI2YWd0SnBCV2RXVUNlY2dKYWpUK1kKLS0tIHh6UmRkdlJqeWZHaTFYQ0M4L2xo
        QzNYRk5ERmR4aGtLQ3dwQ1lPeDZyaEkKJMLXqv6tBBql7VVnWDIwAh24SfQ2O6Ca
        CEOQTGEonbqr5doWqTsXUXrdQAS0amL45UdT6ITFtfNAjaHwCMfhZg==
        -----END AGE ENCRYPTED FILE-----
  lastmodified: "2025-07-06T17:10:59Z"
  mac: ENC[AES256_GCM,data:dXLWT5fmSs2ddpFPXA1yOtwaej7b3lPesFxN7aEZ/bV6YRr+Ht5dHFQcXO0TfJArzhFRRtAumdcdVsorMkR4tao4XCcimACcWrZgXlXGM6XgT3hdPJ4006QLePXU+uyzpqyEuOouaxF7fyuSTL68uDr+E/NAHgmP2dnqpWnebpY=,iv:oUkmH/ngp8wvbuXay+2X6YBqhesNdtOPZOV4lvsc/s4=,tag:GErA9zgdkTarUD6fWiMupg==,type:str]
  unencrypted_suffix: _unencrypted
  version: 3.10.2
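These entries stay encrypted in the repository; nodes decrypt them at activation time via sops-nix. A minimal sketch of how the monitor host could consume them (the option names are real sops-nix options; the relative path is an assumption about module location):

# Sketch: consuming the sops secrets on a node; decrypted files land
# under /run/secrets/<name> by default.
{
  sops.defaultSopsFile = ../../secrets/secrets.yaml;
  sops.secrets."telegram-alert-bot-token" = { };
  sops.secrets."alertmanager/env" = { };  # e.g. an EnvironmentFile for Alertmanager
}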