ready for runners

parent fc9971ddc9
commit 7dd5043b5d
49 changed files with 2569 additions and 1085 deletions

@@ -1,5 +1,9 @@
 nixos-rebuild switch --flake .#traefik --target-host root@192.168.1.171 --verbose
 nixos-rebuild switch --flake .#proxmox --target-host root@192.168.1.205 --verbose
 nixos-rebuild switch --flake .#sandbox --target-host root@192.168.1.148 --verbose
-nixos-rebuild switch --flake .#monitoring --target-host root@192.168.1.88 --verbose
+nixos-rebuild switch --flake .#monitoring --target-host root@monitor.lab --verbose
 nixos-rebuild switch --flake .#forgejo --target-host root@192.168.1.249 --verbose
+nixos-rebuild switch --flake .#dns --target-host root@192.168.1.140 --verbose
+
+nixos-rebuild switch --flake .#runner --target-host root@forgejo-runner-01.lab --override-input runnerId 01
+nixos-rebuild switch --flake .#runner01 --target-host root@forgejo-runner-01.lab --verbose

@@ -34,9 +34,14 @@
     modules = [ ./hosts/sandbox/host.nix ];
   };

+  dns = nixpkgs.lib.nixosSystem {
+    inherit system;
+    modules = [ ./hosts/dns/host.nix ];
+  };
+
   monitoring = nixpkgs.lib.nixosSystem {
     inherit system;
-    modules = [ ./hosts/monitoring/host.nix ];
+    modules = [ ./hosts/monitoring/host.nix sops-nix.nixosModules.sops ];
   };

   forgejo = nixpkgs.lib.nixosSystem {

@@ -44,6 +49,12 @@
     modules = [ ./hosts/forgejo/host.nix sops-nix.nixosModules.sops ];
   };

+  runner01 = nixpkgs.lib.nixosSystem {
+    inherit system;
+    modules = [ ./hosts/forgejo-runner/host.nix sops-nix.nixosModules.sops ];
+    specialArgs.runnerId = "01";
+  };
+
   # dockerHost = pkgs.lib.nixosSystem {
   #   inherit system;
   #   modules = [

61 nixos/hosts/dns/dnsmasq.nix Normal file

@@ -0,0 +1,61 @@
+{
+  services.dnsmasq = {
+    enable = true;
+    alwaysKeepRunning = true;
+    settings = {
+      domain = "lab";
+      expand-hosts = true;
+      domain-needed = true;
+
+      # interface = "eth0"; # Replace with your real interface
+      bind-interfaces = true;
+
+      local = [
+        "/lab/"
+        "/procopius.dk/"
+      ];
+      bogus-priv = true;
+      no-resolv = true;
+
+      # no-hosts = true; # Prevent 127.0.0.2 etc from leaking in
+      server = [
+        "8.8.8.8"
+        "8.8.4.4"
+        "1.1.1.1"
+        "1.0.0.1"
+      ];
+
+      # Static DNS entry: map hostname to IP (without DHCP)
+      address = [
+        # Static IPs
+        "/dns.lab/192.168.1.53"
+        "/traefik.lab/192.168.1.80"
+        # "/proxmox-01.lab/192.168.1.205"
+        # "/nas-01.lab/192.168.1.226"
+
+        # Split Horizon DNS
+        "/procopius.dk/192.168.1.80"
+        "/.procopius.dk/192.168.1.80"
+      ];
+
+      cache-size = 10000;
+
+      dhcp-authoritative = true;
+      dhcp-range = "192.168.1.100,192.168.1.254,12h";
+      dhcp-host = "bc:24:11:58:f5:da,dns,192.168.1.53";
+      # "Use 192.168.1.53 as your DNS server."
+      dhcp-option = [
+        "option:router,192.168.1.1" # router
+        "option:dns-server,192.168.1.53" # DNS server (this VM)
+      ];
+
+      log-queries = true;
+      localise-queries = true;
+      log-async = true;
+      # log-facility = "/var/log/dnsmasq/dnsmasq.log";
+    };
+  };
+
+  services.prometheus.exporters.dnsmasq.enable = true;
+  services.prometheus.exporters.dnsmasq.openFirewall = true;
+}
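
A note on the split-horizon entries above: "/procopius.dk/192.168.1.80" covers the apex and "/.procopius.dk/192.168.1.80" covers every subdomain, so LAN clients resolve the public names straight to the Traefik VM instead of a WAN address. A quick sanity check against the new resolver (the hostname is illustrative; any name under procopius.dk should answer the same way):

# Ask the dnsmasq VM directly; expect the internal Traefik address back.
dig @192.168.1.53 git.procopius.dk +short
# 192.168.1.80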

@@ -3,10 +3,7 @@
 {
   imports = [
     ../../templates/base.nix
-    ../../secrets/sops.nix
     ./networking.nix
-    ./storage.nix
+    ./dnsmasq.nix
-    ./forgejo.nix
-    ./database.nix
   ];
 }

19 nixos/hosts/dns/networking.nix Normal file

@@ -0,0 +1,19 @@
+{
+  networking.hostName = "dns";
+  # networking.useHostResolvConf = false;
+  # networking.interfaces.eth0.useDHCP = true;
+  networking.interfaces.eth0.ipv4.addresses = [{
+    address = "192.168.1.53";
+    prefixLength = 24;
+  }];
+
+  networking.defaultGateway = "192.168.1.1"; # your router
+  networking.nameservers = [ "8.8.8.8" ]; # fallback resolvers
+
+  networking.firewall.allowedTCPPorts = [ 53 67 80 443 ];
+  networking.firewall.allowedUDPPorts = [ 53 67 ];
+
+  networking.hosts = {
+    "192.168.1.53" = [ "dns" "dns.lab" ];
+  };
+}

9 nixos/hosts/forgejo-runner/host.nix Normal file

@@ -0,0 +1,9 @@
+{
+  imports = [
+    ../../templates/base.nix
+    ../../secrets/shared-sops.nix
+    ./runner.nix
+    ./networking.nix
+    ./sops.nix
+  ];
+}

4 nixos/hosts/forgejo-runner/networking.nix Normal file

@@ -0,0 +1,4 @@
+{ config, lib, pkgs, runnerId, ... }:
+{
+  networking.hostName = "forgejo-runner-${runnerId}";
+}
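
How runnerId gets here: nixosSystem injects every attribute of specialArgs as an extra module argument, so the "01" set in flake.nix arrives alongside config, lib, and pkgs above. That makes further runners cheap to stamp out — a hypothetical second instance (not part of this commit) would differ only in the id:

# flake.nix sketch: a second runner reusing the same host module.
runner02 = nixpkgs.lib.nixosSystem {
  inherit system;
  modules = [ ./hosts/forgejo-runner/host.nix sops-nix.nixosModules.sops ];
  specialArgs.runnerId = "02";  # yields networking.hostName = "forgejo-runner-02"
};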

35 nixos/hosts/forgejo-runner/runner.nix Normal file

@@ -0,0 +1,35 @@
+{ config, pkgs, ... }:
+
+{
+  # users.users.forgejo-runner = {
+  #   isSystemUser = true;
+  #   extraGroups = [ "docker" ]; # Optional: if using docker jobs
+  # };
+
+  services.gitea-actions-runner = {
+    package = pkgs.forgejo-actions-runner;
+    instances.default = {
+      enable = true;
+      name = config.networking.hostName;
+      url = "https://git.procopius.dk";
+      # Obtaining the path to the runner token file may differ
+      # tokenFile should be in format TOKEN=<secret>, since it's EnvironmentFile for systemd
+      tokenFile = config.sops.secrets."forgejo-runner-registration-token".path;
+      labels = [
+        "ubuntu-latest:docker://node:16-bullseye"
+        # "ubuntu-22.04:docker://node:16-bullseye"
+        # "ubuntu-20.04:docker://node:16-bullseye"
+        # "ubuntu-18.04:docker://node:16-buster"
+        ## optionally provide native execution on the host:
+        # "native:host"
+      ];
+    };
+  };
+
+  # systemd.services."forgejo-actions-runner-default".serviceConfig = {
+  #   User = "forgejo-runner";
+  #   Group = "forgejo-runner";
+  # };
+
+  virtualisation.docker.enable = true; # Optional: if using docker
+}

7 nixos/hosts/forgejo-runner/sops.nix Normal file

@@ -0,0 +1,7 @@
+{ config, lib, ... }:
+{
+  sops.secrets."forgejo-runner-registration-token" = {
+    sopsFile = ../../secrets/forgejo/runner-secrets.yml;
+    mode = "0440";
+  };
+}
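
Worth spelling out, since it is easy to get wrong: the decrypted value of this secret is handed to systemd as an EnvironmentFile (see the comment in runner.nix above), so the sops entry must decrypt to a single env-style line rather than a bare token, along the lines of:

TOKEN=<registration token from the Forgejo admin UI>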

@@ -21,11 +21,6 @@
   '';
   };

-  services.prometheus.exporters.postgres = {
-    enable = true;
-    listenAddress = "0.0.0.0";
-    port = 9187;
-  };
-  networking.firewall.allowedTCPPorts = [ 9187 ];
+  services.prometheus.exporters.postgres.enable = true;
+  services.prometheus.exporters.postgres.openFirewall = true;
 }

@@ -47,14 +47,11 @@ in
       security = {
         INSTALL_LOCK = true;
-        SECRET_KEY = "changeme"; # can be another secret
+        SECRET_KEY = config.sops.secrets."forgejo-secret-key".path; # can be another secret
       };
     };
   };

-  sops.secrets.forgejo-admin-password.owner = "forgejo";
-  sops.secrets.forgejo-db-password.owner = "forgejo";
-
   systemd.services.forgejo.preStart = let
     adminCmd = "${lib.getExe cfg.package} admin user";
     user = "plasmagoat"; # Note, Forgejo doesn't allow creation of an account named "admin"

@@ -1,6 +1,4 @@
-{ config, lib, pkgs, ... }: {
-  networking = {
-    hostName = "forgejo";
-  };
+{ config, lib, pkgs, ... }:
+{
+  networking.hostName = "forgejo";
 }

19 nixos/hosts/forgejo/sops.nix Normal file

@@ -0,0 +1,19 @@
+let
+  forgejoSops = ../../secrets/forgejo/secrets.yml;
+in
+{
+  sops.secrets = {
+    "forgejo-admin-password" = {
+      sopsFile = forgejoSops;
+      owner = "forgejo";
+    };
+    "forgejo-db-password" = {
+      sopsFile = forgejoSops;
+      owner = "forgejo";
+    };
+    "forgejo-secret-key" = {
+      sopsFile = forgejoSops;
+      owner = "forgejo";
+    };
+  };
+}

59 nixos/hosts/monitoring/alertmanager.nix Normal file

@@ -0,0 +1,59 @@
+{ config, pkgs, modulesPath, lib, ... }:
+
+{
+  services.prometheus.alertmanagers = [ {
+    scheme = "http";
+    # path_prefix = "/alertmanager";
+    static_configs = [ {
+      targets = [
+        "localhost:9093"
+      ];
+    } ];
+  } ];
+  services.prometheus.alertmanager = {
+    enable = true;
+    openFirewall = true;
+    webExternalUrl = "http://monitor.lab:9093"; # optional but helpful
+    configuration = {
+      route = {
+        group_wait = "10s";
+        group_interval = "30s";
+        repeat_interval = "30m";
+        receiver = "telegram";
+
+        routes = [
+          {
+            receiver = "telegram";
+            group_wait = "10s";
+            match_re = {
+              severity = "critical|warning";
+            };
+            continue = true;
+          }
+        ];
+      };
+
+      receivers = [
+        {
+          name = "telegram";
+          telegram_configs = [
+            {
+              api_url = "https://api.telegram.org";
+              bot_token = config.sops.secrets."telegram-alert-bot-token".path;
+              chat_id = -1002642560007;
+              message_thread_id = 4;
+              parse_mode = "HTML";
+              send_resolved = false;
+              message = "{{ template \"telegram.message\". }}";
+            }
+          ];
+        }
+      ];
+
+      templates = [
+        (pkgs.writeText "telegram.tmpl" (builtins.readFile ./provisioning/templates/telegram.tmpl))
+        (pkgs.writeText "telegram.markdown.v2.tmpl" (builtins.readFile ./provisioning/templates/telegram.markdown.v2.tmpl))
+      ];
+    };
+  };
+}
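
For orientation, NixOS serializes the attrset under `configuration` above into Alertmanager's config file, so the routing tree should come out roughly as the following YAML (a rendering sketch, not a file from this commit):

route:
  group_wait: 10s
  group_interval: 30s
  repeat_interval: 30m
  receiver: telegram
  routes:
    - receiver: telegram
      group_wait: 10s
      match_re:
        severity: critical|warning
      continue: true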

File diff suppressed because it is too large

@@ -6,7 +6,7 @@
     http_port = 3000;
     http_addr = "0.0.0.0";
     # Grafana needs to know on which domain and URL it's running
-    domain = "grafana.procopius.dk";
+    domain = "grafana.lab";
     # root_url = "https://monitor.local/grafana/"; # Not needed if it is `https://your.domain/`
     # serve_from_sub_path = true;
   };

@@ -45,12 +45,6 @@
     # One needs to use the following option:
     # datasources.settings.deleteDatasources = [ { name = "prometheus"; orgId = 1; } { name = "loki"; orgId = 1; } ];

-    alerting.rules.path = "/etc/grafana/provisioning/alerting/alerts.yml";
-
-    # notifiers.settings = {
-    #   path = "/etc/grafana/provisioning/notifiers";
-    # };
-
     dashboards.settings.providers = [{
       name = "my dashboards";
       options.path = "/etc/grafana-dashboards";

@@ -99,20 +93,4 @@
     group = "grafana";
     mode = "0644";
   };
-
-  # 🔔 Alerts provisioning
-  environment.etc."grafana/provisioning/alerting/alerts.yml" = {
-    source = ./provisioning/alerting/alerts.yml;
-    user = "grafana";
-    group = "grafana";
-    mode = "0644";
-  };
-
-  # 📬 Contact point provisioning
-  environment.etc."grafana/provisioning/notifiers/contact-points.yml" = {
-    source = ./provisioning/notifiers/contact-points.yml;
-    user = "grafana";
-    group = "grafana";
-    mode = "0644";
-  };
 }

@@ -3,9 +3,12 @@
 {
   imports = [
     ../../templates/base.nix
+    ../../secrets/shared-sops.nix
     ./networking.nix
     ./prometheus.nix
     ./grafana.nix
     ./loki.nix
+    ./alertmanager.nix
+    ./sops.nix
   ];
 }

@@ -1,17 +1,4 @@
-{ config, lib, pkgs, ... }: {
-  networking = {
-    hostName = "monitor";
-    # interfaces.eth0 = {
-    #   ipv4.addresses = [{
-    #     address = "192.168.1.171";
-    #     prefixLength = 24;
-    #   }];
-    # };
-    # firewall.allowedTCPPorts = [ 80 3000 9090 ];
-    # defaultGateway = {
-    #   address = "192.168.1.1";
-    #   interface = "eth0";
-    # };
-  };
+{ config, lib, pkgs, ... }:
+{
+  networking.hostName = "monitor";
 }

@@ -1,18 +1,46 @@
 { config, pkgs, modulesPath, lib, ... }:

 let
-  monitor_ip = "monitor.local";
-  traefik_ip = "traefik.local";
-  sandbox_ip = "sandbox.local";
-  forgejo_ip = "forgejo.local";
+  monitor_ip = "monitor.lab";
+  traefik_ip = "traefik.lab";
+  sandbox_ip = "sandbox.lab";
+  forgejo_ip = "forgejo.lab";
+  dnsmasq_ip = "dns.lab";

   prometheus_exporter_port = 9100;
   postgres_exporter_port = 9187;
   prometheus_port = 9090;
+  alertmanager_port = 9093;
   grafana_port = 3000;
   promtail_port = 9080;
   traefik_monitor_port = 8082;
   forgejo_monitor_port = 3000;
+  dnsmasq_exporter_port = 9153;
+
+  exporters = {
+    node = [
+      "${monitor_ip}:${toString prometheus_exporter_port}"
+      "${traefik_ip}:${toString prometheus_exporter_port}"
+      "${sandbox_ip}:${toString prometheus_exporter_port}"
+      "${forgejo_ip}:${toString prometheus_exporter_port}"
+    ];
+    promtail = [
+      "${monitor_ip}:${toString promtail_port}"
+      "${traefik_ip}:${toString promtail_port}"
+      "${sandbox_ip}:${toString promtail_port}"
+      "${forgejo_ip}:${toString promtail_port}"
+    ];
+    grafana = [ "${monitor_ip}:${toString grafana_port}" ];
+    prometheus = [ "${monitor_ip}:${toString prometheus_port}" ];
+    alertmanager = [ "${monitor_ip}:${toString alertmanager_port}" ];
+    traefik = [ "${traefik_ip}:${toString traefik_monitor_port}" ];
+    gitea = [ "${forgejo_ip}:${toString forgejo_monitor_port}" ];
+    postgres = [ "${forgejo_ip}:${toString postgres_exporter_port}" ];
+
+    dnsmasq = [ "${dnsmasq_ip}:${toString dnsmasq_exporter_port}" ];
+  };

 in {
   networking.firewall.allowedTCPPorts = [ 9090 ];

@@ -22,64 +50,27 @@ in {
     globalConfig = {
       scrape_timeout = "10s";
       scrape_interval = "30s";
+      # A short evaluation_interval will check alerting rules very often.
+      # It can be costly if you run Prometheus with 100+ alerts.
+      evaluation_interval = "20s";
     };
-    scrapeConfigs = [
-      {
-        job_name = "node";
-        static_configs = [
-          {
-            targets = [
-              "${monitor_ip}:${toString prometheus_exporter_port}"
-              "${traefik_ip}:${toString prometheus_exporter_port}"
-              "${sandbox_ip}:${toString prometheus_exporter_port}"
-              "${forgejo_ip}:${toString prometheus_exporter_port}"
-            ];
-          }
-        ];
-      }
-      {
-        job_name = "grafana";
-        static_configs = [
-          { targets = [ "${monitor_ip}:${toString grafana_port}" ]; }
-        ];
-      }
-      {
-        job_name = "prometheus";
-        static_configs = [
-          { targets = [ "${monitor_ip}:${toString prometheus_port}" ]; }
-        ];
-      }
-      {
-        job_name = "traefik";
-        static_configs = [
-          { targets = [ "${traefik_ip}:${toString traefik_monitor_port}" ]; }
-        ];
-      }
-      {
-        job_name = "gitea";
-        static_configs = [
-          { targets = [ "${forgejo_ip}:${toString forgejo_monitor_port}" ]; }
-        ];
-      }
-      {
-        job_name = "postgres";
-        static_configs = [
-          { targets = [ "${forgejo_ip}:${toString postgres_exporter_port}" ]; }
-        ];
-      }
-      {
-        job_name = "promtail";
-        static_configs = [
-          {
-            targets = [
-              "${monitor_ip}:${toString promtail_port}"
-              "${traefik_ip}:${toString promtail_port}"
-              "${sandbox_ip}:${toString promtail_port}"
-              "${forgejo_ip}:${toString promtail_port}"
-            ];
-          }
-        ];
-      }
-    ];
+    extraFlags = [
+      "--web.enable-admin-api"
+    ];
+    scrapeConfigs = lib.mapAttrsToList (job_name: targets: {
+      inherit job_name;
+      static_configs = [ { inherit targets; } ];
+    }) exporters;
+
+    # 🔔 Alerts provisioning
+    ruleFiles = [
+      (pkgs.writeText "prometheus-alerts.yml" (builtins.readFile ./provisioning/alerts/prometheus-alerts.yml))
+      (pkgs.writeText "loki-alerts.yml" (builtins.readFile ./provisioning/alerts/loki-alerts.yml))
+      (pkgs.writeText "promtail-alerts.yml" (builtins.readFile ./provisioning/alerts/promtail-alerts.yml))
+      (pkgs.writeText "postgres-alerts.yml" (builtins.readFile ./provisioning/alerts/postgres-alerts.yml))
+      (pkgs.writeText "traefik-alerts.yml" (builtins.readFile ./provisioning/alerts/traefik-alerts.yml))
+      (pkgs.writeText "node-exporter-alerts.yml" (builtins.readFile ./provisioning/alerts/node-exporter-alerts.yml))
+    ];
   };
 }
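
The scrapeConfigs rewrite above is the core of this commit's prometheus.nix cleanup: each job becomes one attribute of the `exporters` set, and lib.mapAttrsToList stamps out a scrape config per name/target-list pair. A sketch of what the call evaluates to (first two elements shown; mapAttrsToList walks attributes in name order):

# scrapeConfigs evaluates to a list like:
[
  { job_name = "alertmanager"; static_configs = [ { targets = [ "monitor.lab:9093" ]; } ]; }
  { job_name = "dnsmasq";      static_configs = [ { targets = [ "dns.lab:9153" ]; } ]; }
  # ...one element per attribute of `exporters`
]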

@@ -1,265 +0,0 @@
-apiVersion: 1
-
-groups:
-  - name: node-alerts
-    folder: "VM Alerts"
-    interval: 1m
-    rules:
-      - uid: high_cpu
-        title: High CPU Usage
-        condition: A
-        for: 5m
-        annotations:
-          summary: "High CPU on {{ $labels.instance }}"
-          description: "CPU > 90% for 5 minutes"
-        labels:
-          severity: warning
-        data:
-          - refId: A
-            relativeTimeRange: { from: 300, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) by (instance) > 0.9
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: false
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-      - uid: high_memory
-        title: High Memory Usage
-        condition: A
-        for: 5m
-        annotations:
-          summary: "High memory on {{ $labels.instance }}"
-          description: "Memory > 90% for 5 minutes"
-        labels:
-          severity: warning
-        data:
-          - refId: A
-            relativeTimeRange: { from: 300, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.9
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: false
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-      - uid: low_disk
-        title: Low Disk Space
-        condition: A
-        for: 5m
-        annotations:
-          summary: "Low disk on {{ $labels.instance }} {{ $labels.mountpoint }}"
-          description: "< 10% space left"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 300, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.1
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: false
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-      - uid: node_down
-        title: Node is Down
-        condition: A
-        for: 1m
-        annotations:
-          summary: "{{ $labels.instance }} is DOWN"
-          description: "Exporter is not responding"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: up == 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: true
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: prometheus-alerts
-    folder: "Prometheus Alerts"
-    interval: 1m
-    rules:
-      - uid: prometheus_down
-        title: Prometheus is Down
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Prometheus is not responding"
-          description: "up{job='prometheus'} == 0"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: up{job="prometheus"} == 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: true
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: traefik-alerts
-    folder: "Traefik Alerts"
-    interval: 1m
-    rules:
-      - uid: traefik_5xx
-        title: Traefik 5xx Errors
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Traefik has a high rate of 5xx responses"
-          description: "Rate of HTTP 5xx responses is high"
-        labels:
-          severity: warning
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: rate(traefik_service_requests_total{code=~"5.."}[1m]) > 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: false
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: grafana-alerts
-    folder: "Grafana Alerts"
-    interval: 1m
-    rules:
-      - uid: grafana_down
-        title: Grafana is Down
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Grafana is not responding"
-          description: "up{job='grafana'} == 0"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: up{job="grafana"} == 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: true
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: postgres-alerts
-    folder: "Postgres Alerts"
-    interval: 1m
-    rules:
-      - uid: postgres_down
-        title: Postgres is Down
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Postgres is not responding"
-          description: "up{job='postgres'} == 0"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: up{job="postgres"} == 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: true
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: gitea-alerts
-    folder: "Gitea Alerts"
-    interval: 1m
-    rules:
-      - uid: gitea_down
-        title: Gitea is Down
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Gitea is not responding"
-          description: "up{job='gitea'} == 0"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: up{job="gitea"} == 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: true
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: promtail-alerts
-    folder: "Promtail Alerts"
-    interval: 1m
-    rules:
-      - uid: promtail_down
-        title: Promtail is Down
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Promtail is not responding"
-          description: "up{job='promtail'} == 0"
-        labels:
-          severity: critical
-        data:
-          - refId: A
-            relativeTimeRange: { from: 60, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: up{job="promtail"} == 0
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: true
-              intervalMs: 15000
-              maxDataPoints: 43200
-
-  - name: logs-alerts
-    folder: "Logs Alerts"
-    interval: 1m
-    rules:
-      - uid: failed_ssh_logins
-        title: Failed SSH Logins
-        condition: A
-        for: 1m
-        annotations:
-          summary: "Too many failed SSH login attempts"
-          description: "Check for brute force login attempts"
-        labels:
-          severity: warning
-        data:
-          - refId: A
-            relativeTimeRange: { from: 300, to: 0 }
-            datasourceUid: prometheus
-            model:
-              expr: sum(rate(failed_ssh_logins[5m])) > 5
-              interval: ""
-              datasource: { type: prometheus, uid: prometheus }
-              instant: false
-              intervalMs: 15000
-              maxDataPoints: 43200

39 nixos/hosts/monitoring/provisioning/alerts/loki-alerts.yml Normal file

@@ -0,0 +1,39 @@
+groups:
+  - name: Loki
+
+    rules:
+      - alert: LokiProcessTooManyRestarts
+        expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Loki process too many restarts (instance {{ $labels.instance }})
+          description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: LokiRequestErrors
+        expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
+        for: 15m
+        labels:
+          severity: critical
+        annotations:
+          summary: Loki request errors (instance {{ $labels.instance }})
+          description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: LokiRequestPanic
+        expr: "sum(increase(loki_panic_total[10m])) by (namespace, job) > 0"
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: Loki request panic (instance {{ $labels.instance }})
+          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: LokiRequestLatency
+        expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: Loki request latency (instance {{ $labels.instance }})
+          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -0,0 +1,299 @@
+groups:
+  - name: NodeExporterV2
+    rules:
+      - alert: Node down
+        expr: up{job="monitoring-pi"} == 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          title: Node {{ $labels.instance }} is down
+          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down.
+
+      - alert: HostOutOfMemory
+        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of memory (instance {{ $labels.instance }})
+          description: Node memory is filling up (< 10% left)\n VALUE = {{ $value }}
+
+      - alert: HostMemoryUnderMemoryPressure
+        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host memory under memory pressure (instance {{ $labels.instance }})
+          description: The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}
+
+      - alert: HostUnusualNetworkThroughputIn
+        expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual network throughput in (instance {{ $labels.instance }})
+          description: Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}
+
+      - alert: HostUnusualNetworkThroughputOut
+        expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual network throughput out (instance {{ $labels.instance }})
+          description: Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}
+
+      - alert: HostUnusualDiskReadRate
+        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk read rate (instance {{ $labels.instance }})
+          description: Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}
+
+      - alert: HostUnusualDiskWriteRate
+        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk write rate (instance {{ $labels.instance }})
+          description: Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}
+
+      # Please add ignored mountpoints in node_exporter parameters like
+      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+      - alert: HostOutOfDiskSpace
+        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of disk space (instance {{ $labels.instance }})
+          description: Disk is almost full (< 10% left)\n VALUE = {{ $value }}
+
+      # Please add ignored mountpoints in node_exporter parameters like
+      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+      - alert: HostDiskWillFillIn24Hours
+        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+          description: Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}
+
+      - alert: HostOutOfInodes
+        expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of inodes (instance {{ $labels.instance }})
+          description: Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}
+
+      - alert: HostInodesWillFillIn24Hours
+        expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+          description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}
+
+      - alert: HostUnusualDiskReadLatency
+        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk read latency (instance {{ $labels.instance }})
+          description: Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}
+
+      - alert: HostUnusualDiskWriteLatency
+        expr: rate(node_disk_write_time_seconds_total{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk write latency (instance {{ $labels.instance }})
+          description: Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}
+
+      - alert: HostHighCpuLoad
+        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host high CPU load (instance {{ $labels.instance }})
+          description: CPU load is > 80%\n VALUE = {{ $value }}
+
+      - alert: HostCpuStealNoisyNeighbor
+        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+          description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}
+
+      # 1000 context switches is an arbitrary number.
+      # Alert threshold depends on nature of application.
+      # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
+      - alert: HostContextSwitching
+        expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host context switching (instance {{ $labels.instance }})
+          description: Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }}
+
+      - alert: HostSwapIsFillingUp
+        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host swap is filling up (instance {{ $labels.instance }})
+          description: Swap is filling up (>80%)\n VALUE = {{ $value }}
+
+      - alert: HostSystemdServiceCrashed
+        expr: node_systemd_unit_state{state="failed"} == 1
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host SystemD service crashed (instance {{ $labels.instance }})
+          description: SystemD service crashed\n VALUE = {{ $value }}
+
+      - alert: HostPhysicalComponentTooHot
+        expr: node_hwmon_temp_celsius > 75
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host physical component too hot (instance {{ $labels.instance }})
+          description: Physical hardware component too hot\n VALUE = {{ $value }}
+
+      - alert: HostNodeOvertemperatureAlarm
+        expr: node_hwmon_temp_crit_alarm_celsius == 1
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+          description: Physical node temperature alarm triggered\n VALUE = {{ $value }}
+
+      - alert: HostRaidArrayGotInactive
+        expr: node_md_state{state="inactive"} > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host RAID array got inactive (instance {{ $labels.instance }})
+          description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}
+
+      - alert: HostRaidDiskFailure
+        expr: node_md_disks{state="failed"} > 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host RAID disk failure (instance {{ $labels.instance }})
+          description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}
+
+      - alert: HostKernelVersionDeviations
+        expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
+        for: 6h
+        labels:
+          severity: warning
+        annotations:
+          summary: Host kernel version deviations (instance {{ $labels.instance }})
+          description: Different kernel versions are running\n VALUE = {{ $value }}
+
+      - alert: HostOomKillDetected
+        expr: increase(node_vmstat_oom_kill[1m]) > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host OOM kill detected (instance {{ $labels.instance }})
+          description: OOM kill detected\n VALUE = {{ $value }}
+
+      - alert: HostEdacCorrectableErrorsDetected
+        expr: increase(node_edac_correctable_errors_total[1m]) > 0
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+          description: Instance has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}
+
+      - alert: HostEdacUncorrectableErrorsDetected
+        expr: node_edac_uncorrectable_errors_total > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+          description: Instance has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}
+
+      - alert: HostNetworkReceiveErrors
+        expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }})
+          description: Instance interface has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.\n VALUE = {{ $value }}
+
+      - alert: HostNetworkTransmitErrors
+        expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }})
+          description: Instance has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}
+
+      - alert: HostNetworkInterfaceSaturated
+        expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.interface }})
+          description: The network interface is getting overloaded.\n VALUE = {{ $value }}
+
+      - alert: HostConntrackLimit
+        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host conntrack limit (instance {{ $labels.instance }})
+          description: The number of conntrack is approaching limit\n VALUE = {{ $value }}
+
+      - alert: HostClockSkew
+        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host clock skew (instance {{ $labels.instance }})
+          description: Clock skew detected. Clock is out of sync.\n VALUE = {{ $value }}
+
+      - alert: HostClockNotSynchronising
+        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host clock not synchronising (instance {{ $labels.instance }})
+          description: Clock not synchronising.\n VALUE = {{ $value }}

@@ -0,0 +1,320 @@
+groups:
+
+  - name: NodeExporter
+
+    rules:
+
+      - alert: HostOutOfMemory
+        expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of memory (instance {{ $labels.instance }})
+          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostMemoryUnderMemoryPressure
+        expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host memory under memory pressure (instance {{ $labels.instance }})
+          description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostMemoryIsUnderutilized
+        expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: Host Memory is underutilized (instance {{ $labels.instance }})
+          description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostUnusualNetworkThroughputIn
+        expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual network throughput in (instance {{ $labels.instance }})
+          description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostUnusualNetworkThroughputOut
+        expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual network throughput out (instance {{ $labels.instance }})
+          description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostUnusualDiskReadRate
+        expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk read rate (instance {{ $labels.instance }})
+          description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostOutOfDiskSpace
+        expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host out of disk space (instance {{ $labels.instance }})
+          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostDiskMayFillIn24Hours
+        expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
+          description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostOutOfInodes
+        expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host out of inodes (instance {{ $labels.instance }})
+          description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostFilesystemDeviceError
+        expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host filesystem device error (instance {{ $labels.instance }})
+          description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostInodesMayFillIn24Hours
+        expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
+          description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostUnusualDiskReadLatency
+        expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk read latency (instance {{ $labels.instance }})
+          description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostUnusualDiskWriteLatency
+        expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk write latency (instance {{ $labels.instance }})
+          description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostHighCpuLoad
+        expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80'
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host high CPU load (instance {{ $labels.instance }})
+          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostCpuIsUnderutilized
+        expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
+        for: 1w
+        labels:
+          severity: info
+        annotations:
+          summary: Host CPU is underutilized (instance {{ $labels.instance }})
+          description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostCpuStealNoisyNeighbor
+        expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostCpuHighIowait
+        expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host CPU high iowait (instance {{ $labels.instance }})
+          description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostUnusualDiskIo
+        expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk IO (instance {{ $labels.instance }})
+          description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostContextSwitchingHigh
+        expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host context switching high (instance {{ $labels.instance }})
+          description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostSwapIsFillingUp
+        expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host swap is filling up (instance {{ $labels.instance }})
+          description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostSystemdServiceCrashed
+        expr: '(node_systemd_unit_state{state="failed"} == 1)'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host systemd service crashed (instance {{ $labels.instance }})
+          description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostPhysicalComponentTooHot
+        expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host physical component too hot (instance {{ $labels.instance }})
+          description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostNodeOvertemperatureAlarm
+        expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+          description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostSoftwareRaidInsufficientDrives
+        expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
+          description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostSoftwareRaidDiskFailure
+        expr: '(node_md_disks{state="failed"} > 0)'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host software RAID disk failure (instance {{ $labels.instance }})
+          description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostKernelVersionDeviations
+        expr: 'changes(node_uname_info[1h]) > 0'
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: Host kernel version deviations (instance {{ $labels.instance }})
+          description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostOomKillDetected
+        expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host OOM kill detected (instance {{ $labels.instance }})
+          description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostEdacCorrectableErrorsDetected
+        expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: HostEdacUncorrectableErrorsDetected
|
expr: '(node_edac_uncorrectable_errors_total > 0)'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||||
|
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostNetworkReceiveErrors
|
||||||
|
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host Network Receive Errors (instance {{ $labels.instance }})
|
||||||
|
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostNetworkTransmitErrors
|
||||||
|
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
||||||
|
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostNetworkBondDegraded
|
||||||
|
expr: '((node_bonding_active - node_bonding_slaves) != 0)'
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||||
|
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostConntrackLimit
|
||||||
|
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||||
|
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostClockSkew
|
||||||
|
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host clock skew (instance {{ $labels.instance }})
|
||||||
|
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostClockNotSynchronising
|
||||||
|
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||||
|
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

201
nixos/hosts/monitoring/provisioning/alerts/postgres-alerts.yml
Normal file

@@ -0,0 +1,201 @@
groups:
  - name: Postgres

    rules:
      - alert: PostgresqlDown
        expr: "pg_up == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql down (instance {{ $labels.instance }})
          description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlRestarted
        expr: "time() - pg_postmaster_start_time_seconds < 60"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql restarted (instance {{ $labels.instance }})
          description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlExporterError
        expr: "pg_exporter_last_scrape_error > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql exporter error (instance {{ $labels.instance }})
          description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTableNotAutoVacuumed
        expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
          description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTableNotAutoAnalyzed
        expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
          description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyConnections
        expr: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql too many connections (instance {{ $labels.instance }})
          description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlNotEnoughConnections
        expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Postgresql not enough connections (instance {{ $labels.instance }})
          description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlDeadLocks
        expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql dead locks (instance {{ $labels.instance }})
          description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRollbackRate
        expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql high rollback rate (instance {{ $labels.instance }})
          description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlCommitRateLow
        expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Postgresql commit rate low (instance {{ $labels.instance }})
          description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlLowXidConsumption
        expr: "rate(pg_txid_current[1m]) < 5"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql low XID consumption (instance {{ $labels.instance }})
          description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRateStatementTimeout
        expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
          description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRateDeadlock
        expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
          description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlUnusedReplicationSlot
        expr: "pg_replication_slots_active == 0"
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Postgresql unused replication slot (instance {{ $labels.instance }})
          description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyDeadTuples
        expr: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
          description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlConfigurationChanged
        expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Postgresql configuration changed (instance {{ $labels.instance }})
          description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlSslCompressionActive
        expr: "sum(pg_stat_ssl_compression) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql SSL compression active (instance {{ $labels.instance }})
          description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyLocksAcquired
        expr: "((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20"
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
          description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlBloatIndexHigh
        expr: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
          description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlBloatTableHigh
        expr: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
          description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlInvalidIndex
        expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: Postgresql invalid index (instance {{ $labels.instance }})
          description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlReplicationLag
        expr: "pg_replication_lag_seconds > 5"
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: Postgresql replication lag (instance {{ $labels.instance }})
          description: "The PostgreSQL replication lag is high (> 5s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

255
nixos/hosts/monitoring/provisioning/alerts/prometheus-alerts.yml
Normal file

@@ -0,0 +1,255 @@
groups:
  - name: Prometheus

    rules:
      - alert: PrometheusJobMissing
        expr: 'absent(up{job="prometheus"})'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus job missing (instance {{ $labels.instance }})
          description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetMissing
        expr: "up == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing (instance {{ $labels.instance }})
          description: "A Prometheus target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAllTargetsMissing
        expr: "sum by (job) (up) == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus all targets missing (instance {{ $labels.instance }})
          description: "A Prometheus job no longer has any living targets.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetMissingWithWarmupTime
        expr: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
          description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusConfigurationReloadFailure
        expr: "prometheus_config_last_reload_successful != 1"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
          description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTooManyRestarts
        expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus too many restarts (instance {{ $labels.instance }})
          description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerJobMissing
        expr: 'absent(up{job="alertmanager"})'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
          description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerConfigurationReloadFailure
        expr: "alertmanager_config_last_reload_successful != 1"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
          description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerConfigNotSynced
        expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
          description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerE2eDeadManSwitch
        expr: "vector(1)"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
          description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusNotConnectedToAlertmanager
        expr: "prometheus_notifications_alertmanagers_discovered < 1"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
          description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusRuleEvaluationFailures
        expr: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTemplateTextExpansionFailures
        expr: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusRuleEvaluationSlow
        expr: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
          description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates slower storage backend access or a too-complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusNotificationsBacklog
        expr: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus notifications backlog (instance {{ $labels.instance }})
          description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerNotificationFailing
        expr: "rate(alertmanager_notifications_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
          description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetEmpty
        expr: "prometheus_sd_discovered_targets == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target empty (instance {{ $labels.instance }})
          description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetScrapingSlow
        expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
          description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusLargeScrape
        expr: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus large scrape (instance {{ $labels.instance }})
          description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetScrapeDuplicate
        expr: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
          description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCheckpointCreationFailures
        expr: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCheckpointDeletionFailures
        expr: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCompactionsFailed
        expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB compaction failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbHeadTruncationsFailed
        expr: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbReloadFailures
        expr: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbWalCorruptions
        expr: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbWalTruncationsFailed
        expr: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTimeseriesCardinality
        expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
          description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -0,0 +1,21 @@
groups:
  - name: Promtail

    rules:
      - alert: PromtailRequestErrors
        expr: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Promtail request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PromtailRequestLatency
        expr: "histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1"
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Promtail request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -0,0 +1,30 @@
groups:
  - name: Traefik

    rules:
      - alert: TraefikServiceDown
        expr: "count(traefik_service_server_up) by (service) == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Traefik service down (instance {{ $labels.instance }})
          description: "All Traefik services are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: TraefikHighHttp4xxErrorRateService
        expr: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Traefik high HTTP 4xx error rate service (instance {{ $labels.instance }})
          description: "Traefik service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: TraefikHighHttp5xxErrorRateService
        expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
          description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@@ -0,0 +1,37 @@
{{ define "telegram.markdown.message" }}
|
||||||
|
{{- if eq .Status "firing" -}}
|
||||||
|
{{- if eq .CommonLabels.severity "critical" -}}
|
||||||
|
🔴 Alert: {{ .CommonLabels.alertname }}
|
||||||
|
{{- else if eq .CommonLabels.severity "warning" -}}
|
||||||
|
🟠 Alert: {{ .CommonLabels.alertname }}
|
||||||
|
{{- else -}}
|
||||||
|
⚪️ Alert: {{ .CommonLabels.alertname }}
|
||||||
|
{{- end }}
|
||||||
|
Status: 🔥 FIRING
|
||||||
|
Severity: {{ if eq .CommonLabels.severity "critical" }}🔴 {{ .CommonLabels.severity | title }}{{ else if eq .CommonLabels.severity "warning" }}🟠 {{ .CommonLabels.severity | title }}{{ else }}⚪️ {{ .CommonLabels.severity | title }}{{ end }}
|
||||||
|
{{- else if eq .Status "resolved" -}}
|
||||||
|
⚪️ Alert: {{ .CommonLabels.alertname }}
|
||||||
|
Status: ✅ RESOLVED
|
||||||
|
Severity: {{ if eq .CommonLabels.severity "critical" }}🟢 {{ .CommonLabels.severity | title }}{{ else if eq .CommonLabels.severity "warning" }}🟢 {{ .CommonLabels.severity | title }}{{ else }}⚪️ {{ .CommonLabels.severity | title }}{{ end }}
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{- range .Alerts -}}
|
||||||
|
|
||||||
|
{{- if .Labels.job }}
|
||||||
|
Job: `{{ .Labels.job }}`
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{- if .Labels.namespace }}
|
||||||
|
Namespace: `{{ .Labels.namespace }}`
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{- if .Labels.instance }}
|
||||||
|
Instance: `{{ .Labels.instance }}`
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{- if .Annotations.runbook_url }}
|
||||||
|
[RunbookURL]({{ .Annotations.runbook_url }})
|
||||||
|
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{ end }}

28
nixos/hosts/monitoring/provisioning/templates/telegram.tmpl
Normal file

@@ -0,0 +1,28 @@
{{ define "alert_list" }}{{ range . }}
|
||||||
|
---
|
||||||
|
🪪 <b>{{ .Labels.alertname }}</b>
|
||||||
|
{{- if eq .Labels.severity "critical" }}
|
||||||
|
🚨 CRITICAL 🚨 {{ end }}
|
||||||
|
{{- if eq .Labels.severity "warning" }}
|
||||||
|
⚠️ WARNING ⚠️{{ end }}
|
||||||
|
{{- if .Annotations.summary }}
|
||||||
|
📝 {{ .Annotations.summary }}{{ end }}
|
||||||
|
{{- if .Annotations.description }}
|
||||||
|
📖 {{ .Annotations.description }}{{ end }}
|
||||||
|
|
||||||
|
🏷 Labels:
|
||||||
|
{{ range .Labels.SortedPairs }} <i>{{ .Name }}</i>: <code>{{ .Value }}</code>
|
||||||
|
{{ end }}{{ end }}
|
||||||
|
🛠 <a href="https://grafana.prod.global:3000">Grafana</a> 💊 <a href="https://alertmanager.prod.global:9093">Alertmanager</a> 💊 <a href="https://">Any other link</a> 🛠
|
||||||
|
{{ end }}
|
||||||
|
|
||||||
|
{{ define "telegram.message" }}
|
||||||
|
{{ if gt (len .Alerts.Firing) 0 }}
|
||||||
|
🔥 Alerts Firing 🔥
|
||||||
|
{{ template "alert_list" .Alerts.Firing }}
|
||||||
|
{{ end }}
|
||||||
|
{{ if gt (len .Alerts.Resolved) 0 }}
|
||||||
|
✅ Alerts Resolved ✅
|
||||||
|
{{ template "alert_list" .Alerts.Resolved }}
|
||||||
|
{{ end }}
|
||||||
|
{{ end }}

7
nixos/hosts/monitoring/sops.nix
Normal file

@@ -0,0 +1,7 @@
{ config, lib, ... }:
{
  sops.secrets."telegram-alert-bot-token" = {
    sopsFile = ../../secrets/telegram/secrets.yml;
    owner = "prometheus";
  };
}

@@ -1,20 +1,6 @@
-{ config, lib, pkgs, ... }: {
-  networking = {
-    hostName = "sandbox";
-    interfaces.eth0 = {
-      ipv4.addresses = [{
-        address = "192.168.1.148";
-        prefixLength = 24;
-      }];
-      ipv6.addresses = [{
-        address = "fe80::148";
-        prefixLength = 64;
-      }];
-    };
-    defaultGateway = {
-      address = "192.168.1.1";
-      interface = "eth0";
-    };
-  };
-}
+{ config, lib, pkgs, ... }:
+{
+  networking.hostName = "sandbox";
+  networking.interfaces.ens18.useDHCP = true;
+  networking.defaultGateway = "192.168.1.1";
+}

10
nixos/hosts/traefik/configuration/middlewares.nix
Normal file

@@ -0,0 +1,10 @@
{ lib, config, ... }:

let
  internalNetwork = "192.168.1.0/24";
in
{
  internal-whitelist = {
    ipWhiteList.sourceRange = [ internalNetwork ];
  };
}

140
nixos/hosts/traefik/configuration/routers.nix
Normal file

@@ -0,0 +1,140 @@
{ lib, config, ... }:

{
  traefik = {
    rule = "Host(`traefik.procopius.dk`)";
    service = "traefik";
    entryPoints = [ "websecure" ];
    middlewares = [ "internal-whitelist" ];
    tls = { certResolver = "letsencrypt"; };
  };

  proxmox = {
    rule = "Host(`proxmox.procopius.dk`)";
    service = "proxmox";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  forgejo = {
    rule = "Host(`git.procopius.dk`)";
    service = "forgejo";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  prometheus = {
    rule = "Host(`prometheus.procopius.dk`)";
    service = "prometheus";
    entryPoints = [ "websecure" ];
    middlewares = [ "internal-whitelist" ];
    tls = { certResolver = "letsencrypt"; };
  };

  grafana = {
    rule = "Host(`grafana.procopius.dk`)";
    service = "grafana";
    entryPoints = [ "websecure" ];
    middlewares = [ "internal-whitelist" ];
    tls = { certResolver = "letsencrypt"; };
  };

  alertmanager = {
    rule = "Host(`alertmanager.procopius.dk`)";
    service = "alertmanager";
    entryPoints = [ "websecure" ];
    middlewares = [ "internal-whitelist" ];
    tls = { certResolver = "letsencrypt"; };
  };

  jellyfin = {
    rule = "Host(`jellyfin.procopius.dk`)";
    service = "jellyfin";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  sonarr = {
    rule = "Host(`sonarr.procopius.dk`)";
    service = "sonarr";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  radarr = {
    rule = "Host(`radarr.procopius.dk`)";
    service = "radarr";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  ente = {
    rule = "Host(`ente.procopius.dk`)";
    service = "ente";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  photos = {
    rule = "Host(`photos.procopius.dk`)";
    service = "photos";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  minio = {
    rule = "Host(`minio.procopius.dk`)";
    service = "minio";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  minio-api = {
    rule = "Host(`minio-api.procopius.dk`)";
    service = "minio-api";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  account = {
    rule = "Host(`account.procopius.dk`)";
    service = "account";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  auth = {
    rule = "Host(`auth.procopius.dk`)";
    service = "auth";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  nas = {
    rule = "Host(`nas.procopius.dk`)";
    service = "nas";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  umami = {
    rule = "Host(`umami.procopius.dk`)";
    service = "umami";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  mesterjakob = {
    rule = "Host(`mester.jakobblum.dk`)";
    service = "mesterjakob";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };

  catchAll = {
    rule = "HostRegexp(`.+`)";
    service = "nginx";
    entryPoints = [ "websecure" ];
    tls = { certResolver = "letsencrypt"; };
  };
}

38
nixos/hosts/traefik/configuration/services.nix
Normal file

@@ -0,0 +1,38 @@
{ lib, config, ... }:

{
  proxmox.loadBalancer.servers = [ { url = "https://192.168.1.205:8006"; } ];
  proxmox.loadBalancer.serversTransport = "insecureTransport";

  traefik.loadBalancer.servers = [ { url = "http://localhost:8080"; } ];

  forgejo.loadBalancer.servers = [ { url = "http://forgejo.lab:3000"; } ];

  nginx.loadBalancer.servers = [ { url = "https://192.168.1.226:4433"; } ];
  nginx.loadBalancer.serversTransport = "insecureTransport";

  prometheus.loadBalancer.servers = [ { url = "http://monitor.lab:9090"; } ];
  grafana.loadBalancer.servers = [ { url = "http://monitor.lab:3000"; } ];
  alertmanager.loadBalancer.servers = [ { url = "http://monitor.lab:9093"; } ];

  # from nginx
  account.loadBalancer.servers = [ { url = "http://192.168.1.226:3001"; } ];
  auth.loadBalancer.servers = [ { url = "http://192.168.1.226:3005"; } ];
  ente.loadBalancer.servers = [ { url = "http://192.168.1.226:8087"; } ];
  photos.loadBalancer.servers = [ { url = "http://192.168.1.226:3000"; } ];
  minio.loadBalancer.servers = [ { url = "http://192.168.1.226:3201"; } ];
  minio-api.loadBalancer.servers = [ { url = "http://192.168.1.226:3200"; } ];

  nas.loadBalancer.servers = [ { url = "https://192.168.1.226:5001"; } ];
  nas.loadBalancer.serversTransport = "insecureTransport";

  jellyfin.loadBalancer.servers = [ { url = "http://192.168.1.226:8096"; } ];
  radarr.loadBalancer.servers = [ { url = "http://192.168.1.226:7878"; } ];
  sonarr.loadBalancer.servers = [ { url = "http://192.168.1.226:8989"; } ];

  umami.loadBalancer.servers = [ { url = "http://192.168.1.226:3333"; } ];

  mesterjakob.loadBalancer.servers = [ { url = "http://192.168.1.226:4200"; } ];
}

61
nixos/hosts/traefik/configuration/static.nix
Normal file

@@ -0,0 +1,61 @@
{ lib, config, ... }:

{
  entryPoints = {
    web = {
      address = ":80";
      asDefault = true;
      http.redirections.entrypoint = {
        to = "websecure";
        scheme = "https";
      };
    };

    websecure = {
      address = ":443";
      http.tls.certResolver = "letsencrypt";
    };

    metrics = {
      address = ":8082";
    };
  };

  api = {
    dashboard = true;
    insecure = true;
  };

  certificatesResolvers = {
    letsencrypt = {
      acme = {
        email = "david.mikael@proton.me";
        storage = "/var/lib/traefik/acme.json";
        # httpChallenge = {
        #   entryPoint = "web";
        # };
        dnsChallenge = {
          provider = "cloudflare";
          delayBeforeCheck = 10;
          resolvers = [ "1.1.1.1:53" "8.8.8.8:53" ];
        };
      };
    };
  };

  metrics = {
    prometheus = {
      entryPoint = "metrics";
    };
  };

  log = {
    level = "DEBUG";
    filePath = "/var/log/traefik/traefik.log";
  };

  accessLog = {
    format = "json";
    filePath = "/var/log/traefik/access.log";
  };
}

@@ -1,18 +1,13 @@
 { config, lib, pkgs, ... }: {
-  networking = {
-    hostName = "traefik";
-    interfaces.eth0 = {
-      ipv4.addresses = [{
-        address = "192.168.1.171";
-        prefixLength = 24;
-      }];
-    };
-    firewall.allowedTCPPorts = [ 80 443 8080 8082 ];
-
-    defaultGateway = {
-      address = "192.168.1.1";
-      interface = "eth0";
-    };
-  };
+
+  networking.hostName = "traefik";
+  networking.interfaces.eth0.ipv4.addresses = [{
+    address = "192.168.1.80";
+    prefixLength = 24;
+  }];
+
+  networking.firewall.allowedTCPPorts = [ 80 443 8080 8082 ];
+
+  networking.nameservers = [ "192.168.1.53" ];
+  networking.defaultGateway = "192.168.1.1";
 }

@@ -14,10 +14,10 @@
     {
       targets = [ "localhost" ];
       labels = {
-        job = "traefik";
+        job = "/var/log/traefik/*.log";
         host = config.networking.hostName;
         env = "proxmox";
-        instance = "${config.networking.hostName}.local"; # prometheus scrape target
+        instance = "${config.networking.hostName}.lab"; # prometheus scrape target
         __path__ = "/var/log/traefik/*.log";
       };
     }

@@ -1,158 +1,36 @@
-{ config, lib, pkgs, ... }: {
+{ config, lib, pkgs, ... }:
 
-  # Traefik reverse proxy setup
+let
+  staticConfig = import ./configuration/static.nix { inherit lib config; };
+  middlewaresConfig = import ./configuration/middlewares.nix { inherit lib config; };
+  routersConfig = import ./configuration/routers.nix { inherit lib config; };
+  servicesConfig = import ./configuration/services.nix { inherit lib config; };
+in
+{
   services.traefik = {
     enable = true;
 
-    staticConfigOptions = {
-      entryPoints = {
-        web = {
-          address = ":80";
-          asDefault = true;
-          http.redirections.entrypoint = {
-            to = "websecure";
-            scheme = "https";
-          };
-        };
-
-        websecure = {
-          address = ":443";
-          asDefault = true;
-          http.tls.certResolver = "letsencrypt";
-        };
-
-        metrics = {
-          address = ":8082";
-        };
-      };
-
-      api.dashboard = true;
-      api.insecure = true;
-
-      # Enable Let's Encrypt
-      certificatesResolvers = {
-        letsencrypt = {
-          acme = {
-            email = "david.mikael@proton.me"; # Replace with your email
-            storage = "/var/lib/traefik/acme.json"; # Location to store ACME certificates
-            httpChallenge = {
-              entryPoint = "web"; # Uses HTTP challenge (can also use DNS)
-            };
-            # Uncomment the following for staging (testing) environment
-            # caServer = "https://acme-staging-v02.api.letsencrypt.org/directory";
-          };
-        };
-      };
-
-      # Enable Prometheus metrics
-      metrics = {
-        prometheus = {
-          entryPoint = "metrics";
-        };
-      };
-      log = {
-        level = "DEBUG";
-        filePath = "/var/log/traefik/traefik.log";
-      };
-
-      accessLog = {
-        format = "json";
-        filePath = "/var/log/traefik/access.log";
-      };
-
-      # Enable access logs (you can customize the log format)
-      # accessLog = {
-      #   filePath = "/var/log/traefik/access.log"; # Log to a file
-      #   format = "common"; # You can adjust this to `json` or `common`
-      # };
-      # tracing = {
-      #   enabled = true;
-      #   provider = "jaeger"; # or zipkin, or other
-      #   jaeger = {
-      #     apiURL = "http://localhost:5775"; # Replace with your Jaeger instance URL
-      #   };
-      # };
-    };
-
-    dynamicConfigOptions = {
-      # Add IP whitelisting middleware to restrict access to internal network only
-      http.middlewares = {
-        internal-whitelist = {
-          ipWhiteList = {
-            sourceRange = ["192.168.1.0/24"]; # Adjust to your internal network range
-            # Alternatively use `127.0.0.1/32` for localhost access
-          };
-        };
-      };
-
-      # Route to Proxmox UI
-      http.routers.proxmox = {
-        rule = "Host(`proxmox.procopius.dk`)";
-        service = "proxmox";
-        entryPoints = [ "web" "websecure" ];
-        tls = {
-          certResolver = "letsencrypt"; # Use Let's Encrypt
-        };
-      };
-      # Route to Traefik Dashboard
-      http.routers.traefik = {
-        rule = "Host(`traefik.procopius.dk`)";
-        service = "traefik";
-        entryPoints = [ "web" "websecure" ];
-        middlewares = ["internal-whitelist"];
-        tls = {
-          certResolver = "letsencrypt"; # Use Let's Encrypt
-        };
-      };
-
-      http.routers.forgejo = {
-        rule = "Host(`git.procopius.dk`)";
-        service = "forgejo";
-        entryPoints = [ "web" "websecure" ];
-        tls = {
-          certResolver = "letsencrypt"; # Use Let's Encrypt
-        };
-      };
-
-      # Route to Traefik Dashboard
-      http.routers.catchAll = {
-        # rule = "Host(`jellyfin.procopius.dk`)";
-        rule = "HostRegexp(`.+`)";
-        # rule = "HostRegexp(`{host:.+}`)";
-        service = "nginx";
-        entryPoints = [ "web" "websecure" ];
-        tls = {
-          certResolver = "letsencrypt"; # Use Let's Encrypt
-        };
-      };
-
-      # Define the services
-      http.services.proxmox.loadBalancer.servers = [
-        { url = "https://192.168.1.205:8006"; } # Proxmox
-      ];
-      http.services.proxmox.loadBalancer.serversTransport = "insecureTransport";
-
-      http.services.traefik.loadBalancer.servers = [
-        { url = "http://traefik.local:8080"; } # Traefik Dashboard
-      ];
-
-      http.services.forgejo.loadBalancer.servers = [
-        { url = "http://192.168.1.249:3000"; } # forgejo
-      ];
-
-      http.services.nginx.loadBalancer.servers = [
-        { url = "https://192.168.1.226:4433"; } # nginx
-      ];
-      http.services.nginx.loadBalancer.serversTransport = "insecureTransport";
-
-      http.serversTransports.insecureTransport.insecureSkipVerify = true;
-    };
-  };
+    # ==== Static Configuration ====
+    staticConfigOptions = staticConfig;
 
-  # Optionally, you can add Docker support if using Docker Compose
+    # ==== Dynamic Configuration ====
+    dynamicConfigOptions.http = {
+      routers = routersConfig;
+      services = servicesConfig;
+      middlewares = middlewaresConfig;
+
+      serversTransports = {
+        insecureTransport = {
+          insecureSkipVerify = true;
+        };
+      };
+    };
+  };
+
+  systemd.services.traefik.serviceConfig.Environment = [
+    "CLOUDFLARE_DNS_API_TOKEN=gQYyG6cRw-emp_qpsUj9TrkYgoVC1v9UUtv94ozA"
+    "CLOUDFLARE_ZONE_API_TOKEN=gQYyG6cRw-emp_qpsUj9TrkYgoVC1v9UUtv94ozA"
+  ];
+
   virtualisation.docker.enable = true;
 }

127
nixos/hosts/vpn/README.md
Normal file

@@ -0,0 +1,127 @@
Great question — and you're absolutely right to ask.
|
||||||
|
|
||||||
|
You **don’t need** Innernet or Headscale+patch *if* you're comfortable building a small self-hosted portal that handles:
|
||||||
|
|
||||||
|
* ✅ OIDC login
|
||||||
|
* ✅ WireGuard peer key management
|
||||||
|
* ✅ Config generation and download
|
||||||
|
|
||||||
|
So let’s break it down:

---

## 🧩 Why *Innernet* and *Headscale+patch* Exist

Those tools solve **user/device coordination**, dynamic routing, and access control *automatically*, **with batteries included**. They're great if you want a more managed, “plug-and-play” experience. Specifically:

| Tool                     | Purpose                                                            |
| ------------------------ | ------------------------------------------------------------------ |
| **Innernet**             | Full WireGuard-based mesh + OIDC login + per-device management     |
| **Headscale**            | Tailscale backend, enables user login + ACL + device coordination  |
| **Headscale+OIDC patch** | Adds OIDC login to Headscale (by default, only pre-shared keys)    |

These tools do things like:

* Assign IPs/DNS names to users
* Regenerate keys
* Handle coordination between peers
* Gate access with identity

---

## 🛠️ Why You *Don’t* Need Them (Your Custom Portal Idea)

If you're fine with:

* Managing the WireGuard server and IPs manually
* Writing a small web service to issue `.conf` files after login
* Using firewall rules for access control

then you absolutely **can** build a custom system around **WireGuard + OIDC**, like this:

---

### 🧱 Architecture: WireGuard + OIDC + Config Generator

```
      ┌──────────────┐
      │  Your OIDC   │
      │   provider   │◄────── Google, Authentik, Keycloak
      └─────┬────────┘
            │
       (OIDC Login)
            │
   ┌────────▼────────┐
   │   Web Portal    │◄────── nginx + Go, Node.js, Flask, etc.
   │  (self-hosted)  │
   └──┬────────────┬─┘
Gen Keys │        │ Serve config
& register        └─────> `client.conf` / QR code
      ▼
┌──────────────────────┐
│  WireGuard server    │ (NixOS)
│  - Handles all peers │
└──────────────────────┘
```

---

### 🔐 What Your Portal Needs to Do

* Authenticate via OIDC (OAuth2 flow)
* Store a list of “authorized users”
* When a user logs in:

  * Check if they already have a WireGuard peer entry
  * If not: generate a keypair and assign an IP (e.g. `10.100.0.5`)
  * Append the peer to the server config (`/etc/wireguard/wg0.conf`) or use `wg set`
  * Return a config file or QR code for the mobile client
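
On the NixOS side, the portal itself can run as an ordinary systemd service. A minimal sketch, assuming a hypothetical `vpn-portal` package; the binary, its flags, and the port are placeholders, not anything that exists in this repo:

```nix
{ pkgs, ... }:
{
  systemd.services.vpn-portal = {
    description = "WireGuard config portal with OIDC login";
    wantedBy = [ "multi-user.target" ];
    wants = [ "network-online.target" ];
    after = [ "network-online.target" ];
    serviceConfig = {
      # Hypothetical package; substitute your own derivation.
      ExecStart = "${pkgs.vpn-portal}/bin/vpn-portal --listen 127.0.0.1:8081";
      DynamicUser = true;
      # Needed so the portal can register peers on the live interface via `wg set`.
      AmbientCapabilities = [ "CAP_NET_ADMIN" ];
    };
  };
}
```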

---

## ✅ Benefits of This DIY Approach

* 💡 Full control: you own every step
* 🔐 OIDC is the gatekeeper (centrally managed access)
* ⚡ No extra daemon/software overhead (just WireGuard + your app)
* 🧩 Easily extensible (e.g., device limits, IP lease expiration)

---

## ⚙️ Tools You Can Use

| Purpose                | Tools                                                                       |
| ---------------------- | --------------------------------------------------------------------------- |
| OIDC Auth              | [Dex](https://github.com/dexidp/dex), Keycloak, Authentik, or Firebase      |
| Web Portal Backend     | Node.js (Express), Go, Flask, FastAPI                                       |
| Config Template Engine | Go templates, EJS, Jinja2                                                   |
| WireGuard Integration  | `wg`, `wg-quick`, or directly edit `wg0.conf`                               |
| NixOS Support          | Use `networking.wireguard.interfaces.wg0.peers` to declaratively add peers  |
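
For the last row, a minimal sketch of the declarative route, assuming the server owns `10.100.0.1/24`; the keys, client IP, and key-file path are placeholders:

```nix
{
  networking.wireguard.interfaces.wg0 = {
    ips = [ "10.100.0.1/24" ];
    listenPort = 51820;
    privateKeyFile = "/etc/wireguard/private.key";

    peers = [
      {
        # One entry per user/device issued by the portal.
        publicKey = "CLIENT_PUBLIC_KEY_BASE64";
        allowedIPs = [ "10.100.0.5/32" ];
      }
    ];
  };

  # WireGuard listens on UDP.
  networking.firewall.allowedUDPPorts = [ 51820 ];
}
```

The trade-off: peers added this way only change on a rebuild, so a portal that issues configs on demand would pair this with `wg set` against the live interface.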

---

## 🧪 Bonus Features You Could Build

* 🔐 Per-user firewall rules
* 🔁 One-time config URLs (`vpn.mydomain.com/invite/xyz`)
* 🔐 Optional Authelia/Keycloak MFA
* 📉 Dashboard with currently active peers (use `wg show`)

---

## ✨ Summary

| Option            | You Need Innernet / Headscale? | Login via OIDC? | DNS Routing? | Complexity |
| ----------------- | ------------------------------ | --------------- | ------------ | ---------- |
| Your own portal   | ❌ No                          | ✅ Yes          | ❌ Manual    | 🟡 Medium  |
| Innernet          | ✅ Yes                         | ✅ Yes          | ✅ Built-in  | 🟠 Medium  |
| Headscale + patch | ✅ Yes                         | ✅ Yes          | ✅ Built-in  | 🔴 High    |

The custom VPN portal keeps the moving parts minimal while still gating access behind identity. Natural follow-ups:

* A sample architecture repo
* A NixOS module to support peer configs
* The login + config generator backend (a Nix flake plus an OIDC portal template makes a good starting point)

@@ -20,7 +20,7 @@ in

      filename = "/var/lib/promtail/positions.yaml";
    };
    clients = [{
-     url = "http://monitor.local:3100/loki/api/v1/push";
+     url = "http://monitor.lab:3100/loki/api/v1/push";
    }];
    scrape_configs = [{
      job_name = "journal";

@@ -30,13 +30,14 @@ in

          job = "promtail";
          host = config.networking.hostName;
          env = "proxmox";
-         instance = "${config.networking.hostName}.local";
+         instance = "${config.networking.hostName}.lab";
        };
      };
      relabel_configs = [{
        source_labels = ["__journal__systemd_unit"];
        target_label = "unit";
      }];
    }];
  };
};

59  nixos/secrets/HOWTO.md  Normal file

@@ -0,0 +1,59 @@

### 🔧 Using Secrets in NixOS Configurations

You can use decrypted SOPS secrets in your `configuration.nix`, service modules, and flake-based setups.

#### 🔑 1. Use as environment variable (e.g. password)

```nix
systemd.services.my-service.serviceConfig.EnvironmentFile =
  config.sops.secrets."my-password".path;
```

> Your `secrets.yaml` should contain:
>
> ```yaml
> my-password: PASSWORD=supersecret
> ```
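
The secret itself is declared under `sops.secrets`; a minimal sketch (the names and file path are placeholders) showing the ownership and mode sops-nix applies when it writes the decrypted file:

```nix
{
  sops.secrets."my-password" = {
    sopsFile = ./secrets.yaml; # encrypted file holding this key
    owner = "my-service";      # user allowed to read the decrypted file
    mode = "0400";             # owner read-only
  };
}
```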

---

#### 🗂 2. Use as file source (e.g. private key or token)

```nix
environment.etc."ssh/id_ed25519".source =
  config.sops.secrets."ssh-private-key".path;
```

> This places the decrypted secret at `/etc/ssh/id_ed25519` with appropriate permissions.

---

#### 👤 3. Read a secret value directly (not recommended for sensitive data)

Note that `builtins.readFile` runs at evaluation time, so the decrypted file must already exist on the evaluating machine and the value ends up in the world-readable Nix store; prefer the runtime options above for anything sensitive.

```nix
# Use a secret as a string value in a setting
services.myapp.settings.apiKey = builtins.readFile config.sops.secrets."api-key".path;
```
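
If the value must land inside a rendered config file, sops-nix's template mechanism avoids the evaluation-time read; a sketch, with hypothetical file and key names:

```nix
{ config, ... }:
{
  # Rendered at activation time; the placeholder is substituted with the
  # decrypted value without it ever entering the Nix store.
  # Assumes sops.secrets."api-key" is declared as shown above.
  sops.templates."myapp.conf" = {
    content = ''
      api_key = ${config.sops.placeholder."api-key"}
    '';
    owner = "myapp";
  };
  # Point the service at config.sops.templates."myapp.conf".path.
}
```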

---

#### 🛠 4. Use in systemd preStart scripts

```nix
systemd.services.my-service.preStart = ''
  export PASSWORD=$(<${config.sops.secrets."my-password".path})
  ./myapp --auth "$PASSWORD"
'';
```

---

#### 🧠 5. Use in Forgejo user creation

```nix
systemd.services.forgejo.preStart = ''
  ${lib.getExe cfg.package} admin user create \
    --username admin \
    --password "$(tr -d '\n' < ${config.sops.secrets."admin-password".path})"
'';
```

@@ -1,51 +1,104 @@

Removed:

🔑 2. Generate an age Keypair

age-keygen -o secrets/age.key

This will output something like:

# created: 2025-06-02T22:00:00Z
# public key: age1abcdefghijk...

Copy that public key somewhere — you’ll need it for encrypting.

✅ You should now have:

secrets/
├── age.key          # keep this safe and private!

📝 3. Create Encrypted Secrets File

sops --age age1abcdefghijk... secrets/secrets.yaml

This opens a YAML file in your $EDITOR. Add secrets like:

forgejo-admin-password: "my-super-secret-password"

Save and close the file — it’s now encrypted using the public key.

✅ Now you should have:

secrets/
├── age.key
├── secrets.yaml     # encrypted file (safe to commit)

You can commit secrets.yaml, but do not commit age.key unless you're OK with putting it on a VM.

🧪 Test Decryption Locally

export SOPS_AGE_KEY_FILE=secrets/age.key

To test:

sops -d secrets/secrets.yaml

To edit:

sops secrets/secrets.yaml

sudo chmod 400 /etc/sops/age.key && sudo chown root:root /etc/sops/age.key

Added:

# 🔐 Secrets Management (with SOPS + Nix)

This directory contains encrypted secrets used across the infrastructure managed by NixOS and [sops-nix](https://github.com/Mic92/sops-nix). Secrets are stored using [SOPS](https://github.com/mozilla/sops) and encrypted with an `age` key located on each host at `/etc/sops/age.key`.

---

## 📁 Directory Structure

```
secrets/
├── forgejo/
│   └── secrets.yml   # Forgejo-specific secrets (admin password, DB password, secret key)
├── runner/
│   └── secrets.yml   # Forgejo runner secrets (tokens, etc.)
├── shared/
│   └── secrets.yml   # Shared secrets used across multiple VMs (SSH keys, tokens)
```

---

## 🛠 SOPS Basics

### ✅ Encrypt a **new secret file**

```bash
sops --age <YOUR-AGE-PUBKEY> secrets/myservice/secrets.yml
```

Example, deriving the recipient from your private key file:

```bash
sops --age $(age-keygen -y ~/.config/sops/age/keys.txt) secrets/forgejo/secrets.yml
```

> Press `i` to enter edit mode if prompted, or fill it using YAML format:

```yaml
admin-password: hunter2
db-password: supersecret
```

---

### ✏️ Edit secrets in an existing file

```bash
sops secrets/forgejo/secrets.yml
```

---

## 🧬 Using Secrets in Nix

### 🧩 Option 1: Reference shared secrets (via `defaultSopsFile`)

```nix
# shared-sops.nix
{
  sops = {
    age.keyFile = "/etc/sops/age.key";
    defaultSopsFile = ./shared/secrets.yml;

    secrets = {
      "monitoring-token".owner = "prometheus";
    };
  };
}
```

Then in services:

```nix
environment.etc."monitoring/token".source = config.sops.secrets."monitoring-token".path;
```

---

### 🧩 Option 2: Reference per-service secrets with explicit `sopsFile`

```nix
# forgejo/sops.nix
{
  sops.secrets = {
    "admin-password" = {
      sopsFile = ./../secrets/forgejo/secrets.yml;
      owner = "forgejo";
    };
  };
}
```
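
If a service should pick up a rotated secret automatically, sops-nix can also restart its unit when the decrypted value changes; a small sketch (the unit name is an assumption):

```nix
{
  sops.secrets."admin-password" = {
    sopsFile = ./../secrets/forgejo/secrets.yml;
    owner = "forgejo";
    restartUnits = [ "forgejo.service" ]; # restarted when the secret changes
  };
}
```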

---

## 🧪 Testing secrets setup

Check which secrets will be applied:

```bash
nixos-rebuild dry-activate --flake .#my-hostname
```

---

## 📚 Resources

* [sops-nix](https://github.com/Mic92/sops-nix)
* [Mozilla SOPS](https://github.com/mozilla/sops)
* [age encryption](https://github.com/FiloSottile/age)

16  nixos/secrets/forgejo/runner-secrets.yml  Normal file

@@ -0,0 +1,16 @@

forgejo-runner-registration-token: ENC[AES256_GCM,data:ms0Ouy5GP6rlwkiLXoq31ZPSi9bpDKpNOqzEFATHLHflt+YTIjWuPAVRvKEIEQ==,iv:z2snOwdGq3e7Mxl+CmnoOh8c+ZaA+6lNDdXh2vVLULM=,tag:5ZpELR8K5JBQraMBYdXSuA==,type:str]
sops:
    age:
        - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
          enc: |
            -----BEGIN AGE ENCRYPTED FILE-----
            YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSA0d3kzWXd2RElqdnViNGJG
            NHd5bER1S2dVQUpxOER6Mi9TYlVuOHFhVzNNCnNWZVJzdU1LSG4yR3BNdmFEVzA2
            bFNzK2cxNG9OcTB6NC8wdDAxcCtDekkKLS0tIExQdWNJQnBmb05RMktoeXF0dDZC
            M3FyUEswckYrUDdvdmdUYnBqaTZFcncK8aNh8jL8nzYv2vWwhxX4QPed1pjFr2zK
            9znxO+osZsUNIXySioLBfsA1kfqZCzaASsM2ezfWHKt1nCVQAvbXGA==
            -----END AGE ENCRYPTED FILE-----
    lastmodified: "2025-06-06T21:10:54Z"
    mac: ENC[AES256_GCM,data:cDk2zKgxX01y/X9eQCbLm6OW74nE9HJdtliE6iye3gsDKbM+SqCuU1JTBvEcOAeROLn4svJmlRe3DDTGhrnuNO8tL8qLXKt2oQ0CM+A/3kXBb/jG13ps57fEpD32u/QbK6smVDS0Li+TCHEtfqiLyVat42lgyy9kakgjOll//K0=,iv:K8ly08WGyHLpk07oUwaO7ygEqcriJ3Uq1Ev/FtUcfiY=,tag:8VRCalipvZv0DAOAu9tSlg==,type:str]
    unencrypted_suffix: _unencrypted
    version: 3.10.2
18  nixos/secrets/forgejo/secrets.yml  Normal file

@@ -0,0 +1,18 @@

forgejo-admin-password: ENC[AES256_GCM,data:S05b/J9AK2SuIKDSWmtRf72C7V5FwMgZv/o5yxzNXRZEH2eIm18sC6+FEg==,iv:Ig/c4K9Io0S07Ywl4JQtbfxhjXJ7Rvea7+N4KhLUqjc=,tag:rx44tRuAbERBZR45QN6b9A==,type:str]
forgejo-db-password: ENC[AES256_GCM,data:5YwRl6HNa1LzJgr73ArllG9s+vWCS7m/s6QQh5YUz8I0anG7GQ==,iv:5ARq3unUy2xbDcAFkucvEhjz/QYC2rYgutEo4T2bw2E=,tag:k7eHKqeA7k6XzksLVcnXRw==,type:str]
forgejo-secret-key: ENC[AES256_GCM,data:iserDzOnJkM4HLP4c6rekSFANtRmEXwuCPyfMqo=,iv:3CNqN/DyS4PIl/iOO4JCpWJn3ARlb5KQSCNv5Orx2mo=,tag:q34jEpGrK2EKf0bcBznpQQ==,type:str]
sops:
    age:
        - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
          enc: |
            -----BEGIN AGE ENCRYPTED FILE-----
            YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBjeTdJNVExVjB2dzF0dTRu
            ZEV1RHlvd3VPNUZ1b0FsQW14bHJOUUM5Z1NjCmhudWRoUjd5a3dWSEhwK1dDd0hK
            N1JUUHhlOVFGVWxwalpvbXJVMlhtcGcKLS0tIFJmRjM4bnJ0TUIyWElaUUd3Y2Zq
            LzBHRWZXODVDZTE2WnVZOGNQckk4KzAKdm3xnA03JnQnc07yhVVtYkVYS6654Zm1
            4AcLRSCcWvWrvp26XYVE2UGqU7acfxrTsk07o0nHAQpa5LjgJ4oFKw==
            -----END AGE ENCRYPTED FILE-----
    lastmodified: "2025-06-06T18:38:08Z"
    mac: ENC[AES256_GCM,data:BvpIz6tfVSR3m1l7g4ilUyoTKKqirt+k6tPizxCsAgjztt0IyDCio+cLTln4P1tGSy/frjvbxy1mR3tIDkWn6aDFoYz/gnsbTKHSo/K5Q77jJ3uJffoB3/Wruigojl3EBIQHALicq9xhF8rsH/RKjpWqh+TrQwO+ibbA6ff76cw=,iv:Z0ZwJ9aPpI9MtbsZnvFkW7zsFFOMj5/Gv+tF/mal+yI=,tag:knf01NC/XwgjPUHH+8RpSg==,type:str]
    unencrypted_suffix: _unencrypted
    version: 3.10.2

@@ -1,27 +0,0 @@

forgejo-admin-password: ENC[AES256_GCM,data:cLC4JQC8PMF4/aeVBzOROupPLzd7TbYwvudr7yVx4YpLCGSmYXRwJQAoXg==,iv:tG2kL66ZshwZkJodZQ5K8SZKfG1eJYeX9eYsZ7yM7rA=,tag:0roW0M9eUmzejkH6pwN/IA==,type:str]
forgejo-db-password: ENC[AES256_GCM,data:0KZJHmNuxpO8TmLNuryipICPTjG9h56+II1Azk+v3fkE5MAb9g==,iv:zb14BvbC2OehCYATgMMoPXv742jjD4v0B12cVhNCWBw=,tag:pnrboj5IvwXYXaZJbZpxTQ==,type:str]
hello: ENC[AES256_GCM,data:XkOLnE2Mkunc0zNF1932jOuz1olAwWf56lkqL2dt+h99WoL/vNLfSQ0al8NfEA==,iv:WC2xbB9WmB/khOVjdClFerJ8kjtHjaR/p6rDYaaDZhY=,tag:tT92FNrRm74XoZxoFFXm5g==,type:str]
example_key: ENC[AES256_GCM,data:kBk87OXu+qfJjP/2EA==,iv:64WcHaVfQrVCouUCZoHk0z/4ii8U9m61/E9SqLeB3Ms=,tag:MZJ6m7m4+s6BNGhtNs+ZFQ==,type:str]
#ENC[AES256_GCM,data:lM4LNQNU2S66a73pUymyUA==,iv:pAHgR+ViSO3Ff2zSaZQcXNGb2r2KH+ZbRd33vpq8ncs=,tag:WTNQCjaESLXTXwcwZePU2A==,type:comment]
example_array:
    - ENC[AES256_GCM,data:Sc1q0Yd3sQ6eOzSwfQA=,iv:L4YBbWWeQZAYROHpiNEtHLDCdcuW+vvEpYhGxD0b62g=,tag:82L6MlHWIMpxKb4B3+Lszg==,type:str]
    - ENC[AES256_GCM,data:Ud9dpSAcHc8NOq48wQI=,iv:9ERTBUQqKHPUIG57KXbRPMXN37cx+WcxOCDxCWpbE1k=,tag:ftTGF/obIJVZSTodIGoABw==,type:str]
example_number: ENC[AES256_GCM,data:1Xvp578L4rjW6g==,iv:82z/MQM586y4WilPZgmisa2C7GTdG0vmIEkyx/aMCXw=,tag:UtNDNKbu0tuhSyu1OQiJJA==,type:float]
example_booleans:
    - ENC[AES256_GCM,data:RkxG/g==,iv:RNZpV/1KRWOazIuHj+SH7r3AmwnRBIUgXgfDplrk5X0=,tag:cKv0dVJGQcluscNspIrPgg==,type:bool]
    - ENC[AES256_GCM,data:PvghSeY=,iv:xPlMb1LMsg5gAWsCXT3UnMyOfQmSKDKdDrjt+n9+Nqs=,tag:B2aROAGdcupDmoOHAiXeTg==,type:bool]
sops:
    age:
        - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
          enc: |
            -----BEGIN AGE ENCRYPTED FILE-----
            YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBwVElvVXluZCsxK1BiT3c2
            Zm9kaURNdnZ2Nk9EM0dld2tjdFhrZlFiSEVnCk8zZVpWWlFXS3JYS0Q2WHExLzFU
            WkFwcDFmR3VrdHFmS2JmVC95TnZIMjQKLS0tIGsyVmp1Sm1uL3FKVWlERUZHdmVw
            TG9HYXdUdlZNYXJUZng2ejBwbjJoNVkK0ER6mqLdz0hEaovWME4p56tjuYbPIuhb
            X1smwLmHxgcRboeFU5dyp3wZKBg7ccRPneQKsgJvYb929BesynHr6g==
            -----END AGE ENCRYPTED FILE-----
    lastmodified: "2025-06-03T16:03:32Z"
    mac: ENC[AES256_GCM,data:mLCtH1EPm1cD7KD/fCVO0hrIfG6AOl396kcwdahyr326IRvTneT+6lr+f0XAHSkPXtRsmSCiD9WNhLYAh/kCfsP7tVPKl4X17OHkK9blUJ5JpuqnZJfOQ3PXNitYFvcSUUi1Y1/vIQmDf52oTPlcZgxmTgsQj4MEJIIni7d0SOc=,iv:MhAJ0QAdyHv8BzHIBQ/lZ7zV/MKjcsicbBOw9kwo7Nc=,tag:qrfTfCPxAMvXOm69BMWJ4g==,type:str]
    unencrypted_suffix: _unencrypted
    version: 3.10.2

11  nixos/secrets/shared-sops.nix  Normal file

@@ -0,0 +1,11 @@

{
  sops = {
    age.keyFile = "/etc/sops/age.key";
    defaultSopsFile = ./shared/secrets.yml;
    secrets = {
      # "monitoring-token".owner = "prometheus";
      # "ssh-private-key".owner = "root";
      # "ssh-public-key".owner = "root";
    };
  };
}
16  nixos/secrets/shared/secrets.yml  Normal file

@@ -0,0 +1,16 @@

test: ENC[AES256_GCM,data:a+pmog==,iv:3Ledge90oTzTM8uNFWWIgLafa7/Hhx9WzXRAS3flUZo=,tag:mfWiEWxkZVihuX3S3SY12w==,type:str]
sops:
    age:
        - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
          enc: |
            -----BEGIN AGE ENCRYPTED FILE-----
            YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBudWI1RXZBOGFoaGIwWWc2
            cXdpems4SmY1QjNhOWxHTVBhR1ZPYjdSZUcwClhHb1IzSTBmcFllbWl2UVpabGFo
            Sm1neTlUVVUwalZUSWVLaVNBUWhUSTAKLS0tIG5yVHBiWGMwMG1OTnBXckh4eXdv
            MU5UQ29lMmw0ZEJnV09IWEpWc2NYT2MKfl+3cZvgunrgGr7KhjGZOlJ0EKRAiAED
            pDGr25OcjQgpsg4/LCPPOMsi9Yyi/RICJGyDDINaTLMEQDhIsoOyUQ==
            -----END AGE ENCRYPTED FILE-----
    lastmodified: "2025-06-06T17:29:59Z"
    mac: ENC[AES256_GCM,data:nvb3Wc3578e45ob2FyyWlsadVOdErTfJ2Ni5jb06f/WbzDkyJd3lCBRTUIAdyXijT4ErtogHImBjYXzRuCi9xP68mTtaoQb6l8bULKJLdY/yDcMzMyKGZLDxTVW80nLvDrqs5piKBYFWtyFaAEio8fVlA4RIUsyFx/mgcbI3ChA=,iv:9/DmD48MKzBNGSODUr4jqDv17r2o4xgH7TVbpQeuyCU=,tag:1uVui9sVI9SfTlgtqPCLMA==,type:str]
    unencrypted_suffix: _unencrypted
    version: 3.10.2

@@ -1,8 +0,0 @@

{ config, lib, ... }:
{
  sops = {
    defaultSopsFile = ./secrets.yaml;
    age.keyFile = "/etc/sops/age.key";
    #secrets."forgejo-admin-password".owner = "forgejo";
  };
}

16  nixos/secrets/telegram/secrets.yml  Normal file

@@ -0,0 +1,16 @@

telegram-alert-bot-token: ENC[AES256_GCM,data:mM1aYhpcCecRUdwkdlBKA+dWOHZEwUvP+m4MIg4n89SzgY8GWw0z1OaIpxfR0w==,iv:tzmCjiYntDYpkO4S0a/tMQkfGQpZjLBiBu4Rs/5RHbc=,tag:5cZDEK474WzXwIW5Jc7S5w==,type:str]
sops:
    age:
        - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y
          enc: |
            -----BEGIN AGE ENCRYPTED FILE-----
            YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSA5RE82S05lbHV6aE9qZFhL
            MTZ4Y1R6cXFSZUFhZHdHbzZ5OWdrOXhwdEZ3CkxzM1NtSjNzeXM5byt1VzVtRHpH
            Tk9ORUtEZ05FMTgrYVNhU3dKRkFKaHMKLS0tIFlLeWJ1dmJsRWc5SkhDbjdEb0or
            UHl6emN0My8wcFZWYlZEaElrb2NidjgKlZols9SJQxgaoOdJJxghqlACBcwuFs94
            IGAOoQVUSFhMCWzyXqAQ/1/VkbWqfiUmvqDa3ulEK2Ri+1F+u3mB1Q==
            -----END AGE ENCRYPTED FILE-----
    lastmodified: "2025-06-06T21:21:32Z"
    mac: ENC[AES256_GCM,data:YS7BLFXkQ/A5PVLVOyMaqRHGavY0YttFps3njzSiYgBUa4VfPHqMcl2fW5vMec5MwM3GKPFGtrSEZKK1NVqLxUWZrfIF6ugAZ4vhRCyWe1Kze2Zs2S0ia2C3mUdhQR2wb7M7YzohI/e7PDZo0UcrcG3YeEzS5NL7qb0hzFsrGLY=,iv:kqzD06q5X0ZkZ1sIoUQz05b6QRDWQVsPqQYxPP2OAl8=,tag:eexvJspUxpDpwJqU1zEMnA==,type:str]
    unencrypted_suffix: _unencrypted
    version: 3.10.2