From 3b6f4545b4d72cfe62c4b41782f8d120aa2983ff Mon Sep 17 00:00:00 2001
From: DannyDannyDanny
Date: Sun, 10 May 2026 16:12:08 +0200
Subject: [PATCH] monitoring: prometheus + alertmanager + grafana on sunken-ship

node_exporter on all three hosts (port 9100, ZT-only). Prometheus server
scrapes via the clan ZT IPv6s. Alertmanager routes alerts to @HarakatBot
(chat 66070351); critical repeats every 1h, others 4h. Starter rule:
HostDown when up==0 for 5m.

Grafana on :3000 over ZT, provisioned with the local Prometheus as
default datasource.

Manual secrets on sunken-ship: /etc/alertmanager/telegram-token and
/etc/grafana/secret-key.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 flake-modules/clan.nix                   |   4 +
 flake-modules/nixos-modules.nix          |   2 +
 modules/monitoring-node-exporter.nix     |  12 ++
 modules/monitoring-prometheus-server.nix | 136 +++++++++++++++++++++++
 4 files changed, 154 insertions(+)
 create mode 100644 modules/monitoring-node-exporter.nix
 create mode 100644 modules/monitoring-prometheus-server.nix

diff --git a/flake-modules/clan.nix b/flake-modules/clan.nix
index f8b1293..6b4a5d0 100644
--- a/flake-modules/clan.nix
+++ b/flake-modules/clan.nix
@@ -127,6 +127,8 @@ in {
           ../nixos/hosts/sunken-ship.nix
           config.flake.nixosModules.dotfiles-rebuild
           config.flake.nixosModules.server-debug-tools
+          config.flake.nixosModules.monitoring-node-exporter
+          config.flake.nixosModules.monitoring-prometheus-server
           inputs.home-manager.nixosModules.home-manager
           (hmModule {
             user = "danny";
@@ -146,6 +148,7 @@ in {
           }
           clanHostsModule
           ../nixos/hosts/vps-relay.nix
+          config.flake.nixosModules.monitoring-node-exporter
           inputs.home-manager.nixosModules.home-manager
           (hmModule {
             user = "danny";
@@ -167,6 +170,7 @@ in {
           ../nixos/hosts/phantom-ship.nix
           config.flake.nixosModules.dotfiles-rebuild
           config.flake.nixosModules.server-debug-tools
+          config.flake.nixosModules.monitoring-node-exporter
           inputs.home-manager.nixosModules.home-manager
           (hmModule {
             user = "danny";
diff --git a/flake-modules/nixos-modules.nix b/flake-modules/nixos-modules.nix
index a466a58..3f6bf96 100644
--- a/flake-modules/nixos-modules.nix
+++ b/flake-modules/nixos-modules.nix
@@ -5,4 +5,6 @@
 { ... }: {
   flake.nixosModules.dotfiles-rebuild = ../modules/dotfiles-rebuild.nix;
   flake.nixosModules.server-debug-tools = ../modules/server-debug-tools.nix;
+  flake.nixosModules.monitoring-node-exporter = ../modules/monitoring-node-exporter.nix;
+  flake.nixosModules.monitoring-prometheus-server = ../modules/monitoring-prometheus-server.nix;
 }
diff --git a/modules/monitoring-node-exporter.nix b/modules/monitoring-node-exporter.nix
new file mode 100644
index 0000000..402f44d
--- /dev/null
+++ b/modules/monitoring-node-exporter.nix
@@ -0,0 +1,12 @@
+# Prometheus node_exporter — exposes host metrics on :9100. It binds every
+# address; the firewall rule below opens the port on ZeroTier interfaces only.
+{ ... }: {
+  services.prometheus.exporters.node = {
+    enable = true;
+    port = 9100;
+    listenAddress = "[::]"; # bracketed IPv6 literal, same form as "[::1]" on the server
+    enabledCollectors = [ "systemd" ];
+  };
+
+  networking.firewall.interfaces."zt+".allowedTCPPorts = [ 9100 ];
+}
diff --git a/modules/monitoring-prometheus-server.nix b/modules/monitoring-prometheus-server.nix
new file mode 100644
index 0000000..6b02d14
--- /dev/null
+++ b/modules/monitoring-prometheus-server.nix
@@ -0,0 +1,136 @@
+# Prometheus + Alertmanager + Grafana on sunken-ship.
+#
+# Scrape targets are the clan ZeroTier IPv6s — kept in sync with
+# vars/per-machine//zerotier/zerotier-ip/value.
+#
+# Telegram receiver uses the existing @HarakatBot. Drop the bot token at
+# /etc/alertmanager/telegram-token (mode 0400, root) before rebuild — same
+# manual-secret pattern as the other Telegram bots in the repo.
+# NOTE(review): confirm the alertmanager service user can read a root-only
+# file (nixpkgs may run it under DynamicUser); if not, use LoadCredential.
+#
+# Routing: critical alerts repeat every 1h, everything else every 4h.
+{ ... }:
+let
+  sunkenShipZTv6 = "fdd5:53a2:de33:d269:6499:93d5:53a2:de33";
+  phantomShipZTv6 = "fdd5:53a2:de33:d269:6499:936c:48a:bbdc";
+  vpsRelayZTv6 = "fdd5:53a2:de33:d269:6499:9305:339f:2ed3";
+
+  target = ip: "[${ip}]:9100";
+in {
+  services.prometheus = {
+    enable = true;
+    port = 9090;
+    listenAddress = "[::1]";
+
+    globalConfig = {
+      scrape_interval = "30s";
+      evaluation_interval = "30s";
+    };
+
+    scrapeConfigs = [{
+      job_name = "node";
+      static_configs = [{
+        targets = [
+          (target sunkenShipZTv6)
+          (target phantomShipZTv6)
+          (target vpsRelayZTv6)
+        ];
+        labels.job = "node";
+      }];
+    }];
+
+    ruleFiles = [
+      (builtins.toFile "host-rules.yml" (builtins.toJSON {
+        groups = [{
+          name = "hosts";
+          rules = [{
+            alert = "HostDown";
+            expr = ''up{job="node"} == 0'';
+            for = "5m";
+            labels.severity = "critical";
+            annotations = {
+              summary = "{{ $labels.instance }} is down";
+              description = "{{ $labels.instance }} has been unreachable for 5 minutes.";
+            };
+          }];
+        }];
+      }))
+    ];
+
+    alertmanagers = [{
+      static_configs = [{ targets = [ "[::1]:9093" ]; }]; # same address alertmanager.listenAddress binds
+    }];
+
+    alertmanager = {
+      enable = true;
+      port = 9093;
+      listenAddress = "[::1]";
+      configuration = {
+        route = {
+          receiver = "telegram-default";
+          group_by = [ "alertname" ];
+          group_wait = "30s";
+          group_interval = "5m";
+          repeat_interval = "4h";
+          routes = [{
+            matchers = [ ''severity="critical"'' ];
+            receiver = "telegram-critical";
+            group_wait = "10s";
+            group_interval = "1m";
+            repeat_interval = "1h";
+          }];
+        };
+        receivers = [
+          {
+            name = "telegram-default";
+            telegram_configs = [{
+              bot_token_file = "/etc/alertmanager/telegram-token";
+              chat_id = 66070351;
+              api_url = "https://api.telegram.org";
+              parse_mode = "";
+            }];
+          }
+          {
+            name = "telegram-critical";
+            telegram_configs = [{
+              bot_token_file = "/etc/alertmanager/telegram-token";
+              chat_id = 66070351;
+              api_url = "https://api.telegram.org";
+              parse_mode = "";
+              message = ''
+                CRITICAL: {{ .CommonLabels.alertname }}
+                {{ range .Alerts }}{{ .Annotations.summary }}
+                {{ .Annotations.description }}
+                {{ end }}'';
+            }];
+          }
+        ];
+      };
+    };
+  };
+
+  services.grafana = {
+    enable = true;
+    settings.server = {
+      http_addr = "::";
+      http_port = 3000;
+      domain = "sunken-ship.clan";
+    };
+    # Drop a random 32+ char string at /etc/grafana/secret-key (mode 0400,
+    # owned by grafana:grafana) before rebuild — same manual-secret pattern
+    # as /etc/alertmanager/telegram-token. Used to encrypt secrets stored
+    # in Grafana's DB; nothing to rotate on a fresh install.
+    settings.security.secret_key = "$__file{/etc/grafana/secret-key}";
+    provision.datasources.settings.datasources = [{
+      name = "Prometheus";
+      type = "prometheus";
+      url = "http://[::1]:9090";
+      isDefault = true;
+    }];
+  };
+
+  # Grafana on the ZeroTier mesh only. Prometheus + Alertmanager bind to
+  # localhost so they're not reachable off-host.
+  networking.firewall.interfaces."zt+".allowedTCPPorts = [ 3000 ];
+}