monitoring: prometheus + alertmanager + grafana on sunken-ship

node_exporter on all three hosts (port 9100, ZT-only). Prometheus
server scrapes via the clan ZT IPv6s. Alertmanager routes alerts to
@HarakatBot (chat 66070351); critical repeats every 1h, others 4h.
Starter rule: HostDown when up==0 for 5m. Grafana on :3000 over ZT,
provisioned with the local Prometheus as default datasource.

Manual secrets on sunken-ship: /etc/alertmanager/telegram-token and
/etc/grafana/secret-key.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
DannyDannyDanny 2026-05-10 16:12:08 +02:00
parent 40cc62f65b
commit 3b6f4545b4
4 changed files with 152 additions and 0 deletions

View file

@ -127,6 +127,8 @@ in {
../nixos/hosts/sunken-ship.nix ../nixos/hosts/sunken-ship.nix
config.flake.nixosModules.dotfiles-rebuild config.flake.nixosModules.dotfiles-rebuild
config.flake.nixosModules.server-debug-tools config.flake.nixosModules.server-debug-tools
config.flake.nixosModules.monitoring-node-exporter
config.flake.nixosModules.monitoring-prometheus-server
inputs.home-manager.nixosModules.home-manager inputs.home-manager.nixosModules.home-manager
(hmModule { (hmModule {
user = "danny"; user = "danny";
@ -146,6 +148,7 @@ in {
} }
clanHostsModule clanHostsModule
../nixos/hosts/vps-relay.nix ../nixos/hosts/vps-relay.nix
config.flake.nixosModules.monitoring-node-exporter
inputs.home-manager.nixosModules.home-manager inputs.home-manager.nixosModules.home-manager
(hmModule { (hmModule {
user = "danny"; user = "danny";
@ -167,6 +170,7 @@ in {
../nixos/hosts/phantom-ship.nix ../nixos/hosts/phantom-ship.nix
config.flake.nixosModules.dotfiles-rebuild config.flake.nixosModules.dotfiles-rebuild
config.flake.nixosModules.server-debug-tools config.flake.nixosModules.server-debug-tools
config.flake.nixosModules.monitoring-node-exporter
inputs.home-manager.nixosModules.home-manager inputs.home-manager.nixosModules.home-manager
(hmModule { (hmModule {
user = "danny"; user = "danny";

View file

@ -5,4 +5,6 @@
{ ... }: { { ... }: {
flake.nixosModules.dotfiles-rebuild = ../modules/dotfiles-rebuild.nix; flake.nixosModules.dotfiles-rebuild = ../modules/dotfiles-rebuild.nix;
flake.nixosModules.server-debug-tools = ../modules/server-debug-tools.nix; flake.nixosModules.server-debug-tools = ../modules/server-debug-tools.nix;
flake.nixosModules.monitoring-node-exporter = ../modules/monitoring-node-exporter.nix;
flake.nixosModules.monitoring-prometheus-server = ../modules/monitoring-prometheus-server.nix;
} }

View file

@ -0,0 +1,12 @@
# Prometheus node_exporter — exposes host metrics on :9100. The port is only
# opened on ZeroTier interfaces, so it is reachable from peers on the mesh
# (the Prometheus server on sunken-ship scrapes it) but not from other networks.
{ ... }: {
  services.prometheus.exporters.node = {
    enable = true;
    port = 9100;
    # The NixOS exporter module renders the flag as "<listenAddress>:<port>",
    # so the IPv6 wildcard must be bracketed. A bare "::" becomes ":::9100",
    # which Go's address parser rejects ("too many colons in address") and
    # the exporter fails to start.
    listenAddress = "[::]";
    # Adds per-unit systemd metrics on top of the default collectors.
    enabledCollectors = [ "systemd" ];
  };
  # Open the exporter port on ZeroTier interfaces only ("zt+" is the firewall's
  # prefix wildcard for interface names starting with "zt").
  networking.firewall.interfaces."zt+".allowedTCPPorts = [ 9100 ];
}

View file

@ -0,0 +1,134 @@
# Prometheus + Alertmanager + Grafana on sunken-ship.
#
# Scrape targets are the clan ZeroTier IPv6s — kept in sync with
# vars/per-machine/<host>/zerotier/zerotier-ip/value.
#
# Telegram receiver uses the existing @HarakatBot. Drop the bot token at
# /etc/alertmanager/telegram-token (mode 0400, root) before rebuild — same
# manual-secret pattern as the other Telegram bots in the repo.
#
# Routing: critical alerts repeat every 1h, everything else every 4h.
{ ... }:
let
# ZeroTier IPv6 address of each scraped host. These are hard-coded copies;
# per the file header they must be kept in sync with
# vars/per-machine/<host>/zerotier/zerotier-ip/value.
sunkenShipZTv6 = "fdd5:53a2:de33:d269:6499:93d5:53a2:de33";
phantomShipZTv6 = "fdd5:53a2:de33:d269:6499:936c:48a:bbdc";
vpsRelayZTv6 = "fdd5:53a2:de33:d269:6499:9305:339f:2ed3";
# Turn a bare IPv6 literal into a bracketed "[addr]:9100" scrape target
# (9100 is the port every scraped host is addressed on).
target = ip: "[${ip}]:9100";
in {
# Prometheus server: scrapes the node exporters over ZeroTier, evaluates the
# host rules, and forwards firing alerts to the Alertmanager on the same host.
services.prometheus = {
  enable = true;
  port = 9090;
  # IPv6 loopback only; not reachable off-host.
  listenAddress = "[::1]";
  globalConfig = {
    scrape_interval = "30s";
    evaluation_interval = "30s";
  };
  scrapeConfigs = [{
    job_name = "node";
    static_configs = [{
      targets = [
        (target sunkenShipZTv6)
        (target phantomShipZTv6)
        (target vpsRelayZTv6)
      ];
      labels.job = "node";
    }];
  }];
  ruleFiles = [
    # JSON is a subset of YAML, so the rule file is generated with toJSON.
    (builtins.toFile "host-rules.yml" (builtins.toJSON {
      groups = [{
        name = "hosts";
        rules = [{
          # Fires once a node target has failed its scrape for 5 minutes.
          alert = "HostDown";
          expr = ''up{job="node"} == 0'';
          for = "5m";
          labels.severity = "critical";
          annotations = {
            summary = "{{ $labels.instance }} is down";
            description = "{{ $labels.instance }} has been unreachable for 5 minutes.";
          };
        }];
      }];
    }))
  ];
  alertmanagers = [{
    # Must match alertmanager.listenAddress/port below. Alertmanager binds to
    # the IPv6 loopback only, so the IPv4 address 127.0.0.1 would be refused
    # and no alert would ever be delivered.
    static_configs = [{ targets = [ "[::1]:9093" ]; }];
  }];
  alertmanager = {
    enable = true;
    port = 9093;
    listenAddress = "[::1]";
    configuration = {
      route = {
        # Default route: everything that is not critical, repeated every 4h.
        receiver = "telegram-default";
        group_by = [ "alertname" ];
        group_wait = "30s";
        group_interval = "5m";
        repeat_interval = "4h";
        routes = [{
          # Critical alerts are sent sooner and repeated every hour.
          matchers = [ ''severity="critical"'' ];
          receiver = "telegram-critical";
          group_wait = "10s";
          group_interval = "1m";
          repeat_interval = "1h";
        }];
      };
      receivers = [
        {
          name = "telegram-default";
          telegram_configs = [{
            # Read from the systemd credentials directory (see LoadCredential
            # below), not directly from /etc, so the unprivileged service
            # user can access it.
            bot_token_file = "/run/credentials/alertmanager.service/telegram-token";
            chat_id = 66070351;
            api_url = "https://api.telegram.org";
            # Empty parse_mode sends plain text instead of HTML/Markdown.
            parse_mode = "";
          }];
        }
        {
          name = "telegram-critical";
          telegram_configs = [{
            bot_token_file = "/run/credentials/alertmanager.service/telegram-token";
            chat_id = 66070351;
            api_url = "https://api.telegram.org";
            parse_mode = "";
            message = ''
              CRITICAL: {{ .CommonLabels.alertname }}
              {{ range .Alerts }}{{ .Annotations.summary }}
              {{ .Annotations.description }}
              {{ end }}'';
          }];
        }
      ];
    };
  };
};
# The NixOS alertmanager unit runs as a systemd DynamicUser, which cannot read
# the root-owned, mode-0400 token file directly. LoadCredential has systemd
# read /etc/alertmanager/telegram-token and expose a copy to the service under
# /run/credentials/alertmanager.service/. The operator still drops the token
# at the same /etc path.
systemd.services.alertmanager.serviceConfig.LoadCredential = [
  "telegram-token:/etc/alertmanager/telegram-token"
];
# Grafana UI: listens on every address on port 3000 and is provisioned with
# the local Prometheus server as its default datasource.
services.grafana = {
  enable = true;
  settings = {
    server = {
      domain = "sunken-ship.clan";
      http_port = 3000;
      http_addr = "::";
    };
    security = {
      # Manual secret: before the rebuild, place a random string of 32+
      # characters in /etc/grafana/secret-key (mode 0400, owner
      # grafana:grafana), following the same pattern as
      # /etc/alertmanager/telegram-token. Grafana uses it to encrypt secrets
      # kept in its database; a fresh install has nothing to rotate.
      secret_key = "$__file{/etc/grafana/secret-key}";
    };
  };
  provision = {
    datasources.settings.datasources = [
      {
        # Points at the Prometheus server bound to the IPv6 loopback.
        url = "http://[::1]:9090";
        type = "prometheus";
        name = "Prometheus";
        isDefault = true;
      }
    ];
  };
};
# Only Grafana's port is opened, and only on ZeroTier interfaces. Prometheus
# and Alertmanager listen on the loopback address, so they stay unreachable
# from other hosts.
networking.firewall.interfaces."zt+" = {
  allowedTCPPorts = [ 3000 ];
};
}