dotfiles/modules/monitoring-prometheus-server.nix
DannyDannyDanny e8158e6c0f monitoring: fix prometheus → alertmanager loopback (IPv4 vs IPv6)
Alertmanager binds [::1]:9093 but Prometheus was dialing
127.0.0.1:9093 — connection refused, so alerts fired internally
but never reached Alertmanager. Switch the target to [::1]:9093
to match the bind.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 16:47:37 +02:00

134 lines
4 KiB
Nix

# Prometheus + Alertmanager + Grafana on sunken-ship.
#
# Scrape targets are the clan ZeroTier IPv6s — kept in sync with
# vars/per-machine/<host>/zerotier/zerotier-ip/value.
#
# Telegram receiver uses the existing @HarakatBot. Drop the bot token at
# /etc/alertmanager/telegram-token (mode 0400, root) before rebuild — same
# manual-secret pattern as the other Telegram bots in the repo.
#
# Routing: critical alerts repeat every 1h, everything else every 4h.
{ ... }:
let
# ZeroTier IPv6 address of every machine that gets scraped. These are
# copies of vars/per-machine/<host>/zerotier/zerotier-ip/value and have to
# be updated by hand when a machine's address changes.
sunkenShipZTv6 = "fdd5:53a2:de33:d269:6499:93d5:53a2:de33";
phantomShipZTv6 = "fdd5:53a2:de33:d269:6499:936c:48a:bbdc";
vpsRelayZTv6 = "fdd5:53a2:de33:d269:6499:9305:339f:2ed3";

# Port scraped on every host.
exporterPort = 9100;

# Turn a bare IPv6 address into a "[addr]:port" scrape target; the brackets
# keep the address's colons from being read as the port separator.
target = addr: "[${addr}]:${toString exporterPort}";
in {
# Prometheus server plus its bundled Alertmanager, both bound to loopback.
services.prometheus =
  let
    # Single source of truth for the Alertmanager endpoint. Prometheus'
    # alerting target and Alertmanager's bind address are both derived from
    # these, so they cannot drift apart (a 127.0.0.1 vs [::1] mismatch makes
    # Prometheus get "connection refused" and silently drop notifications).
    alertmanagerHost = "[::1]";
    alertmanagerPort = 9093;

    # Settings shared by every Telegram receiver.
    telegramDefaults = {
      # Path inside the unit's systemd credentials directory; populated by
      # the LoadCredential entry that follows this attribute set.
      bot_token_file = "/run/credentials/alertmanager.service/telegram-token";
      chat_id = 66070351;
      api_url = "https://api.telegram.org";
      # Empty parse mode: messages are sent as plain text.
      parse_mode = "";
    };
  in
  {
    enable = true;
    listenAddress = "[::1]";
    port = 9090;

    globalConfig = {
      scrape_interval = "30s";
      evaluation_interval = "30s";
    };

    # One job covering all three machines over their ZeroTier addresses.
    scrapeConfigs = [
      {
        job_name = "node";
        static_configs = [
          {
            targets = [
              (target sunkenShipZTv6)
              (target phantomShipZTv6)
              (target vpsRelayZTv6)
            ];
            labels.job = "node";
          }
        ];
      }
    ];

    # Rule files are YAML; JSON is a subset of YAML, so the rule group is
    # written as a Nix attrset and serialised with toJSON.
    ruleFiles = [
      (builtins.toFile "host-rules.yml" (builtins.toJSON {
        groups = [
          {
            name = "hosts";
            rules = [
              {
                # Fires once a scrape target has failed for 5 minutes straight.
                alert = "HostDown";
                expr = ''up{job="node"} == 0'';
                for = "5m";
                labels.severity = "critical";
                annotations = {
                  summary = "{{ $labels.instance }} is down";
                  description = "{{ $labels.instance }} has been unreachable for 5 minutes.";
                };
              }
            ];
          }
        ];
      }))
    ];

    # Where Prometheus delivers firing alerts.
    alertmanagers = [
      {
        static_configs = [
          { targets = [ "${alertmanagerHost}:${toString alertmanagerPort}" ]; }
        ];
      }
    ];

    alertmanager = {
      enable = true;
      listenAddress = alertmanagerHost;
      port = alertmanagerPort;

      # Single-node setup: turn off the HA gossip listener, which would
      # otherwise bind 0.0.0.0:9094 regardless of listenAddress. An empty
      # value disables clustering.
      extraFlags = [ "--cluster.listen-address=" ];

      configuration = {
        # Default route: everything goes to telegram-default and is repeated
        # every 4h. Alerts labelled severity="critical" take the child route,
        # which groups faster and repeats every 1h.
        route = {
          receiver = "telegram-default";
          group_by = [ "alertname" ];
          group_wait = "30s";
          group_interval = "5m";
          repeat_interval = "4h";
          routes = [
            {
              matchers = [ ''severity="critical"'' ];
              receiver = "telegram-critical";
              group_wait = "10s";
              group_interval = "1m";
              repeat_interval = "1h";
            }
          ];
        };

        receivers = [
          {
            name = "telegram-default";
            telegram_configs = [ telegramDefaults ];
          }
          {
            name = "telegram-critical";
            telegram_configs = [
              (telegramDefaults // {
                # Custom body: a CRITICAL header followed by the summary and
                # description of every alert in the group.
                message = ''
                  CRITICAL: {{ .CommonLabels.alertname }}
                  {{ range .Alerts }}{{ .Annotations.summary }}
                  {{ .Annotations.description }}
                  {{ end }}'';
              })
            ];
          }
        ];
      };
    };
  };

# The stock NixOS alertmanager unit runs as a sandboxed non-root user
# (DynamicUser), which cannot open the root-owned 0400 token file directly.
# LoadCredential has systemd read the file as root at unit start and expose
# it to the service under /run/credentials/alertmanager.service/. The unit
# now refuses to start if the token file is missing.
systemd.services.alertmanager.serviceConfig.LoadCredential = [
  "telegram-token:/etc/alertmanager/telegram-token"
];
# Grafana dashboard, backed by the local Prometheus instance.
services.grafana = {
  enable = true;

  settings = {
    server = {
      # Listens on every address; the firewall rule for the "zt+"
      # interfaces is what actually limits who can reach port 3000.
      http_addr = "::";
      http_port = 3000;
      domain = "sunken-ship.clan";
    };

    # Manual secret, same pattern as /etc/alertmanager/telegram-token:
    # before the rebuild, put a random string of 32+ characters into
    # /etc/grafana/secret-key (mode 0400, owner grafana:grafana). Grafana
    # uses it to encrypt the secrets it keeps in its database; on a fresh
    # install there is nothing to rotate.
    security.secret_key = "$__file{/etc/grafana/secret-key}";
  };

  # Provision Prometheus (loopback, port 9090) as the default datasource.
  provision.datasources.settings.datasources = [
    {
      name = "Prometheus";
      type = "prometheus";
      url = "http://[::1]:9090";
      isDefault = true;
    }
  ];
};
# The only port opened is Grafana's (3000), and only on ZeroTier ("zt+")
# interfaces. Prometheus and Alertmanager listen on localhost, so they
# need no firewall rule and stay unreachable from other hosts.
networking.firewall.interfaces."zt+" = {
  allowedTCPPorts = [ 3000 ];
};
}