monitoring: prometheus + alertmanager + grafana on sunken-ship
node_exporter on all three hosts (port 9100, ZT-only). Prometheus server scrapes via the clan ZT IPv6s. Alertmanager routes alerts to @HarakatBot (chat 66070351); critical repeats every 1h, others 4h. Starter rule: HostDown when up==0 for 5m. Grafana on :3000 over ZT, provisioned with the local Prometheus as default datasource. Manual secrets on sunken-ship: /etc/alertmanager/telegram-token and /etc/grafana/secret-key. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
40cc62f65b
commit
3b6f4545b4
4 changed files with 152 additions and 0 deletions
|
|
@ -127,6 +127,8 @@ in {
|
||||||
../nixos/hosts/sunken-ship.nix
|
../nixos/hosts/sunken-ship.nix
|
||||||
config.flake.nixosModules.dotfiles-rebuild
|
config.flake.nixosModules.dotfiles-rebuild
|
||||||
config.flake.nixosModules.server-debug-tools
|
config.flake.nixosModules.server-debug-tools
|
||||||
|
config.flake.nixosModules.monitoring-node-exporter
|
||||||
|
config.flake.nixosModules.monitoring-prometheus-server
|
||||||
inputs.home-manager.nixosModules.home-manager
|
inputs.home-manager.nixosModules.home-manager
|
||||||
(hmModule {
|
(hmModule {
|
||||||
user = "danny";
|
user = "danny";
|
||||||
|
|
@ -146,6 +148,7 @@ in {
|
||||||
}
|
}
|
||||||
clanHostsModule
|
clanHostsModule
|
||||||
../nixos/hosts/vps-relay.nix
|
../nixos/hosts/vps-relay.nix
|
||||||
|
config.flake.nixosModules.monitoring-node-exporter
|
||||||
inputs.home-manager.nixosModules.home-manager
|
inputs.home-manager.nixosModules.home-manager
|
||||||
(hmModule {
|
(hmModule {
|
||||||
user = "danny";
|
user = "danny";
|
||||||
|
|
@ -167,6 +170,7 @@ in {
|
||||||
../nixos/hosts/phantom-ship.nix
|
../nixos/hosts/phantom-ship.nix
|
||||||
config.flake.nixosModules.dotfiles-rebuild
|
config.flake.nixosModules.dotfiles-rebuild
|
||||||
config.flake.nixosModules.server-debug-tools
|
config.flake.nixosModules.server-debug-tools
|
||||||
|
config.flake.nixosModules.monitoring-node-exporter
|
||||||
inputs.home-manager.nixosModules.home-manager
|
inputs.home-manager.nixosModules.home-manager
|
||||||
(hmModule {
|
(hmModule {
|
||||||
user = "danny";
|
user = "danny";
|
||||||
|
|
|
||||||
|
|
@ -5,4 +5,6 @@
|
||||||
{ ... }: {
|
{ ... }: {
|
||||||
flake.nixosModules.dotfiles-rebuild = ../modules/dotfiles-rebuild.nix;
|
flake.nixosModules.dotfiles-rebuild = ../modules/dotfiles-rebuild.nix;
|
||||||
flake.nixosModules.server-debug-tools = ../modules/server-debug-tools.nix;
|
flake.nixosModules.server-debug-tools = ../modules/server-debug-tools.nix;
|
||||||
|
flake.nixosModules.monitoring-node-exporter = ../modules/monitoring-node-exporter.nix;
|
||||||
|
flake.nixosModules.monitoring-prometheus-server = ../modules/monitoring-prometheus-server.nix;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
12
modules/monitoring-node-exporter.nix
Normal file
12
modules/monitoring-node-exporter.nix
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
# Prometheus node_exporter — exposes host metrics on :9100, scoped to the
|
||||||
|
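# ZeroTier mesh: the firewall opens the port on zt+ interfaces only, so it is reachable by mesh peers (in practice sunken-ship, the Prometheus server) and not from other networks.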
|
||||||
|
{ ... }:
{
  services.prometheus.exporters = {
    node = {
      enable = true;
      # Wildcard bind; which peers can actually reach the port is decided by
      # the firewall rule at the bottom of this module.
      listenAddress = "::";
      port = 9100;
      # Collectors requested explicitly for this exporter.
      enabledCollectors = [ "systemd" ];
    };
  };

  # Open the exporter port only on interfaces matching the "zt+" pattern
  # (the ZeroTier interfaces); no other interface gets this rule.
  networking.firewall.interfaces."zt+" = {
    allowedTCPPorts = [ 9100 ];
  };
}
|
||||||
134
modules/monitoring-prometheus-server.nix
Normal file
134
modules/monitoring-prometheus-server.nix
Normal file
|
|
@ -0,0 +1,134 @@
|
||||||
|
# Prometheus + Alertmanager + Grafana on sunken-ship.
|
||||||
|
#
|
||||||
|
# Scrape targets are the clan ZeroTier IPv6s — kept in sync with
|
||||||
|
# vars/per-machine/<host>/zerotier/zerotier-ip/value.
|
||||||
|
#
|
||||||
|
# Telegram receiver uses the existing @HarakatBot. Drop the bot token at
|
||||||
|
# /etc/alertmanager/telegram-token (mode 0400, root) before rebuild — same
|
||||||
|
# manual-secret pattern as the other Telegram bots in the repo.
|
||||||
|
#
|
||||||
|
# Routing: critical alerts repeat every 1h, everything else every 4h.
|
||||||
|
{ ... }:
|
||||||
|
let
|
||||||
|
# ZeroTier IPv6 address of each scraped host. These are hard-coded copies;
# the file header names the clan vars they must be kept in sync with.
vpsRelayZTv6 = "fdd5:53a2:de33:d269:6499:9305:339f:2ed3";
phantomShipZTv6 = "fdd5:53a2:de33:d269:6499:936c:48a:bbdc";
sunkenShipZTv6 = "fdd5:53a2:de33:d269:6499:93d5:53a2:de33";

# Turn a bare IPv6 address into a "[addr]:9100" scrape target. The brackets
# are required so the port separator is not confused with the address colons.
target = addr: "[" + addr + "]:9100";
|
||||||
|
in {
|
||||||
|
# Prometheus server plus Alertmanager. Both listen on the IPv6 loopback
# ("[::1]") only, so every local client must address them as [::1], not
# 127.0.0.1.
services.prometheus = {
  enable = true;
  port = 9090;
  listenAddress = "[::1]";

  globalConfig = {
    scrape_interval = "30s";
    evaluation_interval = "30s";
  };

  # Single node_exporter job covering the three hosts via the addresses bound
  # in the enclosing let-block.
  scrapeConfigs = [{
    job_name = "node";
    static_configs = [{
      targets = [
        (target sunkenShipZTv6)
        (target phantomShipZTv6)
        (target vpsRelayZTv6)
      ];
      labels.job = "node";
    }];
  }];

  # The rule file is written as JSON (a subset of YAML) into the Nix store.
  ruleFiles = [
    (builtins.toFile "host-rules.yml" (builtins.toJSON {
      groups = [{
        name = "hosts";
        rules = [{
          alert = "HostDown";
          expr = ''up{job="node"} == 0'';
          for = "5m";
          labels.severity = "critical";
          annotations = {
            summary = "{{ $labels.instance }} is down";
            description = "{{ $labels.instance }} has been unreachable for 5 minutes.";
          };
        }];
      }];
    }))
  ];

  # Must match alertmanager.listenAddress/port below. Alertmanager binds the
  # IPv6 loopback only, so an IPv4 target (127.0.0.1:9093) would be refused
  # and no alert would ever be delivered.
  alertmanagers = [{
    static_configs = [{ targets = [ "[::1]:9093" ]; }];
  }];

  alertmanager =
    let
      # Settings shared by both Telegram receivers (same bot, same chat).
      # NOTE(review): the file header asks for the token to be root-owned,
      # mode 0400. Confirm the Alertmanager service user can actually read it
      # (if the unit runs as a non-root/dynamic user, consider passing it via
      # systemd LoadCredential instead).
      telegramBase = {
        bot_token_file = "/etc/alertmanager/telegram-token";
        chat_id = 66070351;
        api_url = "https://api.telegram.org";
        # Empty parse mode: messages are sent as plain text.
        parse_mode = "";
      };
    in
    {
      enable = true;
      port = 9093;
      listenAddress = "[::1]";
      configuration = {
        route = {
          # Default route: everything that is not matched below.
          receiver = "telegram-default";
          group_by = [ "alertname" ];
          group_wait = "30s";
          group_interval = "5m";
          repeat_interval = "4h";
          routes = [{
            # Critical alerts: faster grouping and hourly repeats.
            matchers = [ ''severity="critical"'' ];
            receiver = "telegram-critical";
            group_wait = "10s";
            group_interval = "1m";
            repeat_interval = "1h";
          }];
        };
        receivers = [
          {
            name = "telegram-default";
            telegram_configs = [ telegramBase ];
          }
          {
            name = "telegram-critical";
            telegram_configs = [
              (telegramBase // {
                message = ''
                  CRITICAL: {{ .CommonLabels.alertname }}
                  {{ range .Alerts }}{{ .Annotations.summary }}
                  {{ .Annotations.description }}
                  {{ end }}'';
              })
            ];
          }
        ];
      };
    };
};
|
||||||
|
|
||||||
|
# Grafana: listens on every address on port 3000 and ships with the local
# Prometheus pre-provisioned as the default datasource.
services.grafana = {
  enable = true;

  settings = {
    server = {
      domain = "sunken-ship.clan";
      http_addr = "::";
      http_port = 3000;
    };

    # Manual secret, same pattern as /etc/alertmanager/telegram-token: before
    # the rebuild, place a random string of 32+ characters in
    # /etc/grafana/secret-key (mode 0400, owner grafana:grafana). Grafana uses
    # it to encrypt secrets kept in its database; on a fresh install there is
    # nothing to rotate.
    security.secret_key = "$__file{/etc/grafana/secret-key}";
  };

  provision.datasources.settings = {
    datasources = [
      {
        name = "Prometheus";
        type = "prometheus";
        # Same IPv6 loopback address and port the Prometheus server listens on.
        url = "http://[::1]:9090";
        isDefault = true;
      }
    ];
  };
};
|
||||||
|
|
||||||
|
# Only Grafana is opened in the firewall, and only on the ZeroTier ("zt+")
# interfaces. Prometheus and Alertmanager listen on the loopback address, so
# they are not reachable from other hosts.
networking.firewall.interfaces."zt+" = {
  allowedTCPPorts = [ 3000 ];
};
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue