monitoring: prometheus + alertmanager + grafana on sunken-ship
node_exporter on all three hosts (port 9100, ZT-only). Prometheus server scrapes via the clan ZT IPv6s. Alertmanager routes alerts to @HarakatBot (chat 66070351); critical repeats every 1h, others 4h. Starter rule: HostDown when up==0 for 5m. Grafana on :3000 over ZT, provisioned with the local Prometheus as default datasource. Manual secrets on sunken-ship: /etc/alertmanager/telegram-token and /etc/grafana/secret-key. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
40cc62f65b
commit
3b6f4545b4
4 changed files with 152 additions and 0 deletions
|
|
@ -127,6 +127,8 @@ in {
|
|||
../nixos/hosts/sunken-ship.nix
|
||||
config.flake.nixosModules.dotfiles-rebuild
|
||||
config.flake.nixosModules.server-debug-tools
|
||||
config.flake.nixosModules.monitoring-node-exporter
|
||||
config.flake.nixosModules.monitoring-prometheus-server
|
||||
inputs.home-manager.nixosModules.home-manager
|
||||
(hmModule {
|
||||
user = "danny";
|
||||
|
|
@ -146,6 +148,7 @@ in {
|
|||
}
|
||||
clanHostsModule
|
||||
../nixos/hosts/vps-relay.nix
|
||||
config.flake.nixosModules.monitoring-node-exporter
|
||||
inputs.home-manager.nixosModules.home-manager
|
||||
(hmModule {
|
||||
user = "danny";
|
||||
|
|
@ -167,6 +170,7 @@ in {
|
|||
../nixos/hosts/phantom-ship.nix
|
||||
config.flake.nixosModules.dotfiles-rebuild
|
||||
config.flake.nixosModules.server-debug-tools
|
||||
config.flake.nixosModules.monitoring-node-exporter
|
||||
inputs.home-manager.nixosModules.home-manager
|
||||
(hmModule {
|
||||
user = "danny";
|
||||
|
|
|
|||
|
|
@ -5,4 +5,6 @@
|
|||
{ ... }: {
|
||||
flake.nixosModules.dotfiles-rebuild = ../modules/dotfiles-rebuild.nix;
|
||||
flake.nixosModules.server-debug-tools = ../modules/server-debug-tools.nix;
|
||||
flake.nixosModules.monitoring-node-exporter = ../modules/monitoring-node-exporter.nix;
|
||||
flake.nixosModules.monitoring-prometheus-server = ../modules/monitoring-prometheus-server.nix;
|
||||
}
|
||||
|
|
|
|||
12
modules/monitoring-node-exporter.nix
Normal file
12
modules/monitoring-node-exporter.nix
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
# Prometheus node_exporter — exposes host metrics on :9100, scoped to the
|
||||
# ZeroTier mesh by the firewall rule below (any zt+ peer can reach it); sunken-ship's Prometheus scrapes it.
|
||||
{ ... }: {
|
||||
services.prometheus.exporters.node = {
|
||||
enable = true;
|
||||
port = 9100;
|
||||
# Bracketed IPv6 wildcard: the exporter module joins this with the port as
# "<listenAddress>:<port>", so a bare "::" would produce the unparsable ":::9100".
listenAddress = "[::]";
|
||||
enabledCollectors = [ "systemd" ];
|
||||
};
|
||||
|
||||
networking.firewall.interfaces."zt+".allowedTCPPorts = [ 9100 ];
|
||||
}
|
||||
134
modules/monitoring-prometheus-server.nix
Normal file
134
modules/monitoring-prometheus-server.nix
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
# Prometheus + Alertmanager + Grafana on sunken-ship.
|
||||
#
|
||||
# Scrape targets are the clan ZeroTier IPv6s, hard-coded below — keep them in sync by hand with
|
||||
# vars/per-machine/<host>/zerotier/zerotier-ip/value.
|
||||
#
|
||||
# Telegram receiver uses the existing @HarakatBot. Drop the bot token at
|
||||
# /etc/alertmanager/telegram-token (mode 0400, root) before rebuild — same
|
||||
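# manual-secret pattern as the other Telegram bots in the repo.
# NOTE(review): if the alertmanager unit runs as a non-root (e.g. systemd DynamicUser) account, a root-only 0400 file is unreadable to it — confirm ownership or hand the token over via systemd LoadCredential.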
|
||||
#
|
||||
# Routing: critical alerts repeat every 1h, everything else every 4h.
|
||||
{ ... }:
|
||||
let
|
||||
sunkenShipZTv6 = "fdd5:53a2:de33:d269:6499:93d5:53a2:de33";
|
||||
phantomShipZTv6 = "fdd5:53a2:de33:d269:6499:936c:48a:bbdc";
|
||||
vpsRelayZTv6 = "fdd5:53a2:de33:d269:6499:9305:339f:2ed3";
|
||||
|
||||
target = ip: "[${ip}]:9100";
|
||||
in {
|
||||
services.prometheus = {
|
||||
enable = true;
|
||||
port = 9090;
|
||||
listenAddress = "[::1]";
|
||||
|
||||
globalConfig = {
|
||||
scrape_interval = "30s";
|
||||
evaluation_interval = "30s";
|
||||
};
|
||||
|
||||
scrapeConfigs = [{
|
||||
job_name = "node";
|
||||
static_configs = [{
|
||||
targets = [
|
||||
(target sunkenShipZTv6)
|
||||
(target phantomShipZTv6)
|
||||
(target vpsRelayZTv6)
|
||||
];
|
||||
labels.job = "node";
|
||||
}];
|
||||
}];
|
||||
|
||||
ruleFiles = [
|
||||
(builtins.toFile "host-rules.yml" (builtins.toJSON {
|
||||
groups = [{
|
||||
name = "hosts";
|
||||
rules = [{
|
||||
alert = "HostDown";
|
||||
expr = ''up{job="node"} == 0'';
|
||||
for = "5m";
|
||||
labels.severity = "critical";
|
||||
annotations = {
|
||||
summary = "{{ $labels.instance }} is down";
|
||||
description = "{{ $labels.instance }} has been unreachable for 5 minutes.";
|
||||
};
|
||||
}];
|
||||
}];
|
||||
}))
|
||||
];
|
||||
|
||||
alertmanagers = [{
|
||||
# Alertmanager below listens on the IPv6 loopback only ("[::1]":9093), so the
# target must use the same address family; 127.0.0.1 would never connect.
static_configs = [{ targets = [ "[::1]:9093" ]; }];
|
||||
}];
|
||||
|
||||
alertmanager = {
|
||||
enable = true;
|
||||
port = 9093;
|
||||
listenAddress = "[::1]";
|
||||
configuration = {
|
||||
route = {
|
||||
receiver = "telegram-default";
|
||||
group_by = [ "alertname" ];
|
||||
group_wait = "30s";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "4h";
|
||||
routes = [{
|
||||
matchers = [ ''severity="critical"'' ];
|
||||
receiver = "telegram-critical";
|
||||
group_wait = "10s";
|
||||
group_interval = "1m";
|
||||
repeat_interval = "1h";
|
||||
}];
|
||||
};
|
||||
receivers = [
|
||||
{
|
||||
name = "telegram-default";
|
||||
telegram_configs = [{
|
||||
bot_token_file = "/etc/alertmanager/telegram-token";
|
||||
chat_id = 66070351;
|
||||
api_url = "https://api.telegram.org";
|
||||
parse_mode = "";
|
||||
}];
|
||||
}
|
||||
{
|
||||
name = "telegram-critical";
|
||||
telegram_configs = [{
|
||||
bot_token_file = "/etc/alertmanager/telegram-token";
|
||||
chat_id = 66070351;
|
||||
api_url = "https://api.telegram.org";
|
||||
parse_mode = "";
|
||||
message = ''
|
||||
CRITICAL: {{ .CommonLabels.alertname }}
|
||||
{{ range .Alerts }}{{ .Annotations.summary }}
|
||||
{{ .Annotations.description }}
|
||||
{{ end }}'';
|
||||
}];
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
services.grafana = {
|
||||
enable = true;
|
||||
settings.server = {
|
||||
http_addr = "::";
|
||||
http_port = 3000;
|
||||
domain = "sunken-ship.clan";
|
||||
};
|
||||
# Drop a random 32+ char string at /etc/grafana/secret-key (mode 0400,
|
||||
# owned by grafana:grafana) before rebuild — same manual-secret pattern
|
||||
# as /etc/alertmanager/telegram-token. Used to encrypt secrets stored
|
||||
# in Grafana's DB; nothing to rotate on a fresh install.
|
||||
settings.security.secret_key = "$__file{/etc/grafana/secret-key}";
|
||||
provision.datasources.settings.datasources = [{
|
||||
name = "Prometheus";
|
||||
type = "prometheus";
|
||||
url = "http://[::1]:9090";
|
||||
isDefault = true;
|
||||
}];
|
||||
};
|
||||
|
||||
# Grafana on the ZeroTier mesh only. Prometheus + Alertmanager bind to
|
||||
# localhost so they're not reachable off-host.
|
||||
networking.firewall.interfaces."zt+".allowedTCPPorts = [ 3000 ];
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue