homelab/modules/homelab/services/prometheus.nix
plasmagoat ce8c543e84
Some checks failed
Test / tests (push) Has been cancelled
/ OpenTofu (push) Has been cancelled
auto docs
2025-07-29 16:28:17 +02:00

259 lines
7.3 KiB
Nix

# Homelab service module for the Prometheus monitoring server.
# Module arguments: `config` (evaluated configuration), `lib` and `pkgs`.
{
config,
lib,
pkgs,
...
}:
# `with lib` brings the helpers used below (mkOption, mkIf, types, ...) into scope.
with lib; let
# Attribute name of this service under `homelab.services`; also passed to the
# shared feature modules listed in `imports`.
serviceName = "prometheus";
# Option values of this service.
cfg = config.homelab.services.${serviceName};
# Top-level homelab configuration; read below for `domain` and
# `monitoring.global.allMetrics`.
homelabCfg = config.homelab;
# Generate Prometheus scrape configs from the global monitoring data.
# Produces one scrape job per distinct `jobName`; every endpoint of that job
# becomes one `static_configs` entry carrying its own target and labels.
prometheusScrapeConfigs = let
  # All metric endpoints collected under `homelab.monitoring.global`.
  allMetrics = homelabCfg.monitoring.global.allMetrics;

  # jobName -> list of endpoints. `groupBy` never yields an empty list, so
  # taking `head` of a group below is safe.
  jobGroups = groupBy (m: m.jobName) allMetrics;

  # Job-level settings are taken from the first endpoint of the group.
  # The previous form, `head endpoints.<attr> or [fallback]`, selected the
  # attribute on the *list* of endpoints, which always evaluates to the
  # fallback, so per-endpoint values were silently ignored.
  # Falls back when the attribute is missing or null.
  # NOTE(review): assumes endpoints may carry `scrapeInterval` / `path`
  # attributes; confirm against the monitoring feature module.
  firstOr = attr: fallback: endpoints: let
    value = (head endpoints).${attr} or null;
  in
    if value == null
    then fallback
    else value;
in
  mapAttrsToList (jobName: endpoints: {
    job_name = jobName;
    scrape_interval = firstOr "scrapeInterval" "30s" endpoints;
    static_configs =
      map
      (endpoint: {
        targets = ["${endpoint.host}:${toString endpoint.port}"];
        labels = endpoint.labels;
      })
      endpoints;
    metrics_path = firstOr "path" "/metrics" endpoints;
  })
  jobGroups;
# Standard alerting rules for homelab.
# A list of Prometheus rule groups; each group is later written to its own
# rule file by the service configuration.
alertingRules = [
  {
    name = "homelab.rules";
    rules = [
      # Any scrape target that has been unreachable for 5 minutes.
      {
        alert = "InstanceDown";
        expr = "up == 0";
        for = "5m";
        labels = {severity = "critical";};
        annotations = {
          summary = "Instance {{ $labels.instance }} down";
          description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.";
        };
      }
      # CPU busy percentage, derived from the idle counter.
      # Uses rate() rather than irate(): irate only looks at the last two
      # samples, so a brief dip can reset the `for` clause and keep the
      # alert from ever firing.
      {
        alert = "HighCPUUsage";
        expr = "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) > 80";
        for = "10m";
        labels = {severity = "warning";};
        annotations = {
          summary = "High CPU usage on {{ $labels.instance }}";
          description = "CPU usage is above 80% for more than 10 minutes on {{ $labels.instance }}.";
        };
      }
      # Share of memory that is not "available".
      {
        alert = "HighMemoryUsage";
        expr = "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85";
        for = "10m";
        labels = {severity = "warning";};
        annotations = {
          summary = "High memory usage on {{ $labels.instance }}";
          description = "Memory usage is above 85% for more than 10 minutes on {{ $labels.instance }}.";
        };
      }
      # Used share of each reported filesystem.
      {
        alert = "DiskSpaceLow";
        expr = "((node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes) * 100 > 90";
        for = "5m";
        labels = {severity = "critical";};
        annotations = {
          summary = "Disk space low on {{ $labels.instance }}";
          description = "Disk usage is above 90% on {{ $labels.instance }} {{ $labels.mountpoint }}.";
        };
      }
    ];
  }
];
in {
# Shared homelab feature modules: each file is imported and applied to this
# service's name, in the order listed.
imports =
  map (featureFile: import featureFile serviceName) [
    ../lib/features/monitoring.nix
    ../lib/features/logging.nix
    ../lib/features/proxy.nix
  ];
# Core service options
options.homelab.services.${serviceName} = {
  enable = mkEnableOption "Prometheus Monitoring Server";

  port = mkOption {
    type = types.port;
    default = 9090;
    description = "TCP port the Prometheus server listens on";
  };

  description = mkOption {
    type = types.str;
    default = "Prometheus Monitoring Server";
    description = "Human-readable description of this service";
  };

  # Prometheus-specific options
  retention = mkOption {
    type = types.str;
    default = "15d";
    description = "How long to retain metrics data";
  };

  alertmanager = {
    enable = mkOption {
      type = types.bool;
      default = true;
      description = "Enable integration with Alertmanager";
    };
    url = mkOption {
      type = types.str;
      default = "alertmanager.${homelabCfg.domain}:9093";
      # The default depends on `config`, so give the docs a literal rendering
      # instead of forcing evaluation of the configuration.
      defaultText = literalExpression ''"alertmanager.''${config.homelab.domain}:9093"'';
      description = "Alertmanager address as host:port; used verbatim as a static_configs target";
    };
  };

  extraScrapeConfigs = mkOption {
    type = types.listOf types.attrs;
    default = [];
    description = "Additional scrape configurations";
  };

  extraAlertingRules = mkOption {
    type = types.listOf types.attrs;
    default = [];
    description = "Additional alerting rules";
  };

  globalConfig = mkOption {
    type = types.attrs;
    default = {
      scrape_interval = "15s";
      evaluation_interval = "15s";
    };
    description = "Global Prometheus configuration";
  };

  extraFlags = mkOption {
    type = types.listOf types.str;
    default = [];
    description = "Extra command line flags";
  };

  ruleFiles = mkOption {
    type = types.listOf types.path;
    default = [];
    description = "Additional rule files to load";
  };

  systemdServices = mkOption {
    type = types.listOf types.str;
    default = [
      "prometheus.service"
      "prometheus"
    ];
    description = "Systemd services to monitor";
  };
};
# Service configuration with smart defaults.
# Nothing below takes effect unless `cfg.enable` is set.
config = let
  # Write a single rule group to its own store file; the content is JSON and
  # the file is named after the group with a .yml suffix.
  ruleGroupToFile = ruleGroup:
    pkgs.writeText "${ruleGroup.name}.yml" (builtins.toJSON {
      groups = [ruleGroup];
    });

  # Built-in rule groups first, then the user-supplied ones.
  generatedRuleFiles = map ruleGroupToFile (alertingRules ++ cfg.extraAlertingRules);
in
  mkIf cfg.enable (mkMerge [
    # Core Prometheus service
    {
      services.prometheus = {
        enable = true;
        inherit (cfg) port globalConfig extraFlags;
        listenAddress = "0.0.0.0";
        retentionTime = cfg.retention;

        # Fleet-wide generated scrape jobs followed by manual additions
        scrapeConfigs = prometheusScrapeConfigs ++ cfg.extraScrapeConfigs;

        # Generated rule files followed by externally provided ones
        ruleFiles = generatedRuleFiles ++ cfg.ruleFiles;

        # Only wire up Alertmanager when the integration is enabled
        alertmanagers = mkIf cfg.alertmanager.enable [
          {static_configs = [{targets = [cfg.alertmanager.url];}];}
        ];
      };

      networking.firewall.allowedTCPPorts = [cfg.port];

      homelab.services.${serviceName}.monitoring.enable = mkDefault true;
    }

    # Default monitoring settings for Prometheus itself
    (mkIf cfg.monitoring.enable {
      homelab.services.${serviceName}.monitoring = mkDefault {
        metrics = {
          path = "/metrics";
          extraEndpoints = [];
        };
        healthCheck = {
          path = "/-/healthy";
          conditions = ["[STATUS] == 200" "[RESPONSE_TIME] < 1000"];
          extraChecks = [
            {
              name = "prometheus-ready";
              port = cfg.port;
              path = "/-/ready";
              conditions = ["[STATUS] == 200"];
              group = "monitoring";
            }
          ];
        };
        extraLabels = {
          component = "monitoring-server";
          tier = "monitoring";
        };
      };
    })

    # Default log collection settings
    (mkIf cfg.logging.enable {
      homelab.services.${serviceName}.logging = mkDefault {
        files = ["/var/log/prometheus/prometheus.log"];
        parsing = {
          # Expected line shape:
          # ts=2024-01-01T12:00:00.000Z caller=main.go:123 level=info msg="message"
          regex = "^ts=(?P<timestamp>[^ ]+) caller=(?P<caller>[^ ]+) level=(?P<level>\\w+) msg=\"(?P<message>[^\"]*)\"";
          extractFields = ["level" "caller"];
        };
        extraLabels = {
          component = "monitoring-server";
          application = "prometheus";
        };
      };
    })

    # Default reverse-proxy settings; the admin UI is placed behind auth
    (mkIf cfg.proxy.enable {
      homelab.services.${serviceName}.proxy = mkDefault {
        subdomain = "prometheus";
        enableAuth = true;
      };
    })
  ]);
}