259 lines
7.3 KiB
Nix
259 lines
7.3 KiB
Nix
# Homelab service module for Prometheus.
#
# Takes the standard module arguments and opens the `let` scope shared by the
# rest of the file.
{
  config,
  lib,
  pkgs,
  ...
}:
with lib; let
  # Key under which this service lives in `homelab.services`.
  serviceName = "prometheus";

  # Whole homelab configuration tree.
  homelabCfg = config.homelab;

  # This service's own option values (same path as config.homelab.services.prometheus).
  cfg = homelabCfg.services.${serviceName};
|
|
|
|
# Generate Prometheus scrape configs from global monitoring data.
#
# Every metric entry in `homelab.monitoring.global.allMetrics` is grouped by
# its `jobName`; each group becomes one scrape job whose targets are the
# group's endpoints.
prometheusScrapeConfigs = let
  # All metric endpoints, read from the global monitoring data (no fallback).
  allMetrics = homelabCfg.monitoring.global.allMetrics;

  # Attrset: job name -> non-empty list of endpoints sharing that job name.
  jobGroups = groupBy (m: m.jobName) allMetrics;

  # One static_configs entry per endpoint, carrying that endpoint's labels.
  mkStaticConfig = endpoint: {
    targets = ["${endpoint.host}:${toString endpoint.port}"];
    labels = endpoint.labels;
  };

  # Build the scrape job for one group of endpoints.
  mkScrapeConfig = jobName: endpoints: let
    # groupBy never yields an empty list, so `head` is safe here.
    firstEndpoint = head endpoints;
  in {
    job_name = jobName;
    # A Prometheus job has a single interval and path, so they are taken from
    # the first endpoint of the group. The previous code selected the
    # attribute on the *list* (`endpoints.scrapeInterval or [...]`), which
    # always fell through to the default and ignored the endpoint's value.
    # NOTE(review): assumes `scrapeInterval` and `path`, when present on a
    # metric entry, are strings Prometheus accepts — confirm against the
    # monitoring feature module.
    scrape_interval = firstEndpoint.scrapeInterval or "30s";
    metrics_path = firstEndpoint.path or "/metrics";
    static_configs = map mkStaticConfig endpoints;
  };
in
  mapAttrsToList mkScrapeConfig jobGroups;
|
|
|
|
# Standard alerting rules for homelab.
#
# A list with a single rule group, "homelab.rules". Each group is later
# written to its own rule file, so the shape of every element must stay
# `{ name; rules = [ { alert; expr; for; labels; annotations; } ... ]; }`.
alertingRules = let
  # Assemble one Prometheus alerting rule from a flat description.
  # `duration` becomes the rule's `for` clause and `severity` its only label.
  mkRule = {
    alert,
    expr,
    duration,
    severity,
    summary,
    description,
  }: {
    inherit alert expr;
    for = duration;
    labels = {inherit severity;};
    annotations = {inherit summary description;};
  };
in [
  {
    name = "homelab.rules";
    rules = map mkRule [
      {
        alert = "InstanceDown";
        expr = "up == 0";
        duration = "5m";
        severity = "critical";
        summary = "Instance {{ $labels.instance }} down";
        description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.";
      }
      {
        alert = "HighCPUUsage";
        # rate() instead of irate(): irate only looks at the last two samples,
        # so a brief dip can reset the 10m `for` clause and the alert never fires.
        expr = "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) > 80";
        duration = "10m";
        severity = "warning";
        summary = "High CPU usage on {{ $labels.instance }}";
        description = "CPU usage is above 80% for more than 10 minutes on {{ $labels.instance }}.";
      }
      {
        alert = "HighMemoryUsage";
        expr = "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85";
        duration = "10m";
        severity = "warning";
        summary = "High memory usage on {{ $labels.instance }}";
        description = "Memory usage is above 85% for more than 10 minutes on {{ $labels.instance }}.";
      }
      {
        alert = "DiskSpaceLow";
        expr = "((node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes) * 100 > 90";
        duration = "5m";
        severity = "critical";
        summary = "Disk space low on {{ $labels.instance }}";
        description = "Disk usage is above 90% on {{ $labels.instance }} {{ $labels.mountpoint }}.";
      }
    ];
  }
];
|
|
in {
|
|
imports = [
|
|
(import ../lib/features/monitoring.nix serviceName)
|
|
(import ../lib/features/logging.nix serviceName)
|
|
(import ../lib/features/proxy.nix serviceName)
|
|
];
|
|
|
|
# Core service options
#
# Declares `homelab.services.prometheus.*`. Every option now carries a
# description, and the config-dependent default has a `defaultText` so the
# option can be rendered without evaluating the configuration.
options.homelab.services.${serviceName} = {
  enable = mkEnableOption "Prometheus Monitoring Server";

  port = mkOption {
    type = types.port;
    default = 9090;
    description = "TCP port the Prometheus server listens on";
  };

  description = mkOption {
    type = types.str;
    default = "Prometheus Monitoring Server";
    description = "Human-readable description of this service";
  };

  # Prometheus-specific options
  retention = mkOption {
    type = types.str;
    default = "15d";
    description = "How long to retain metrics data";
  };

  alertmanager = {
    enable = mkOption {
      type = types.bool;
      default = true;
      description = "Enable integration with Alertmanager";
    };

    url = mkOption {
      type = types.str;
      default = "alertmanager.${homelabCfg.domain}:9093";
      # The default depends on `config`, so give the docs a literal form.
      defaultText = literalExpression ''"alertmanager.''${config.homelab.domain}:9093"'';
      # The value is passed as a static_configs target, i.e. host:port
      # without a URL scheme.
      description = "Alertmanager target in host:port form (no URL scheme)";
    };
  };

  extraScrapeConfigs = mkOption {
    type = types.listOf types.attrs;
    default = [];
    description = "Additional scrape configurations";
  };

  extraAlertingRules = mkOption {
    type = types.listOf types.attrs;
    default = [];
    description = "Additional alerting rules";
  };

  globalConfig = mkOption {
    type = types.attrs;
    default = {
      scrape_interval = "15s";
      evaluation_interval = "15s";
    };
    description = "Global Prometheus configuration";
  };

  extraFlags = mkOption {
    type = types.listOf types.str;
    default = [];
    description = "Extra command line flags";
  };

  ruleFiles = mkOption {
    type = types.listOf types.path;
    default = [];
    description = "Additional rule files to load";
  };

  # NOTE(review): the default lists the unit both with and without the
  # ".service" suffix; which form the consumer expects is not visible here.
  systemdServices = mkOption {
    type = types.listOf types.str;
    default = [
      "prometheus.service"
      "prometheus"
    ];
    description = "Systemd services to monitor";
  };
};
|
|
|
|
# Service configuration with smart defaults.
#
# Everything below applies only when `cfg.enable` is set. The first fragment
# configures the Prometheus server itself; the following fragments supply
# default values for the monitoring, logging and proxy features when each of
# those is enabled.
config = let
  # Write one rule group to its own file. The content is JSON, which
  # Prometheus reads as YAML.
  renderRuleGroup = ruleGroup:
    pkgs.writeText "${ruleGroup.name}.yml" (builtins.toJSON {
      groups = [ruleGroup];
    });

  # Built-in rule groups followed by user-supplied ones, as rule files.
  generatedRuleFiles = map renderRuleGroup (alertingRules ++ cfg.extraAlertingRules);
in
  mkIf cfg.enable (mkMerge [
    # Core Prometheus service
    {
      services.prometheus = {
        enable = true;
        inherit (cfg) port globalConfig extraFlags;
        listenAddress = "0.0.0.0";
        retentionTime = cfg.retention;

        # Generated fleet-wide jobs first, then user-defined extras.
        scrapeConfigs = prometheusScrapeConfigs ++ cfg.extraScrapeConfigs;

        # Generated rule files first, then externally provided files.
        ruleFiles = generatedRuleFiles ++ cfg.ruleFiles;

        # Only defined when the Alertmanager integration is switched on.
        alertmanagers = mkIf cfg.alertmanager.enable [
          {
            static_configs = [
              {targets = [cfg.alertmanager.url];}
            ];
          }
        ];
      };

      # NOTE(review): the port is opened in the firewall and bound on all
      # interfaces, so direct access would not go through the proxy's auth —
      # confirm this is intended.
      networking.firewall.allowedTCPPorts = [cfg.port];

      homelab.services.${serviceName}.monitoring.enable = mkDefault true;
    }

    # Default monitoring settings for Prometheus itself
    (mkIf cfg.monitoring.enable {
      homelab.services.${serviceName}.monitoring = mkDefault {
        metrics = {
          path = "/metrics";
          extraEndpoints = [];
        };
        healthCheck = {
          path = "/-/healthy";
          conditions = ["[STATUS] == 200" "[RESPONSE_TIME] < 1000"];
          extraChecks = [
            {
              name = "prometheus-ready";
              port = cfg.port;
              path = "/-/ready";
              conditions = ["[STATUS] == 200"];
              group = "monitoring";
            }
          ];
        };
        extraLabels = {
          component = "monitoring-server";
          tier = "monitoring";
        };
      };
    })

    # Default log collection settings
    (mkIf cfg.logging.enable {
      homelab.services.${serviceName}.logging = mkDefault {
        files = ["/var/log/prometheus/prometheus.log"];
        parsing = {
          # Prometheus log format: ts=2024-01-01T12:00:00.000Z caller=main.go:123 level=info msg="message"
          regex = "^ts=(?P<timestamp>[^ ]+) caller=(?P<caller>[^ ]+) level=(?P<level>\\w+) msg=\"(?P<message>[^\"]*)\"";
          extractFields = ["level" "caller"];
        };
        extraLabels = {
          component = "monitoring-server";
          application = "prometheus";
        };
      };
    })

    # Default reverse-proxy settings
    (mkIf cfg.proxy.enable {
      homelab.services.${serviceName}.proxy = mkDefault {
        subdomain = "prometheus";
        enableAuth = true; # Admin interface needs protection
      };
    })
  ]);
|
|
}
|