208 lines
5.6 KiB
Nix
208 lines
5.6 KiB
Nix
# modules/services/prometheus.nix
|
|
{
|
|
config,
|
|
lib,
|
|
pkgs,
|
|
...
|
|
}:
|
|
with lib; let
|
|
# Homelab-wide settings; this module reads hostname, domain and
# monitoring.endpoints from here.
globalCfg = config.homelab.global;
# Values of the options declared by this module.
cfg = config.homelab.services.prometheus;
|
|
in {
|
|
# User-facing options for the homelab Prometheus wrapper. Everything declared
# here is consumed by the `config` section of this module.
options.homelab.services.prometheus = {
  enable = mkEnableOption "Prometheus monitoring server";

  port = mkOption {
    type = types.port;
    default = 9090;
    description = "Prometheus server port";
  };

  webExternalUrl = mkOption {
    type = types.str;
    default = "http://${globalCfg.hostname}:${toString cfg.port}";
    # The default is derived from other option values. Give the documentation
    # generator a literal expression to print so it never has to evaluate
    # `config` just to render this option.
    defaultText = literalExpression ''"http://''${config.homelab.global.hostname}:''${toString config.homelab.services.prometheus.port}"'';
    example = "https://prometheus.example.com";
    description = "External URL for Prometheus";
  };

  retention = mkOption {
    type = types.str;
    default = "30d";
    example = "90d";
    description = "Data retention period";
  };

  # Appended after the automatically generated "homelab-auto" scrape job.
  scrapeConfigs = mkOption {
    type = types.listOf types.attrs;
    default = [];
    example = literalExpression ''
      [
        {
          job_name = "router";
          static_configs = [ { targets = [ "192.168.1.1:9100" ]; } ];
        }
      ]
    '';
    description = "Additional scrape configurations";
  };

  alertmanager = {
    enable = mkOption {
      type = types.bool;
      default = false;
      description = "Enable Alertmanager integration";
    };

    url = mkOption {
      type = types.str;
      default = "http://localhost:9093";
      example = "https://alerts.example.com/alertmanager";
      description = "Alertmanager URL";
    };
  };
};
|
|
|
|
config = mkIf cfg.enable {
|
|
# Describe this service in the global homelab configuration.
homelab.global.services.prometheus = {
  enable = true;

  # Descriptive metadata
  category = "monitoring";
  description = "Metrics collection and monitoring server";
  tags = [
    "metrics"
    "monitoring"
    "alerting"
  ];

  # Port, priority and dependency information
  ports = [ cfg.port ];
  priority = 20;
  dependencies = [ "node-exporter" ];
};
|
|
|
|
# Configure the upstream NixOS Prometheus service from this module's options.
services.prometheus = let
  # Prometheus expects alertmanager targets as bare "host:port" addresses,
  # while cfg.alertmanager.url is a full URL (default "http://localhost:9093").
  # Split it into scheme, address and optional path. The match result is
  # [ scheme address path-or-null ], or null when the value carries no scheme.
  amUrlParts = builtins.match "(https?)://([^/]+)(/.*)?" cfg.alertmanager.url;
  amHasScheme = amUrlParts != null;
  amScheme =
    if amHasScheme
    then elemAt amUrlParts 0
    else "http";
  # A value without a scheme (already "host:port") is passed through unchanged.
  amAddress =
    if amHasScheme
    then elemAt amUrlParts 1
    else cfg.alertmanager.url;
  amPathPrefix =
    if amHasScheme && elemAt amUrlParts 2 != null
    then elemAt amUrlParts 2
    else "/";
in {
  enable = true;
  port = cfg.port;
  webExternalUrl = cfg.webExternalUrl;

  retentionTime = cfg.retention;

  scrapeConfigs =
    [
      # One static job covering every endpoint registered in the global
      # config; each target is "<hostname>:<endpoint port>".
      # NOTE(review): only `port` is read from each endpoint. Fields such as
      # path, jobName, scrapeInterval and labels (set by the endpoint this
      # module registers) are ignored here — confirm that is intended.
      {
        job_name = "homelab-auto";
        static_configs = [
          {
            targets =
              map (
                endpoint: "${globalCfg.hostname}:${toString endpoint.port}"
              )
              globalCfg.monitoring.endpoints;
          }
        ];
        scrape_interval = "30s";
        metrics_path = "/metrics";
      }
    ]
    # User-supplied jobs come after the generated one.
    ++ cfg.scrapeConfigs;

  # Alertmanager configuration (only defined when the integration is enabled)
  alertmanagers = mkIf cfg.alertmanager.enable [
    {
      scheme = amScheme;
      path_prefix = amPathPrefix;
      static_configs = [
        {
          targets = [amAddress];
        }
      ];
    }
  ];

  # `rules` takes rule text as strings; a file in the store has to be passed
  # through `ruleFiles` instead.
  ruleFiles = [
    # Basic homelab alerting rules
    (pkgs.writeText "homelab-alerts.yml" ''
      groups:
        - name: homelab
          rules:
            - alert: ServiceDown
              expr: up == 0
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: "Service {{ $labels.instance }} is down"
                description: "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 5 minutes."

            - alert: HighMemoryUsage
              expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
              for: 10m
              labels:
                severity: warning
              annotations:
                summary: "High memory usage on {{ $labels.instance }}"
                description: "Memory usage is above 90% on {{ $labels.instance }}"

            - alert: HighDiskUsage
              expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes > 0.85
              for: 5m
              labels:
                severity: warning
              annotations:
                summary: "High disk usage on {{ $labels.instance }}"
                description: "Disk usage is above 85% on {{ $labels.instance }} for filesystem {{ $labels.mountpoint }}"
    '')
  ];
};
|
|
|
|
# Contribute Prometheus' own metrics endpoint to the global endpoint list.
homelab.global.monitoring.endpoints = [
  {
    # Identification
    name = "prometheus";
    jobName = "prometheus";

    # Where and how often to scrape
    inherit (cfg) port;
    path = "/metrics";
    scrapeInterval = "30s";

    # Labels attached to this endpoint
    labels.service = "prometheus";
    labels.role = "monitoring";
  }
];
|
|
|
|
# Contribute a reverse proxy entry, but only when a global domain is set.
homelab.global.reverseProxy.entries = mkIf (globalCfg.domain != null) [
  {
    # Routing
    subdomain = "prometheus";
    path = "/";
    inherit (cfg) port;

    # Access control and transport
    enableSSL = true;
    enableAuth = true;

    # Extra response headers
    customHeaders."X-Frame-Options" = "DENY";
    customHeaders."X-Content-Type-Options" = "nosniff";
  }
];
|
|
|
|
# Contribute a restic backup job for the Prometheus data directory. The hooks
# stop the service before the run and start it again afterwards.
homelab.global.backups.jobs = [
  {
    name = "prometheus-data";
    backend = "restic";
    schedule = "daily";

    # What to back up and what to leave out
    paths = ["/var/lib/prometheus2"];
    excludePatterns = ["*.tmp" "*/wal/*"];

    # How many snapshots to keep per period
    retention.daily = "7";
    retention.weekly = "4";
    retention.monthly = "3";
    retention.yearly = "1";

    # Shell snippets run around the backup
    preHook = ''
      # Stop prometheus temporarily for consistent backup
      systemctl stop prometheus
    '';
    postHook = ''
      # Restart prometheus after backup
      systemctl start prometheus
    '';
  }
];
|
|
|
|
# Allow inbound TCP traffic on the configured Prometheus port.
networking.firewall = {
  allowedTCPPorts = [ cfg.port ];
};
|
|
|
|
# Declare tmpfiles "d" (directory) entries for the Prometheus data and
# configuration directories. Each spec is "<path> <mode> <user> <group>".
systemd.tmpfiles.rules = map (spec: "d ${spec} -") [
  "/var/lib/prometheus2 0755 prometheus prometheus"
  "/etc/prometheus 0755 root root"
];
|
|
};
|
|
}
|