dump
All checks were successful
Hello World / test (push) Successful in 12s

This commit is contained in:
plasmagoat 2025-07-05 11:12:20 +02:00
parent 4ed9ba0d24
commit a90630ecb6
98 changed files with 2063 additions and 729 deletions

View file

@ -1,19 +1,25 @@
{ config, pkgs, modulesPath, lib, ... }:
{
services.prometheus.alertmanagers = [ {
scheme = "http";
# path_prefix = "/alertmanager";
static_configs = [ {
targets = [
"localhost:9093"
config,
pkgs,
...
}: {
services.prometheus.alertmanagers = [
{
scheme = "http";
# path_prefix = "/alertmanager";
static_configs = [
{
targets = [
"localhost:9093"
];
}
];
} ];
} ];
}
];
services.prometheus.alertmanager = {
enable = true;
openFirewall = true;
webExternalUrl = "http://monitor.lab:9093"; # optional but helpful
webExternalUrl = "http://monitor.lab:9093"; # optional but helpful
configuration = {
route = {
group_wait = "10s";
@ -39,11 +45,12 @@
telegram_configs = [
{
api_url = "https://api.telegram.org";
# Read the Telegram bot token from the sops-managed secret file instead of
# embedding it in the configuration. `bot_token` expects the token string
# itself, so handing it the secret's path does not authenticate;
# `bot_token_file` is the option that takes a path.
# NOTE(review): `bot_token_file` needs a reasonably recent Alertmanager
# (0.26+, TODO confirm the packaged version). The secret is declared with
# owner "prometheus" in sops.nix - confirm the Alertmanager service user can
# actually read it. The token that was committed in plain text should be
# treated as leaked and rotated.
bot_token_file = config.sops.secrets."telegram-alert-bot-token".path;
chat_id = -1002642560007;
message_thread_id = 4;
parse_mode = "HTML";
send_resolved = false;
send_resolved = true;
message = "{{ template \"telegram.message\". }}";
}
];

View file

@ -162,7 +162,7 @@
"pluginVersion": "7.3.6",
"targets": [
{
"expr": "sum(count_over_time({job=\"/var/log/traefik.log\"} |= \"RequestProtocol\" [$__interval]))",
"expr": "sum(count_over_time({job=\"traefik\"} |= \"RequestProtocol\" [$__interval]))",
"legendFormat": "",
"refId": "A"
}
@ -219,7 +219,7 @@
"pluginVersion": "7.3.6",
"targets": [
{
"expr": "sum by (OriginStatus) (count_over_time({job=\"/var/log/traefik.log\"}|= \"RequestProtocol\" | json | __error__=\"\" [$__interval]))",
"expr": "sum by (OriginStatus) (count_over_time({job=\"traefik\"}|= \"RequestProtocol\" | json | __error__=\"\" [$__interval]))",
"legendFormat": "HTTP Status: {{OriginStatus}}",
"refId": "A"
}
@ -284,7 +284,7 @@
"pluginVersion": "7.3.6",
"targets": [
{
"expr": " sum(rate({job=\"/var/log/traefik.log\"} |~ \"RequestProtocol\" | json | OriginStatus >= 400 |__error__=\"\"[$__interval])) / (sum(rate({job=\"/var/log/traefik.log\"} |~ \"RequestProtocol\" | json | __error__=\"\"[$__interval])) / 100)",
"expr": " sum(rate({job=\"traefik\"} |~ \"RequestProtocol\" | json | OriginStatus >= 400 |__error__=\"\"[$__interval])) / (sum(rate({job=\"traefik\"} |~ \"RequestProtocol\" | json | __error__=\"\"[$__interval])) / 100)",
"legendFormat": "",
"refId": "A"
}
@ -367,7 +367,7 @@
"steppedLine": false,
"targets": [
{
"expr": " sum by (OriginStatus,ServiceName) (count_over_time({job=\"/var/log/traefik.log\"} |~ \"RequestProtocol\" | json | OriginStatus >= 400 |__error__=\"\"[$__interval]))",
"expr": " sum by (OriginStatus,ServiceName) (count_over_time({job=\"traefik\"} |~ \"RequestProtocol\" | json | OriginStatus >= 400 |__error__=\"\"[$__interval]))",
"legendFormat": " {{ServiceName}} / {{OriginStatus}} ",
"refId": "A"
}
@ -474,7 +474,7 @@
"pluginVersion": "7.3.6",
"targets": [
{
"expr": "count(sum by (ClientHost) (count_over_time({job=\"/var/log/traefik.log\"}|= \"RequestProtocol\" | json | __error__=\"\" [$__interval])))",
"expr": "count(sum by (ClientHost) (count_over_time({job=\"traefik\"}|= \"RequestProtocol\" | json | __error__=\"\" [$__interval])))",
"legendFormat": "",
"refId": "A"
}
@ -544,7 +544,7 @@
"pluginVersion": "7.3.6",
"targets": [
{
"expr": "sum_over_time({job=\"/var/log/traefik.log\"}|= \"RequestProtocol\" | json | OriginStatus=200 | unwrap DownstreamContentSize | __error__=\"\" [$__interval])",
"expr": "sum_over_time({job=\"traefik\"}|= \"RequestProtocol\" | json | OriginStatus=200 | unwrap DownstreamContentSize | __error__=\"\" [$__interval])",
"legendFormat": "Bytes sent",
"refId": "A"
}
@ -638,7 +638,7 @@
"strokeWidth": 1,
"targets": [
{
"expr": "sum by (RouterName) (count_over_time({job=\"/var/log/traefik.log\"}|= \"RequestProtocol\" | json | __error__=\"\" [$__interval]))",
"expr": "sum by (RouterName) (count_over_time({job=\"traefik\"}|= \"RequestProtocol\" | json | __error__=\"\" [$__interval]))",
"legendFormat": "{{RouterName}}",
"refId": "A"
}
@ -675,7 +675,7 @@
},
"targets": [
{
"expr": "{job=\"/var/log/traefik.log\"} |= \"RequestProtocol\"| json | line_format \"Status:{{.OriginStatus}} Client From {{.ClientAddr}} {{.RequestMethod}} {{.RequestAddr}}{{.RequestPath}} Route To {{.ServiceAddr}}\"",
"expr": "{job=\"traefik\"} |= \"RequestProtocol\"| json | line_format \"Status:{{.OriginStatus}} Client From {{.ClientAddr}} {{.RequestMethod}} {{.RequestAddr}}{{.RequestPath}} Route To {{.ServiceAddr}}\"",
"legendFormat": "",
"refId": "A"
}
@ -749,7 +749,7 @@
"steppedLine": false,
"targets": [
{
"expr": "quantile_over_time(0.95,{job=\"/var/log/traefik.log\"} |= \"RequestProtocol\"| json | unwrap Duration | __error__=\"\" [$__interval]) by (ServiceName)",
"expr": "quantile_over_time(0.95,{job=\"traefik\"} |= \"RequestProtocol\"| json | unwrap Duration | __error__=\"\" [$__interval]) by (ServiceName)",
"hide": false,
"legendFormat": " {{ ServiceName }}",
"refId": "C"
@ -872,7 +872,7 @@
"steppedLine": false,
"targets": [
{
"expr": "max by (ServiceName) (max_over_time({job=\"/var/log/traefik.log\"} |= \"RequestProtocol\" |json | unwrap Duration | __error__=\"\" [$__interval]))",
"expr": "max by (ServiceName) (max_over_time({job=\"traefik\"} |= \"RequestProtocol\" |json | unwrap Duration | __error__=\"\" [$__interval]))",
"hide": false,
"legendFormat": "{{ ServiceName}}",
"refId": "D"
@ -995,7 +995,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (ServiceName) (sum_over_time({job=\"/var/log/traefik.log\"} |= \"RequestProtocol\" |json | unwrap RequestContentSize | __error__=\"\" [$__interval]))",
"expr": "sum by (ServiceName) (sum_over_time({job=\"traefik\"} |= \"RequestProtocol\" |json | unwrap RequestContentSize | __error__=\"\" [$__interval]))",
"hide": false,
"legendFormat": "{{ ServiceName}}",
"refId": "D"

View file

@ -852,7 +852,7 @@
"uid": "Prometheus"
},
"editorMode": "code",
"expr": "sum by(router) (rate(traefik_router_requests_total[$__rate_interval]))",
"expr": "sum by(service) (rate(traefik_service_requests_total[$__rate_interval]))",
"instant": false,
"legendFormat": "__auto",
"range": true,
@ -949,7 +949,7 @@
"uid": "Prometheus"
},
"editorMode": "code",
"expr": "sum by(service) (rate(traefik_router_request_duration_seconds_count[$__rate_interval]))",
"expr": "sum by(service) (rate(traefik_service_request_duration_seconds_count[$__rate_interval]))",
"instant": false,
"legendFormat": "__auto",
"range": true,

View file

@ -1,17 +1,30 @@
{ config, pkgs, modulesPath, lib, ... }:
{
config,
pkgs,
modulesPath,
lib,
...
}: {
services.grafana.enable = true;
services.grafana.settings.server = {
http_port = 3000;
http_addr = "0.0.0.0";
# Grafana needs to know on which domain and URL it's running
domain = "grafana.lab";
# root_url = "https://monitor.local/grafana/"; # Not needed if it is `https://your.domain/`
# serve_from_sub_path = true;
services.grafana.settings = {
server = {
http_port = 3000;
http_addr = "0.0.0.0";
# Grafana needs to know on which domain and URL it's running
domain = "grafana.procopius.dk";
root_url = "https://grafana.procopius.dk"; # Not needed if it is `https://your.domain/`
# serve_from_sub_path = true;
oauth_auto_login = false;
};
"auth.generic_oauth" = {
enabled = false;
};
"auth" = {
disable_login_form = false;
};
};
networking.firewall.allowedTCPPorts = [ 3000 ];
networking.firewall.allowedTCPPorts = [3000];
services.grafana = {
# declarativePlugins = with pkgs.grafanaPlugins; [ ... ];
@ -33,22 +46,32 @@
type = "loki";
url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}";
}
# Some plugins also can - c.f. https://grafana.com/docs/plugins/yesoreyeram-infinity-datasource/latest/setup/provisioning/
# {
# name = "Infinity";
# type = "yesoreyeram-infinity-datasource";
# }
# But not all - c.f. https://github.com/fr-ser/grafana-sqlite-datasource/issues/141
# Provisioned Grafana datasource for the InfluxDB instance at 127.0.0.1:8086,
# database "proxmox". Authentication is done with a custom HTTP header:
# jsonData.httpHeaderName1 names the header, secureJsonData.httpHeaderValue1
# carries its value.
{
uid = "influxdb";
name = "InfluxDB";
type = "influxdb";
url = "http://127.0.0.1:8086";
access = "proxy";
jsonData = {
dbName = "proxmox";
httpHeaderName1 = "Authorization";
};
secureJsonData = {
# NOTE(review): this API token is a plain literal, so it is committed to the
# repository. Rotate it and load it at runtime instead, e.g. through Grafana's
# provisioning file provider (`$__file{/path/to/secret}`) backed by a sops
# secret that the grafana user can read - TODO confirm which secret to use.
httpHeaderValue1 = "Token iY4MTuqUAVJbBkDUiMde";
};
}
];
# Note: removing attributes from the above `datasources.settings.datasources` is not enough for them to be deleted on `grafana`;
# One needs to use the following option:
# datasources.settings.deleteDatasources = [ { name = "prometheus"; orgId = 1; } { name = "loki"; orgId = 1; } ];
dashboards.settings.providers = [{
name = "my dashboards";
options.path = "/etc/grafana-dashboards";
}];
dashboards.settings.providers = [
{
name = "my dashboards";
options.path = "/etc/grafana-dashboards";
}
];
};
};
@ -59,6 +82,13 @@
mode = "0644";
};
environment.etc."grafana-dashboards/traefik-access.json" = {
source = ./dashboards/traefik-access.json;
user = "grafana";
group = "grafana";
mode = "0644";
};
environment.etc."grafana-dashboards/grafana-traefik.json" = {
source = ./dashboards/grafana-traefik.json;
user = "grafana";

View file

@ -1,14 +1,20 @@
{ config, pkgs, modulesPath, lib, ... }:
{
config,
pkgs,
modulesPath,
lib,
...
}: {
imports = [
../../templates/base.nix
../../secrets/shared-sops.nix
./networking.nix
./prometheus.nix
./influxdb.nix
./grafana.nix
./loki.nix
./alertmanager.nix
./sops.nix
./jellyfin-exporter.nix
];
}

View file

@ -0,0 +1,25 @@
# InfluxDB 2 server with declarative first-time provisioning.
#
# The initial user, organization and bucket are created by the provisioning
# step; the password and API token are read from sops-managed secret files
# rather than being written into this module.
{
  config,
  pkgs,
  modulesPath,
  lib,
  ...
}: {
  # Open the InfluxDB HTTP port to other hosts.
  networking.firewall.allowedTCPPorts = [8086];

  services.influxdb2.enable = true;

  # No server settings are overridden.
  services.influxdb2.settings = {};

  services.influxdb2.provision.enable = true;
  services.influxdb2.provision.initialSetup = {
    organization = "procopius";
    bucket = "proxmox";
    username = "plasmagoat";
    # Paths to the decrypted secrets, resolved by sops-nix.
    passwordFile = config.sops.secrets.influxdb-password.path;
    tokenFile = config.sops.secrets.influxdb-token.path;
  };
}

View file

@ -0,0 +1,14 @@
# Runs the Jellyfin Prometheus exporter as an OCI container and publishes its
# port 9594 on the host. The exporter is pointed at the Jellyfin server at
# http://media.lab:8096.
#
# NOTE(review): the Jellyfin API token is passed as a plain command-line
# literal, so it is committed to the repository. Rotate it and supply it at
# runtime instead (for example via the container's `environmentFiles`, if the
# exporter can take the token from the environment - TODO confirm).
# NOTE(review): the image uses the floating `latest` tag; consider pinning a
# version so rebuilds are reproducible.
{
  virtualisation.oci-containers.containers.jellyfin_exporter = {
    image = "rebelcore/jellyfin-exporter:latest";
    ports = ["9594:9594"];
    cmd = [
      "--jellyfin.address=http://media.lab:8096"
      "--jellyfin.token=f7c89e5aa307434c9b3ecb329e896335"
    ];
  };
}

View file

@ -1,51 +1,137 @@
{ config, pkgs, modulesPath, lib, ... }:
{
config,
pkgs,
modulesPath,
lib,
...
}: let
monitor_hostname = "monitor.lab";
traefik_hostname = "traefik.lab";
sandbox_hostname = "sandbox.lab";
forgejo_hostname = "forgejo.lab";
runner01_hostname = "forgejo-runner-01.lab";
dnsmasq_hostname = "dns.lab";
media_hostname = "media.lab";
mail_hostname = "mail.lab";
keycloak_hostname = "keycloak.lab";
let
monitor_ip = "monitor.lab";
traefik_ip = "traefik.lab";
sandbox_ip = "sandbox.lab";
forgejo_ip = "forgejo.lab";
runner01_ip = "forgejo-runner-01.lab";
dnsmasq_ip = "dns.lab";
monitored_hosts = [
monitor_hostname
traefik_hostname
sandbox_hostname
forgejo_hostname
runner01_hostname
dnsmasq_hostname
media_hostname
mail_hostname
keycloak_hostname
];
generateTargets = port:
map (host: "${host}:${toString port}") monitored_hosts;
instance_relabel_config = [
{
source_labels = ["__address__"];
regex = "([^:]+):\\d+"; # Captures everything before the last colon
target_label = "instance";
replacement = "$1";
}
];
node_exporter_port = 9100;
node_exporter_job = {
job_name = "node";
static_configs = [{targets = generateTargets node_exporter_port;}];
relabel_configs = instance_relabel_config;
};
promtail_port = 9080;
promtail_job = {
job_name = "promtail";
static_configs = [{targets = generateTargets promtail_port;}];
relabel_configs = instance_relabel_config;
};
prometheus_exporter_port = 9100;
postgres_exporter_port = 9187;
prometheus_port = 9090;
alertmanager_port = 9093;
grafana_port = 3000;
promtail_port = 9080;
traefik_monitor_port = 8082;
forgejo_monitor_port = 3000;
dnsmasq_exporter_port = 9153;
exporters = {
node = [
"${monitor_ip}:${toString prometheus_exporter_port}"
"${traefik_ip}:${toString prometheus_exporter_port}"
"${sandbox_ip}:${toString prometheus_exporter_port}"
"${forgejo_ip}:${toString prometheus_exporter_port}"
"${runner01_ip}:${toString prometheus_exporter_port}"
monitoring_infra_job = {
job_name = "monitoring_infra";
static_configs = [
{
targets = [
"${monitor_hostname}:${toString prometheus_port}"
"${monitor_hostname}:${toString alertmanager_port}"
"${monitor_hostname}:${toString grafana_port}"
];
}
];
promtail = [
"${monitor_ip}:${toString promtail_port}"
"${traefik_ip}:${toString promtail_port}"
"${sandbox_ip}:${toString promtail_port}"
"${forgejo_ip}:${toString promtail_port}"
"${runner01_ip}:${toString promtail_port}"
];
grafana = [ "${monitor_ip}:${toString grafana_port}" ];
prometheus = [ "${monitor_ip}:${toString prometheus_port}" ];
alertmanager = [ "${monitor_ip}:${toString alertmanager_port}" ];
traefik = [ "${traefik_ip}:${toString traefik_monitor_port}" ];
gitea = [ "${forgejo_ip}:${toString forgejo_monitor_port}" ];
postgres = [ "${forgejo_ip}:${toString postgres_exporter_port}" ];
dnsmasq = [ "${dnsmasq_ip}:${toString dnsmasq_exporter_port}" ];
relabel_configs = instance_relabel_config;
};
traefik_monitor_port = 8082;
traefik_job = {
job_name = "traefik";
static_configs = [{targets = ["${traefik_hostname}:${toString traefik_monitor_port}"];}];
relabel_configs = instance_relabel_config;
};
forgejo_monitor_port = 3000;
forgejo_job = {
job_name = "forgejo";
static_configs = [{targets = ["${forgejo_hostname}:${toString forgejo_monitor_port}"];}];
relabel_configs = instance_relabel_config;
};
postgres_exporter_port = 9187;
postgres_job = {
job_name = "postgres";
static_configs = [{targets = ["${forgejo_hostname}:${toString postgres_exporter_port}"];}];
relabel_configs = instance_relabel_config;
};
dnsmasq_exporter_port = 9153;
dnsmasq_job = {
job_name = "dnsmasq";
static_configs = [{targets = ["${dnsmasq_hostname}:${toString dnsmasq_exporter_port}"];}];
relabel_configs = instance_relabel_config;
};
# --- Media Stack Scrape Job ---
media_stack_job = {
job_name = "media_stack";
static_configs = [
{
targets = [
"${media_hostname}:9707" # sonarr
"${media_hostname}:9708" # readarr
"${media_hostname}:9709" # radarr
"${media_hostname}:9710" # prowlarr
"${media_hostname}:9711" # lidarr
"${media_hostname}:9712" # bazarr
];
}
];
relabel_configs = instance_relabel_config;
};
jellyfin_port = 8096;
jellyfin_exporter_port = 9594;
jellyfin_job = {
job_name = "jellyfin";
static_configs = [
{
targets = [
"${media_hostname}:${toString jellyfin_port}"
"${monitor_hostname}:${toString jellyfin_exporter_port}"
];
}
];
relabel_configs = instance_relabel_config;
};
in {
networking.firewall.allowedTCPPorts = [ 9090 ];
networking.firewall.allowedTCPPorts = [9090];
services.prometheus = {
enable = true;
@ -61,10 +147,17 @@ in {
"--web.enable-admin-api"
];
scrapeConfigs = lib.mapAttrsToList (job_name: targets: {
inherit job_name;
static_configs = [ { inherit targets; } ];
}) exporters;
scrapeConfigs = [
node_exporter_job
promtail_job
monitoring_infra_job
traefik_job
forgejo_job
postgres_job
dnsmasq_job
media_stack_job
jellyfin_job
];
# 🔔 Alerts provisioning
ruleFiles = [

View file

@ -1,11 +0,0 @@
# /etc/grafana/provisioning/notifiers/contact-points.yml
apiVersion: 1
contactPoints:
- orgId: 1
name: telegram
type: telegram
settings:
bottoken: "__YOUR_BOT_TOKEN__"
chatid: "__YOUR_CHAT_ID__"
disableResolveMessage: false

View file

@ -1,7 +1,18 @@
{ config, lib, ... }:
{
config,
lib,
...
}: {
sops.secrets."telegram-alert-bot-token" = {
sopsFile = ../../secrets/telegram/secrets.yml;
owner = "prometheus";
mode = "0440";
};
sops.secrets."influxdb-password" = {
sopsFile = ../../secrets/influxdb/secrets.yml;
owner = "influxdb2";
};
sops.secrets."influxdb-token" = {
sopsFile = ../../secrets/influxdb/secrets.yml;
owner = "influxdb2";
};
}