From 7dd5043b5d3e77c0bd509b5590d9ee8ed4fdfc57 Mon Sep 17 00:00:00 2001 From: plasmagoat Date: Fri, 6 Jun 2025 23:32:17 +0200 Subject: [PATCH] ready for runners --- nixos/README.md | 6 +- nixos/flake.nix | 13 +- nixos/hosts/dns/dnsmasq.nix | 61 ++ nixos/hosts/{forgejo => dns}/host.nix | 5 +- nixos/hosts/dns/networking.nix | 19 + nixos/hosts/forgejo-runner/host.nix | 9 + nixos/hosts/forgejo-runner/networking.nix | 4 + nixos/hosts/forgejo-runner/runner.nix | 35 + nixos/hosts/forgejo-runner/sops.nix | 7 + nixos/hosts/forgejo/database.nix | 9 +- nixos/hosts/forgejo/forgejo.nix | 5 +- nixos/hosts/forgejo/networking.nix | 8 +- nixos/hosts/forgejo/sops.nix | 19 + nixos/hosts/monitoring/alertmanager.nix | 59 ++ .../dashboards/grafana-traefik.json | 854 ++++++++---------- nixos/hosts/monitoring/grafana.nix | 24 +- nixos/hosts/monitoring/host.nix | 3 + nixos/hosts/monitoring/networking.nix | 19 +- nixos/hosts/monitoring/prometheus.nix | 113 ++- .../provisioning/alerting/alerts.yml | 265 ------ .../provisioning/alerts/loki-alerts.yml | 39 + .../alerts/node-exporter-alerts-v2.yml | 299 ++++++ .../alerts/node-exporter-alerts.yml | 320 +++++++ .../provisioning/alerts/postgres-alerts.yml | 201 +++++ .../provisioning/alerts/prometheus-alerts.yml | 255 ++++++ .../provisioning/alerts/promtail-alerts.yml | 21 + .../provisioning/alerts/traefik-alerts.yml | 30 + .../templates/telegram.markdown.v2.tmpl | 37 + .../provisioning/templates/telegram.tmpl | 28 + nixos/hosts/monitoring/sops.nix | 7 + nixos/hosts/sandbox/networking.nix | 24 +- .../traefik/configuration/middlewares.nix | 10 + nixos/hosts/traefik/configuration/routers.nix | 140 +++ .../hosts/traefik/configuration/services.nix | 38 + nixos/hosts/traefik/configuration/static.nix | 61 ++ nixos/hosts/traefik/networking.nix | 23 +- nixos/hosts/traefik/promtail.nix | 4 +- nixos/hosts/traefik/traefik.nix | 168 +--- nixos/hosts/vpn/README.md | 127 +++ nixos/modules/promtail.nix | 5 +- nixos/secrets/HOWTO.md | 59 ++ nixos/secrets/README.md 
| 109 ++- nixos/secrets/forgejo/runner-secrets.yml | 16 + nixos/secrets/forgejo/secrets.yml | 18 + nixos/secrets/secrets.yaml | 27 - nixos/secrets/shared-sops.nix | 11 + nixos/secrets/shared/secrets.yml | 16 + nixos/secrets/sops.nix | 8 - nixos/secrets/telegram/secrets.yml | 16 + 49 files changed, 2569 insertions(+), 1085 deletions(-) create mode 100644 nixos/hosts/dns/dnsmasq.nix rename nixos/hosts/{forgejo => dns}/host.nix (58%) create mode 100644 nixos/hosts/dns/networking.nix create mode 100644 nixos/hosts/forgejo-runner/host.nix create mode 100644 nixos/hosts/forgejo-runner/networking.nix create mode 100644 nixos/hosts/forgejo-runner/runner.nix create mode 100644 nixos/hosts/forgejo-runner/sops.nix create mode 100644 nixos/hosts/forgejo/sops.nix create mode 100644 nixos/hosts/monitoring/alertmanager.nix delete mode 100644 nixos/hosts/monitoring/provisioning/alerting/alerts.yml create mode 100644 nixos/hosts/monitoring/provisioning/alerts/loki-alerts.yml create mode 100644 nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts-v2.yml create mode 100644 nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts.yml create mode 100644 nixos/hosts/monitoring/provisioning/alerts/postgres-alerts.yml create mode 100644 nixos/hosts/monitoring/provisioning/alerts/prometheus-alerts.yml create mode 100644 nixos/hosts/monitoring/provisioning/alerts/promtail-alerts.yml create mode 100644 nixos/hosts/monitoring/provisioning/alerts/traefik-alerts.yml create mode 100644 nixos/hosts/monitoring/provisioning/templates/telegram.markdown.v2.tmpl create mode 100644 nixos/hosts/monitoring/provisioning/templates/telegram.tmpl create mode 100644 nixos/hosts/monitoring/sops.nix create mode 100644 nixos/hosts/traefik/configuration/middlewares.nix create mode 100644 nixos/hosts/traefik/configuration/routers.nix create mode 100644 nixos/hosts/traefik/configuration/services.nix create mode 100644 nixos/hosts/traefik/configuration/static.nix create mode 100644 
nixos/hosts/vpn/README.md create mode 100644 nixos/secrets/HOWTO.md create mode 100644 nixos/secrets/forgejo/runner-secrets.yml create mode 100644 nixos/secrets/forgejo/secrets.yml delete mode 100644 nixos/secrets/secrets.yaml create mode 100644 nixos/secrets/shared-sops.nix create mode 100644 nixos/secrets/shared/secrets.yml delete mode 100644 nixos/secrets/sops.nix create mode 100644 nixos/secrets/telegram/secrets.yml diff --git a/nixos/README.md b/nixos/README.md index 5805186..f19ef37 100644 --- a/nixos/README.md +++ b/nixos/README.md @@ -1,5 +1,9 @@ nixos-rebuild switch --flake .#traefik --target-host root@192.168.1.171 --verbose nixos-rebuild switch --flake .#proxmox --target-host root@192.168.1.205 --verbose nixos-rebuild switch --flake .#sandbox --target-host root@192.168.1.148 --verbose -nixos-rebuild switch --flake .#monitoring --target-host root@192.168.1.88 --verbose +nixos-rebuild switch --flake .#monitoring --target-host root@monitor.lab --verbose nixos-rebuild switch --flake .#forgejo --target-host root@192.168.1.249 --verbose +nixos-rebuild switch --flake .#dns --target-host root@192.168.1.140 --verbose + +nixos-rebuild switch --flake .#runner --target-host root@forgejo-runner-01.lab --override-input runnerId 01 +nixos-rebuild switch --flake .#runner01 --target-host root@forgejo-runner-01.lab --verbose diff --git a/nixos/flake.nix b/nixos/flake.nix index 9cf7424..7e58a47 100644 --- a/nixos/flake.nix +++ b/nixos/flake.nix @@ -34,9 +34,14 @@ modules = [ ./hosts/sandbox/host.nix ]; }; + dns = nixpkgs.lib.nixosSystem { + inherit system; + modules = [ ./hosts/dns/host.nix ]; + }; + monitoring = nixpkgs.lib.nixosSystem { inherit system; - modules = [ ./hosts/monitoring/host.nix ]; + modules = [ ./hosts/monitoring/host.nix sops-nix.nixosModules.sops ]; }; forgejo = nixpkgs.lib.nixosSystem { @@ -44,6 +49,12 @@ modules = [ ./hosts/forgejo/host.nix sops-nix.nixosModules.sops ]; }; + runner01 = nixpkgs.lib.nixosSystem { + inherit system; + modules = [ 
./hosts/forgejo-runner/host.nix sops-nix.nixosModules.sops ]; + specialArgs.runnerId = "01"; + }; + # dockerHost = pkgs.lib.nixosSystem { # inherit system; # modules = [ diff --git a/nixos/hosts/dns/dnsmasq.nix b/nixos/hosts/dns/dnsmasq.nix new file mode 100644 index 0000000..b076b54 --- /dev/null +++ b/nixos/hosts/dns/dnsmasq.nix @@ -0,0 +1,61 @@ +{ + services.dnsmasq = { + enable = true; + alwaysKeepRunning = true; + settings = { + domain = "lab"; + expand-hosts = true; + domain-needed = true; + + # interface = "eth0"; # Replace with your real interface + bind-interfaces = true; + + local = [ + "/lab/" + "/procopius.dk/" + ]; + bogus-priv = true; + no-resolv = true; + + # no-hosts = true; # Prevent 127.0.0.2 etc from leaking in + server = [ + "8.8.8.8" + "8.8.4.4" + "1.1.1.1" + "1.0.0.1" + ]; + + # Static DNS entry: map hostname to IP (without DHCP) + address = [ + # Static IPs + "/dns.lab/192.168.1.53" + "/traefik.lab/192.168.1.80" + # "/proxmox-01.lab/192.168.1.205" + # "/nas-01.lab/192.168.1.226" + + # Split Horizon DNS + "/procopius.dk/192.168.1.80" + "/.procopius.dk/192.168.1.80" + ]; + + cache-size = 10000; + + dhcp-authoritative = true; + dhcp-range = "192.168.1.100,192.168.1.254,12h"; + dhcp-host = "bc:24:11:58:f5:da,dns,192.168.1.53"; + # "Use 192.168.1.53 as your DNS server." 
+ dhcp-option = [ + "option:router,192.168.1.1" # router + "option:dns-server,192.168.1.53" # DNS server (this VM) + ]; + + log-queries = true; + localise-queries = true; + log-async = true; + # log-facility = "/var/log/dnsmasq/dnsmasq.log"; + }; + }; + + services.prometheus.exporters.dnsmasq.enable = true; + services.prometheus.exporters.dnsmasq.openFirewall = true; +} diff --git a/nixos/hosts/forgejo/host.nix b/nixos/hosts/dns/host.nix similarity index 58% rename from nixos/hosts/forgejo/host.nix rename to nixos/hosts/dns/host.nix index d8257e4..d346b22 100644 --- a/nixos/hosts/forgejo/host.nix +++ b/nixos/hosts/dns/host.nix @@ -3,10 +3,7 @@ { imports = [ ../../templates/base.nix - ../../secrets/sops.nix ./networking.nix - ./storage.nix - ./forgejo.nix - ./database.nix + ./dnsmasq.nix ]; } diff --git a/nixos/hosts/dns/networking.nix b/nixos/hosts/dns/networking.nix new file mode 100644 index 0000000..0942d99 --- /dev/null +++ b/nixos/hosts/dns/networking.nix @@ -0,0 +1,19 @@ +{ + networking.hostName = "dns"; + # networking.useHostResolvConf = false; + # networking.interfaces.eth0.useDHCP = true; + networking.interfaces.eth0.ipv4.addresses = [{ + address = "192.168.1.53"; + prefixLength = 24; + }]; + + networking.defaultGateway = "192.168.1.1"; # your router + networking.nameservers = [ "8.8.8.8" ]; # fallback resolvers + + networking.firewall.allowedTCPPorts = [ 53 67 80 443 ]; + networking.firewall.allowedUDPPorts = [ 53 67 ]; + + networking.hosts = { + "192.168.1.53" = [ "dns" "dns.lab" ]; + }; +} diff --git a/nixos/hosts/forgejo-runner/host.nix b/nixos/hosts/forgejo-runner/host.nix new file mode 100644 index 0000000..7c4e256 --- /dev/null +++ b/nixos/hosts/forgejo-runner/host.nix @@ -0,0 +1,9 @@ +{ + imports = [ + ../../templates/base.nix + ../../secrets/shared-sops.nix + ./runner.nix + ./networking.nix + ./sops.nix + ]; +} diff --git a/nixos/hosts/forgejo-runner/networking.nix b/nixos/hosts/forgejo-runner/networking.nix new file mode 100644 index 
0000000..df98995 --- /dev/null +++ b/nixos/hosts/forgejo-runner/networking.nix @@ -0,0 +1,4 @@ +{ config, lib, pkgs, runnerId, ... }: +{ + networking.hostName = "forgejo-runner-${runnerId}"; +} diff --git a/nixos/hosts/forgejo-runner/runner.nix b/nixos/hosts/forgejo-runner/runner.nix new file mode 100644 index 0000000..2d1a40b --- /dev/null +++ b/nixos/hosts/forgejo-runner/runner.nix @@ -0,0 +1,35 @@ +{ config, pkgs,... }: + +{ + # users.users.forgejo-runner = { + # isSystemUser = true; + # extraGroups = [ "docker" ]; # Optional: if using docker jobs + # }; + + services.gitea-actions-runner = { + package = pkgs.forgejo-actions-runner; + instances.default = { + enable = true; + name = config.networking.hostName; + url = "https://git.procopius.dk"; + # Obtaining the path to the runner token file may differ + # tokenFile should be in format TOKEN=, since it's EnvironmentFile for systemd + tokenFile = config.sops.secrets."forgejo-runner-registration-token".path; + labels = [ + "ubuntu-latest:docker://node:16-bullseye" + # "ubuntu-22.04:docker://node:16-bullseye" + # "ubuntu-20.04:docker://node:16-bullseye" + # "ubuntu-18.04:docker://node:16-buster" + ## optionally provide native execution on the host: + # "native:host" + ]; + }; + }; + + # systemd.services."forgejo-actions-runner-default".serviceConfig = { + # User = "forgejo-runner"; + # Group = "forgejo-runner"; + # }; + + virtualisation.docker.enable = true; # Optional: if using docker +} diff --git a/nixos/hosts/forgejo-runner/sops.nix b/nixos/hosts/forgejo-runner/sops.nix new file mode 100644 index 0000000..1cbbeaf --- /dev/null +++ b/nixos/hosts/forgejo-runner/sops.nix @@ -0,0 +1,7 @@ +{ config, lib, ... 
}: +{ + sops.secrets."forgejo-runner-registration-token" = { + sopsFile = ../../secrets/forgejo/runner-secrets.yml; + mode = "0440"; + }; +} diff --git a/nixos/hosts/forgejo/database.nix b/nixos/hosts/forgejo/database.nix index 6dc9fe4..9d6f23e 100644 --- a/nixos/hosts/forgejo/database.nix +++ b/nixos/hosts/forgejo/database.nix @@ -21,11 +21,6 @@ ''; }; - services.prometheus.exporters.postgres = { - enable = true; - listenAddress = "0.0.0.0"; - port = 9187; - }; - networking.firewall.allowedTCPPorts = [ 9187 ]; - + services.prometheus.exporters.postgres.enable = true; + services.prometheus.exporters.postgres.openFirewall = true; } diff --git a/nixos/hosts/forgejo/forgejo.nix b/nixos/hosts/forgejo/forgejo.nix index 236c250..633100c 100644 --- a/nixos/hosts/forgejo/forgejo.nix +++ b/nixos/hosts/forgejo/forgejo.nix @@ -47,14 +47,11 @@ in security = { INSTALL_LOCK = true; - SECRET_KEY = "changeme"; # can be another secret + SECRET_KEY = config.sops.secrets."forgejo-secret-key".path; # NOTE(review): .path is the location of the decrypted secret file, not its contents - confirm Forgejo receives the key itself and not this path string (services.forgejo.secrets.security.SECRET_KEY looks like the path-based option; TODO verify) }; }; }; - sops.secrets.forgejo-admin-password.owner = "forgejo"; - sops.secrets.forgejo-db-password.owner = "forgejo"; - systemd.services.forgejo.preStart = let adminCmd = "${lib.getExe cfg.package} admin user"; user = "plasmagoat"; # Note, Forgejo doesn't allow creation of an account named "admin" diff --git a/nixos/hosts/forgejo/networking.nix b/nixos/hosts/forgejo/networking.nix index 2f076b7..6e9eb1c 100644 --- a/nixos/hosts/forgejo/networking.nix +++ b/nixos/hosts/forgejo/networking.nix @@ -1,6 +1,4 @@ -{ config, lib, pkgs, ... }: { - - networking = { - hostName = "forgejo"; - }; +{ config, lib, pkgs, ... 
}: +{ + networking.hostName = "forgejo"; } diff --git a/nixos/hosts/forgejo/sops.nix b/nixos/hosts/forgejo/sops.nix new file mode 100644 index 0000000..7e96908 --- /dev/null +++ b/nixos/hosts/forgejo/sops.nix @@ -0,0 +1,19 @@ +let + forgejoSops = ../../secrets/forgejo/secrets.yml; +in +{ + sops.secrets = { + "forgejo-admin-password" = { + sopsFile = forgejoSops; + owner = "forgejo"; + }; + "forgejo-db-password" = { + sopsFile = forgejoSops; + owner = "forgejo"; + }; + "forgejo-secret-key" = { + sopsFile = forgejoSops; + owner = "forgejo"; + }; + }; +} diff --git a/nixos/hosts/monitoring/alertmanager.nix b/nixos/hosts/monitoring/alertmanager.nix new file mode 100644 index 0000000..17d0649 --- /dev/null +++ b/nixos/hosts/monitoring/alertmanager.nix @@ -0,0 +1,59 @@ +{ config, pkgs, modulesPath, lib, ... }: + +{ + services.prometheus.alertmanagers = [ { + scheme = "http"; + # path_prefix = "/alertmanager"; + static_configs = [ { + targets = [ + "localhost:9093" + ]; + } ]; + } ]; + services.prometheus.alertmanager = { + enable = true; + openFirewall = true; + webExternalUrl = "http://monitor.lab:9093"; # optional but helpful + configuration = { + route = { + group_wait = "10s"; + group_interval = "30s"; + repeat_interval = "30m"; + receiver = "telegram"; + + routes = [ + { + receiver = "telegram"; + group_wait = "10s"; + match_re = { + severity = "critical|warning"; + }; + continue = true; + } + ]; + }; + + receivers = [ + { + name = "telegram"; + telegram_configs = [ + { + api_url = "https://api.telegram.org"; + bot_token_file = config.sops.secrets."telegram-alert-bot-token".path; # sops exposes a file path, so use the *_file key; the token is read at runtime + chat_id = -1002642560007; + message_thread_id = 4; + parse_mode = "HTML"; + send_resolved = false; + message = "{{ template \"telegram.message\". 
}}"; + } + ]; + } + ]; + + templates = [ + (pkgs.writeText "telegram.tmpl" (builtins.readFile ./provisioning/templates/telegram.tmpl)) + (pkgs.writeText "telegram.markdown.v2.tmpl" (builtins.readFile ./provisioning/templates/telegram.markdown.v2.tmpl)) + ]; + }; + }; +} diff --git a/nixos/hosts/monitoring/dashboards/grafana-traefik.json b/nixos/hosts/monitoring/dashboards/grafana-traefik.json index f6c4f35..c03e56e 100644 --- a/nixos/hosts/monitoring/dashboards/grafana-traefik.json +++ b/nixos/hosts/monitoring/dashboards/grafana-traefik.json @@ -1,42 +1,12 @@ { - "__inputs": [], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "7.5.5" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "panel", - "id": "piechart", - "name": "Pie chart v2", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], "annotations": { "list": [ { "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", @@ -47,14 +17,13 @@ }, "description": "Traefik dashboard prometheus", "editable": true, - "gnetId": 4475, + "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, - "iteration": 1620932097756, + "id": 2, "links": [], "panels": [ { - "datasource": null, + "collapsed": false, "gridPos": { "h": 1, "w": 24, @@ -62,32 +31,29 @@ "y": 0 }, "id": 10, + "panels": [], "title": "$backend stats", "type": "row" }, { - "cacheTimeout": null, - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, "decimals": 0, "mappings": [], - 
"thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, "unit": "short" }, "overrides": [] @@ -99,8 +65,6 @@ "y": 1 }, "id": 2, - "interval": null, - "links": [], "maxDataPoints": 3, "options": { "displayLabels": [], @@ -108,6 +72,7 @@ "calcs": [], "displayMode": "table", "placement": "right", + "showLegend": true, "values": ["value", "percent"] }, "pieType": "pie", @@ -116,10 +81,20 @@ "fields": "", "values": false }, - "text": {} + "text": {}, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } }, + "pluginVersion": "12.0.0+security-01", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "traefik_service_requests_total{service=\"$service\"}", "format": "time_series", @@ -133,22 +108,42 @@ "type": "piechart" }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": ["#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"], - "datasource": "Prometheus", - "fieldConfig": { - "defaults": {}, - "overrides": [] + "datasource": { + "type": "prometheus", + "uid": "prometheus" }, - "format": "ms", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] }, "gridPos": { "h": 7, @@ -157,42 +152,29 @@ "y": 1 }, "id": 4, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], "maxDataPoints": 100, - 
"nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true }, - "tableColumn": "", + "pluginVersion": "12.0.0+security-01", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "sum(traefik_service_request_duration_seconds_sum{service=\"$service\"}) / sum(traefik_service_requests_total{service=\"$service\"}) * 1000", "format": "time_series", @@ -202,68 +184,97 @@ "refId": "A" } ], - "thresholds": "", "title": "$service response time", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" + "type": "stat" }, { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "fieldConfig": { - "defaults": {}, + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": 
"linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, "overrides": [] }, - "fill": 1, - "fillGradient": 0, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 8 }, - "hiddenSeries": false, "id": 3, - "legend": { - "alignAsTable": true, - "avg": true, - "current": false, - "max": true, - "min": true, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "null", "options": { - "alertThreshold": true + "alertThreshold": true, + "legend": { + "calcs": ["mean", "max", "min"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } }, - "percentage": false, - "pluginVersion": "7.5.5", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "pluginVersion": "12.0.0+security-01", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "sum(rate(traefik_service_requests_total{service=\"$service\"}[5m]))", "format": "time_series", @@ -273,50 +284,11 @@ "refId": "A" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, "title": "Total requests over 5min $service", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": 
null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "type": "timeseries" }, { "collapsed": false, - "datasource": null, "gridPos": { "h": 1, "w": 24, @@ -329,215 +301,54 @@ "type": "row" }, { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "Prometheus", - "fieldConfig": { - "defaults": {}, - "overrides": [] + "datasource": { + "type": "prometheus", + "uid": "prometheus" }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 16 - }, - "hiddenSeries": false, - "id": 5, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": true, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.5", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code=\"200\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{method}} : {{code}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Status code 200 over 5min", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - 
"dashLength": 10, - "dashes": false, - "datasource": "Prometheus", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 16 - }, - "hiddenSeries": false, - "id": 6, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": true, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.5", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code!=\"200\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ method }} : {{code}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Others status code over 5min", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "cacheTimeout": null, - "datasource": "Prometheus", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, - "decimals": 0, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "bars", + "fillOpacity": 100, 
+ "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -549,6 +360,165 @@ }, "overrides": [] }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "alertThreshold": true, + "legend": { + "calcs": ["lastNotNull", "max", "min"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.0+security-01", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code=\"200\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{method}} : {{code}}", + "refId": "A" + } + ], + "title": "Status code 200 over 5min", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + 
"stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 6, + "options": { + "alertThreshold": true, + "legend": { + "calcs": ["lastNotNull", "max", "min"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.0+security-01", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\",code!=\"200\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ method }} : {{code}}", + "refId": "A" + } + ], + "title": "Others status code over 5min", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "decimals": 0, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, "gridPos": { "h": 7, "w": 12, @@ -556,8 +526,6 @@ "y": 23 }, "id": 7, - "interval": null, - "links": [], "maxDataPoints": 3, "options": { "displayLabels": [], @@ -565,6 +533,7 @@ "calcs": [], "displayMode": "table", "placement": "right", + "showLegend": true, "values": ["value"] }, "pieType": "pie", @@ -573,10 +542,20 @@ "fields": "", "values": false }, - "text": {} + "text": {}, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } }, + "pluginVersion": "12.0.0+security-01", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": 
"sum(rate(traefik_service_requests_total[5m])) by (service) ", "format": "time_series", @@ -590,28 +569,24 @@ "type": "piechart" }, { - "cacheTimeout": null, - "datasource": "Prometheus", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, "decimals": 0, "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, "unit": "short" }, "overrides": [] @@ -623,8 +598,6 @@ "y": 23 }, "id": 8, - "interval": null, - "links": [], "maxDataPoints": 3, "options": { "displayLabels": [], @@ -632,6 +605,7 @@ "calcs": [], "displayMode": "table", "placement": "right", + "showLegend": true, "values": ["value"] }, "pieType": "pie", @@ -640,10 +614,20 @@ "fields": "", "values": false }, - "text": {} + "text": {}, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } }, + "pluginVersion": "12.0.0+security-01", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "exemplar": true, "expr": "sum(rate(traefik_entrypoint_requests_total{entrypoint =~ \"$entrypoint\"}[5m])) by (entrypoint) ", "format": "time_series", @@ -657,48 +641,32 @@ "type": "piechart" } ], - "schemaVersion": 27, - "style": "dark", + "preload": false, + "schemaVersion": 41, "tags": ["traefik", "prometheus"], "templating": { "list": [ { - "allValue": null, "current": {}, "datasource": "Prometheus", - "definition": "label_values(service)", - "description": null, - "error": null, - "hide": 0, + "definition": "label_values({job=\"traefik\"},service)", "includeAll": false, - "label": null, - "multi": false, "name": "service", "options": [], "query": { - "query": "label_values(service)", - "refId": "StandardVariableQuery" + "qryType": 1, + "query": "label_values({job=\"traefik\"},service)", + 
"refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { - "allValue": null, "current": {}, "datasource": "Prometheus", "definition": "", - "description": null, - "error": null, - "hide": 0, "includeAll": true, - "label": null, "multi": true, "name": "entrypoint", "options": [], @@ -708,13 +676,7 @@ }, "refresh": 1, "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" } ] }, @@ -722,23 +684,9 @@ "from": "now-1h", "to": "now" }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] - }, + "timepicker": {}, "timezone": "", "title": "Traefik", "uid": "qPdAviJmz", - "version": 10 + "version": 1 } diff --git a/nixos/hosts/monitoring/grafana.nix b/nixos/hosts/monitoring/grafana.nix index 501e0df..6381378 100644 --- a/nixos/hosts/monitoring/grafana.nix +++ b/nixos/hosts/monitoring/grafana.nix @@ -6,7 +6,7 @@ http_port = 3000; http_addr = "0.0.0.0"; # Grafana needs to know on which domain and URL it's running - domain = "grafana.procopius.dk"; + domain = "grafana.lab"; # root_url = "https://monitor.local/grafana/"; # Not needed if it is `https://your.domain/` # serve_from_sub_path = true; }; @@ -45,12 +45,6 @@ # One needs to use the following option: # datasources.settings.deleteDatasources = [ { name = "prometheus"; orgId = 1; } { name = "loki"; orgId = 1; } ]; - alerting.rules.path = "/etc/grafana/provisioning/alerting/alerts.yml"; - - # notifiers.settings = { - # path = "/etc/grafana/provisioning/notifiers"; - # }; - dashboards.settings.providers = [{ name = "my dashboards"; options.path = "/etc/grafana-dashboards"; @@ -99,20 +93,4 
@@ group = "grafana"; mode = "0644"; }; - - # 🔔 Alerts provisioning - environment.etc."grafana/provisioning/alerting/alerts.yml" = { - source = ./provisioning/alerting/alerts.yml; - user = "grafana"; - group = "grafana"; - mode = "0644"; - }; - - # 📬 Contact point provisioning - environment.etc."grafana/provisioning/notifiers/contact-points.yml" = { - source = ./provisioning/notifiers/contact-points.yml; - user = "grafana"; - group = "grafana"; - mode = "0644"; - }; } diff --git a/nixos/hosts/monitoring/host.nix b/nixos/hosts/monitoring/host.nix index d2114c4..bccda47 100644 --- a/nixos/hosts/monitoring/host.nix +++ b/nixos/hosts/monitoring/host.nix @@ -3,9 +3,12 @@ { imports = [ ../../templates/base.nix + ../../secrets/shared-sops.nix ./networking.nix ./prometheus.nix ./grafana.nix ./loki.nix + ./alertmanager.nix + ./sops.nix ]; } diff --git a/nixos/hosts/monitoring/networking.nix b/nixos/hosts/monitoring/networking.nix index 7e6e7a9..a8fd74e 100644 --- a/nixos/hosts/monitoring/networking.nix +++ b/nixos/hosts/monitoring/networking.nix @@ -1,17 +1,4 @@ -{ config, lib, pkgs, ... }: { - - networking = { - hostName = "monitor"; - # interfaces.eth0 = { - # ipv4.addresses = [{ - # address = "192.168.1.171"; - # prefixLength = 24; - # }]; - # }; - # firewall.allowedTCPPorts = [ 80 3000 9090 ]; - # defaultGateway = { - # address = "192.168.1.1"; - # interface = "eth0"; - # }; - }; +{ config, lib, pkgs, ... }: +{ + networking.hostName = "monitor"; } diff --git a/nixos/hosts/monitoring/prometheus.nix b/nixos/hosts/monitoring/prometheus.nix index ac34e0e..6eed8bc 100644 --- a/nixos/hosts/monitoring/prometheus.nix +++ b/nixos/hosts/monitoring/prometheus.nix @@ -1,18 +1,46 @@ { config, pkgs, modulesPath, lib, ...
}: let - monitor_ip = "monitor.local"; - traefik_ip = "traefik.local"; - sandbox_ip = "sandbox.local"; - forgejo_ip = "forgejo.local"; + monitor_ip = "monitor.lab"; + traefik_ip = "traefik.lab"; + sandbox_ip = "sandbox.lab"; + forgejo_ip = "forgejo.lab"; + dnsmasq_ip = "dns.lab"; prometheus_exporter_port = 9100; postgres_exporter_port = 9187; prometheus_port = 9090; + alertmanager_port = 9093; grafana_port = 3000; promtail_port = 9080; traefik_monitor_port = 8082; forgejo_monitor_port = 3000; + dnsmasq_exporter_port = 9153; + + exporters = { + node = [ + "${monitor_ip}:${toString prometheus_exporter_port}" + "${traefik_ip}:${toString prometheus_exporter_port}" + "${sandbox_ip}:${toString prometheus_exporter_port}" + "${forgejo_ip}:${toString prometheus_exporter_port}" + ]; + promtail = [ + "${monitor_ip}:${toString promtail_port}" + "${traefik_ip}:${toString promtail_port}" + "${sandbox_ip}:${toString promtail_port}" + "${forgejo_ip}:${toString promtail_port}" + ]; + grafana = [ "${monitor_ip}:${toString grafana_port}" ]; + prometheus = [ "${monitor_ip}:${toString prometheus_port}" ]; + alertmanager = [ "${monitor_ip}:${toString alertmanager_port}" ]; + traefik = [ "${traefik_ip}:${toString traefik_monitor_port}" ]; + gitea = [ "${forgejo_ip}:${toString forgejo_monitor_port}" ]; + postgres = [ "${forgejo_ip}:${toString postgres_exporter_port}" ]; + + dnsmasq = [ "${dnsmasq_ip}:${toString dnsmasq_exporter_port}" ]; + }; + + in { networking.firewall.allowedTCPPorts = [ 9090 ]; @@ -22,64 +50,27 @@ in { globalConfig = { scrape_timeout = "10s"; scrape_interval = "30s"; + # A short evaluation_interval will check alerting rules very often. + # It can be costly if you run Prometheus with 100+ alerts. 
+ evaluation_interval = "20s"; }; - scrapeConfigs = [ - { - job_name = "node"; - static_configs = [ - { - targets = [ - "${monitor_ip}:${toString prometheus_exporter_port}" - "${traefik_ip}:${toString prometheus_exporter_port}" - "${sandbox_ip}:${toString prometheus_exporter_port}" - "${forgejo_ip}:${toString prometheus_exporter_port}" - ]; - } - ]; - } - { - job_name = "grafana"; - static_configs = [ - { targets = [ "${monitor_ip}:${toString grafana_port}" ]; } - ]; - } - { - job_name = "prometheus"; - static_configs = [ - { targets = [ "${monitor_ip}:${toString prometheus_port}" ]; } - ]; - } - { - job_name = "traefik"; - static_configs = [ - { targets = [ "${traefik_ip}:${toString traefik_monitor_port}" ]; } - ]; - } - { - job_name = "gitea"; - static_configs = [ - { targets = [ "${forgejo_ip}:${toString forgejo_monitor_port}" ]; } - ]; - } - { - job_name = "postgres"; - static_configs = [ - { targets = [ "${forgejo_ip}:${toString postgres_exporter_port}" ]; } - ]; - } - { - job_name = "promtail"; - static_configs = [ - { - targets = [ - "${monitor_ip}:${toString promtail_port}" - "${traefik_ip}:${toString promtail_port}" - "${sandbox_ip}:${toString promtail_port}" - "${forgejo_ip}:${toString promtail_port}" - ]; - } - ]; - } + extraFlags = [ + "--web.enable-admin-api" + ]; + + scrapeConfigs = lib.mapAttrsToList (job_name: targets: { + inherit job_name; + static_configs = [ { inherit targets; } ]; + }) exporters; + + # 🔔 Alerts provisioning + ruleFiles = [ + (pkgs.writeText "prometheus-alerts.yml" (builtins.readFile ./provisioning/alerts/prometheus-alerts.yml)) + (pkgs.writeText "loki-alerts.yml" (builtins.readFile ./provisioning/alerts/loki-alerts.yml)) + (pkgs.writeText "promtail-alerts.yml" (builtins.readFile ./provisioning/alerts/promtail-alerts.yml)) + (pkgs.writeText "postgres-alerts.yml" (builtins.readFile ./provisioning/alerts/postgres-alerts.yml)) + (pkgs.writeText "traefik-alerts.yml" (builtins.readFile ./provisioning/alerts/traefik-alerts.yml)) +
(pkgs.writeText "node-exporter-alerts.yml" (builtins.readFile ./provisioning/alerts/node-exporter-alerts.yml)) ]; }; } diff --git a/nixos/hosts/monitoring/provisioning/alerting/alerts.yml b/nixos/hosts/monitoring/provisioning/alerting/alerts.yml deleted file mode 100644 index a9fbd51..0000000 --- a/nixos/hosts/monitoring/provisioning/alerting/alerts.yml +++ /dev/null @@ -1,265 +0,0 @@ -apiVersion: 1 - -groups: - - name: node-alerts - folder: "VM Alerts" - interval: 1m - rules: - - uid: high_cpu - title: High CPU Usage - condition: A - for: 5m - annotations: - summary: "High CPU on {{ $labels.instance }}" - description: "CPU > 90% for 5 minutes" - labels: - severity: warning - data: - - refId: A - relativeTimeRange: { from: 300, to: 0 } - datasourceUid: prometheus - model: - expr: avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) by (instance) > 0.9 - interval: "" - datasource: { type: prometheus, uid: prometheus } - instant: false - intervalMs: 15000 - maxDataPoints: 43200 - - - uid: high_memory - title: High Memory Usage - condition: A - for: 5m - annotations: - summary: "High memory on {{ $labels.instance }}" - description: "Memory > 90% for 5 minutes" - labels: - severity: warning - data: - - refId: A - relativeTimeRange: { from: 300, to: 0 } - datasourceUid: prometheus - model: - expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.9 - interval: "" - datasource: { type: prometheus, uid: prometheus } - instant: false - intervalMs: 15000 - maxDataPoints: 43200 - - - uid: low_disk - title: Low Disk Space - condition: A - for: 5m - annotations: - summary: "Low disk on {{ $labels.instance }} {{ $labels.mountpoint }}" - description: "< 10% space left" - labels: - severity: critical - data: - - refId: A - relativeTimeRange: { from: 300, to: 0 } - datasourceUid: prometheus - model: - expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.1 - interval: "" - datasource: { type: 
prometheus, uid: prometheus } - instant: false - intervalMs: 15000 - maxDataPoints: 43200 - - - uid: node_down - title: Node is Down - condition: A - for: 1m - annotations: - summary: "{{ $labels.instance }} is DOWN" - description: "Exporter is not responding" - labels: - severity: critical - data: - - refId: A - relativeTimeRange: { from: 60, to: 0 } - datasourceUid: prometheus - model: - expr: up == 0 - interval: "" - datasource: { type: prometheus, uid: prometheus } - instant: true - intervalMs: 15000 - maxDataPoints: 43200 - - - name: prometheus-alerts - folder: "Prometheus Alerts" - interval: 1m - rules: - - uid: prometheus_down - title: Prometheus is Down - condition: A - for: 1m - annotations: - summary: "Prometheus is not responding" - description: "up{job='prometheus'} == 0" - labels: - severity: critical - data: - - refId: A - relativeTimeRange: { from: 60, to: 0 } - datasourceUid: prometheus - model: - expr: up{job="prometheus"} == 0 - interval: "" - datasource: { type: prometheus, uid: prometheus } - instant: true - intervalMs: 15000 - maxDataPoints: 43200 - - - name: traefik-alerts - folder: "Traefik Alerts" - interval: 1m - rules: - - uid: traefik_5xx - title: Traefik 5xx Errors - condition: A - for: 1m - annotations: - summary: "Traefik has a high rate of 5xx responses" - description: "Rate of HTTP 5xx responses is high" - labels: - severity: warning - data: - - refId: A - relativeTimeRange: { from: 60, to: 0 } - datasourceUid: prometheus - model: - expr: rate(traefik_service_requests_total{code=~"5.."}[1m]) > 0 - interval: "" - datasource: { type: prometheus, uid: prometheus } - instant: false - intervalMs: 15000 - maxDataPoints: 43200 - - - name: grafana-alerts - folder: "Grafana Alerts" - interval: 1m - rules: - - uid: grafana_down - title: Grafana is Down - condition: A - for: 1m - annotations: - summary: "Grafana is not responding" - description: "up{job='grafana'} == 0" - labels: - severity: critical - data: - - refId: A - relativeTimeRange: { 
from: 60, to: 0 } - datasourceUid: prometheus - model: - expr: up{job="grafana"} == 0 - interval: "" - datasource: { type: prometheus, uid: prometheus } - instant: true - intervalMs: 15000 - maxDataPoints: 43200 - - - name: postgres-alerts - folder: "Postgres Alerts" - interval: 1m - rules: - - uid: postgres_down - title: Postgres is Down - condition: A - for: 1m - annotations: - summary: "Postgres is not responding" - description: "up{job='postgres'} == 0" - labels: - severity: critical - data: - - refId: A - relativeTimeRange: { from: 60, to: 0 } - datasourceUid: prometheus - model: - expr: up{job="postgres"} == 0 - interval: "" - datasource: { type: prometheus, uid: prometheus } - instant: true - intervalMs: 15000 - maxDataPoints: 43200 - - - name: gitea-alerts - folder: "Gitea Alerts" - interval: 1m - rules: - - uid: gitea_down - title: Gitea is Down - condition: A - for: 1m - annotations: - summary: "Gitea is not responding" - description: "up{job='gitea'} == 0" - labels: - severity: critical - data: - - refId: A - relativeTimeRange: { from: 60, to: 0 } - datasourceUid: prometheus - model: - expr: up{job="gitea"} == 0 - interval: "" - datasource: { type: prometheus, uid: prometheus } - instant: true - intervalMs: 15000 - maxDataPoints: 43200 - - - name: promtail-alerts - folder: "Promtail Alerts" - interval: 1m - rules: - - uid: promtail_down - title: Promtail is Down - condition: A - for: 1m - annotations: - summary: "Promtail is not responding" - description: "up{job='promtail'} == 0" - labels: - severity: critical - data: - - refId: A - relativeTimeRange: { from: 60, to: 0 } - datasourceUid: prometheus - model: - expr: up{job="promtail"} == 0 - interval: "" - datasource: { type: prometheus, uid: prometheus } - instant: true - intervalMs: 15000 - maxDataPoints: 43200 - - - name: logs-alerts - folder: "Logs Alerts" - interval: 1m - rules: - - uid: failed_ssh_logins - title: Failed SSH Logins - condition: A - for: 1m - annotations: - summary: "Too many failed 
SSH login attempts" - description: "Check for brute force login attempts" - labels: - severity: warning - data: - - refId: A - relativeTimeRange: { from: 300, to: 0 } - datasourceUid: prometheus - model: - expr: sum(rate(failed_ssh_logins[5m])) > 5 - interval: "" - datasource: { type: prometheus, uid: prometheus } - instant: false - intervalMs: 15000 - maxDataPoints: 43200 diff --git a/nixos/hosts/monitoring/provisioning/alerts/loki-alerts.yml b/nixos/hosts/monitoring/provisioning/alerts/loki-alerts.yml new file mode 100644 index 0000000..0d4f91e --- /dev/null +++ b/nixos/hosts/monitoring/provisioning/alerts/loki-alerts.yml @@ -0,0 +1,39 @@ +groups: + - name: Loki + + rules: + - alert: LokiProcessTooManyRestarts + expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2' + for: 0m + labels: + severity: warning + annotations: + summary: Loki process too many restarts (instance {{ $labels.instance }}) + description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestErrors + expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10' + for: 15m + labels: + severity: critical + annotations: + summary: Loki request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestPanic + expr: "sum(increase(loki_panic_total[10m])) by (namespace, job) > 0" + for: 5m + labels: + severity: critical + annotations: + summary: Loki request panic (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: LokiRequestLatency + expr: '(histogram_quantile(0.99, 
sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1' + for: 5m + labels: + severity: critical + annotations: + summary: Loki request latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts-v2.yml b/nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts-v2.yml new file mode 100644 index 0000000..97a3390 --- /dev/null +++ b/nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts-v2.yml @@ -0,0 +1,299 @@ +groups: + - name: NodeExporterV2 + rules: + - alert: Node down + expr: up{job="node"} == 0 + for: 2m + labels: + severity: warning + annotations: + title: Node {{ $labels.instance }} is down + description: Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down. + + - alert: HostOutOfMemory + expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: Node memory is filling up (< 10% left)\n VALUE = {{ $value }} + + - alert: HostMemoryUnderMemoryPressure + expr: rate(node_vmstat_pgmajfault[1m]) > 1000 + for: 2m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: The node is under heavy memory pressure.
High rate of major page faults\n VALUE = {{ $value }} + + - alert: HostUnusualNetworkThroughputIn + expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100 + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput in (instance {{ $labels.instance }}) + description: Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }} + + - alert: HostUnusualNetworkThroughputOut + expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100 + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput out (instance {{ $labels.instance }}) + description: Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }} + + - alert: HostUnusualDiskReadRate + expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50 + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk read rate (instance {{ $labels.instance }}) + description: Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }} + + - alert: HostUnusualDiskWriteRate + expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50 + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write rate (instance {{ $labels.instance }}) + description: Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }} + + # Please add ignored mountpoints in node_exporter parameters like + # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". + # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 
+ - alert: HostOutOfDiskSpace + expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host out of disk space (instance {{ $labels.instance }}) + description: Disk is almost full (< 10% left)\n VALUE = {{ $value }} + + # Please add ignored mountpoints in node_exporter parameters like + # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". + # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. + - alert: HostDiskWillFillIn24Hours + expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) + description: Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }} + + - alert: HostOutOfInodes + expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }} + + - alert: HostInodesWillFillIn24Hours + expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 + for: 2m + labels: + severity: warning 
+ annotations: + summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) + description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }} + + - alert: HostUnusualDiskReadLatency + expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk read latency (instance {{ $labels.instance }}) + description: Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }} + + - alert: HostUnusualDiskWriteLatency + expr: rate(node_disk_write_time_seconds_total{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }} + + - alert: HostHighCpuLoad + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80 + for: 0m + labels: + severity: warning + annotations: + summary: Host high CPU load (instance {{ $labels.instance }}) + description: CPU load is > 80%\n VALUE = {{ $value }} + + - alert: HostCpuStealNoisyNeighbor + expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }} + + # 1000 context switches is an arbitrary number. + # Alert threshold depends on nature of application.
+ # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 + - alert: HostContextSwitching + expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000 + for: 0m + labels: + severity: warning + annotations: + summary: Host context switching (instance {{ $labels.instance }}) + description: Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }} + + - alert: HostSwapIsFillingUp + expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80 + for: 2m + labels: + severity: warning + annotations: + summary: Host swap is filling up (instance {{ $labels.instance }}) + description: Swap is filling up (>80%)\n VALUE = {{ $value }} + + - alert: HostSystemdServiceCrashed + expr: node_systemd_unit_state{state="failed"} == 1 + for: 0m + labels: + severity: warning + annotations: + summary: Host SystemD service crashed (instance {{ $labels.instance }}) + description: SystemD service crashed\n VALUE = {{ $value }} + + - alert: HostPhysicalComponentTooHot + expr: node_hwmon_temp_celsius > 75 + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: Physical hardware component too hot\n VALUE = {{ $value }} + + - alert: HostNodeOvertemperatureAlarm + expr: node_hwmon_temp_crit_alarm_celsius == 1 + for: 0m + labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: Physical node temperature alarm triggered\n VALUE = {{ $value }} + + - alert: HostRaidArrayGotInactive + expr: node_md_state{state="inactive"} > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Host RAID array got inactive (instance {{ $labels.instance }}) + description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. 
Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }} + + - alert: HostRaidDiskFailure + expr: node_md_disks{state="failed"} > 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host RAID disk failure (instance {{ $labels.instance }}) + description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }} + + - alert: HostKernelVersionDeviations + expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1 + for: 6h + labels: + severity: warning + annotations: + summary: Host kernel version deviations (instance {{ $labels.instance }}) + description: Different kernel versions are running\n VALUE = {{ $value }} + + - alert: HostOomKillDetected + expr: increase(node_vmstat_oom_kill[1m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: OOM kill detected\n VALUE = {{ $value }} + + - alert: HostEdacCorrectableErrorsDetected + expr: increase(node_edac_correctable_errors_total[1m]) > 0 + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: Instance has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }} + + - alert: HostEdacUncorrectableErrorsDetected + expr: node_edac_uncorrectable_errors_total > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: Instance has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }} + + - alert: HostNetworkReceiveErrors + expr: rate(node_network_receive_errs_total[2m]) / 
rate(node_network_receive_packets_total[2m]) > 0.01 + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }}) + description: Instance interface has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.\n VALUE = {{ $value }} + + - alert: HostNetworkTransmitErrors + expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }}) + description: Instance has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }} + + - alert: HostNetworkInterfaceSaturated + expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 + for: 1m + labels: + severity: warning + annotations: + summary: Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.device }}) + description: The network interface is getting overloaded.\n VALUE = {{ $value }} + + - alert: HostConntrackLimit + expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: The number of conntrack is approaching limit\n VALUE = {{ $value }} + + - alert: HostClockSkew + expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) + for: 2m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: Clock skew detected.
Clock is out of sync.\n VALUE = {{ $value }} + + - alert: HostClockNotSynchronising + expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16 + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: Clock not synchronising.\n VALUE = {{ $value }} diff --git a/nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts.yml b/nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts.yml new file mode 100644 index 0000000..2cd649a --- /dev/null +++ b/nixos/hosts/monitoring/provisioning/alerts/node-exporter-alerts.yml @@ -0,0 +1,320 @@ +groups: + +- name: NodeExporter + + rules: + + - alert: HostOutOfMemory + expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)' + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryUnderMemoryPressure + expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)' + for: 0m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostMemoryIsUnderutilized + expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8' + for: 0m + labels: + severity: info + annotations: + summary: Host Memory is underutilized (instance {{ $labels.instance }}) + description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. 
(instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputIn + expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)' + for: 0m + labels: + severity: warning + annotations: + summary: Host unusual network throughput in (instance {{ $labels.instance }}) + description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualNetworkThroughputOut + expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)' + for: 0m + labels: + severity: warning + annotations: + summary: Host unusual network throughput out (instance {{ $labels.instance }}) + description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadRate + expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)' + for: 0m + labels: + severity: warning + annotations: + summary: Host unusual disk read rate (instance {{ $labels.instance }}) + description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfDiskSpace + expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)' + for: 2m + labels: + severity: critical + annotations: + summary: Host out of disk space (instance {{ $labels.instance }}) + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostDiskMayFillIn24Hours + expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0' + for: 2m + labels: + severity: warning + annotations: + summary: Host disk may fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem will likely run out of space 
within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOutOfInodes + expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)' + for: 2m + labels: + severity: critical + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostFilesystemDeviceError + expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1' + for: 2m + labels: + severity: critical + annotations: + summary: Host filesystem device error (instance {{ $labels.instance }}) + description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostInodesMayFillIn24Hours + expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0' + for: 2m + labels: + severity: warning + annotations: + summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskReadLatency + expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)' + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk read latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskWriteLatency + expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)' + for: 2m + labels: + severity: 
warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostHighCpuLoad + expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80' + for: 10m + labels: + severity: warning + annotations: + summary: Host high CPU load (instance {{ $labels.instance }}) + description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuIsUnderutilized + expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' + for: 1w + labels: + severity: info + annotations: + summary: Host CPU is underutilized (instance {{ $labels.instance }}) + description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuStealNoisyNeighbor + expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuHighIowait + expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskIo + expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk IO (instance {{ $labels.instance }}) + description: "Disk usage >80%. 
Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostContextSwitchingHigh + expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2' + for: 0m + labels: + severity: warning + annotations: + summary: Host context switching high (instance {{ $labels.instance }}) + description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSwapIsFillingUp + expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)' + for: 2m + labels: + severity: warning + annotations: + summary: Host swap is filling up (instance {{ $labels.instance }}) + description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSystemdServiceCrashed + expr: '(node_systemd_unit_state{state="failed"} == 1)' + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostPhysicalComponentTooHot + expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius' + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNodeOvertemperatureAlarm + expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))' + for: 0m + labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: "Physical node temperature alarm triggered\n VALUE = {{ $value 
}}\n LABELS = {{ $labels }}" + + - alert: HostSoftwareRaidInsufficientDrives + expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)' + for: 0m + labels: + severity: critical + annotations: + summary: Host software RAID insufficient drives (instance {{ $labels.instance }}) + description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSoftwareRaidDiskFailure + expr: '(node_md_disks{state="failed"} > 0)' + for: 2m + labels: + severity: warning + annotations: + summary: Host software RAID disk failure (instance {{ $labels.instance }}) + description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostKernelVersionDeviations + expr: 'count by (instance) (count_over_time(node_uname_info[1h])) > 1' + for: 0m + labels: + severity: info + annotations: + summary: Host kernel version deviations (instance {{ $labels.instance }}) + description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostOomKillDetected + expr: '(increase(node_vmstat_oom_kill[1m]) > 0)' + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacCorrectableErrorsDetected + expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)' + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacUncorrectableErrorsDetected + expr: '(node_edac_uncorrectable_errors_total > 
0)' + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkReceiveErrors + expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkTransmitErrors + expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkBondDegraded + expr: '((node_bonding_active - node_bonding_slaves) != 0)' + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostConntrackLimit + expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)' + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = 
{{ $labels }}" + + - alert: HostClockSkew + expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))' + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockNotSynchronising + expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)' + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/nixos/hosts/monitoring/provisioning/alerts/postgres-alerts.yml b/nixos/hosts/monitoring/provisioning/alerts/postgres-alerts.yml new file mode 100644 index 0000000..4757e88 --- /dev/null +++ b/nixos/hosts/monitoring/provisioning/alerts/postgres-alerts.yml @@ -0,0 +1,201 @@ +groups: + - name: Postgres + + rules: + - alert: PostgresqlDown + expr: "pg_up == 0" + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql down (instance {{ $labels.instance }}) + description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlRestarted + expr: "time() - pg_postmaster_start_time_seconds < 60" + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql restarted (instance {{ $labels.instance }}) + description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlExporterError + expr: "pg_exporter_last_scrape_error > 0" + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql exporter error (instance {{ $labels.instance }}) + description: 
"Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTableNotAutoVacuumed + expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10" + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }}) + description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTableNotAutoAnalyzed + expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10" + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql table not auto analyzed (instance {{ $labels.instance }}) + description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTooManyConnections + expr: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)" + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql too many connections (instance {{ $labels.instance }}) + description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlNotEnoughConnections + expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5' + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql not enough connections (instance {{ $labels.instance }}) + description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = 
{{ $labels }}" + + - alert: PostgresqlDeadLocks + expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5' + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql dead locks (instance {{ $labels.instance }}) + description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlHighRollbackRate + expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02' + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql high rollback rate (instance {{ $labels.instance }}) + description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlCommitRateLow + expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5' + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql commit rate low (instance {{ $labels.instance }}) + description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlLowXidConsumption + expr: "rate(pg_txid_current[1m]) < 5" + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql low XID consumption (instance {{ $labels.instance }}) + description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlHighRateStatementTimeout + expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql high rate statement timeout (instance {{ $labels.instance }}) + description: "Postgres transactions showing high rate 
of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlHighRateDeadlock + expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql high rate deadlock (instance {{ $labels.instance }}) + description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlUnusedReplicationSlot + expr: "pg_replication_slots_active == 0" + for: 1m + labels: + severity: warning + annotations: + summary: Postgresql unused replication slot (instance {{ $labels.instance }}) + description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTooManyDeadTuples + expr: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1" + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql too many dead tuples (instance {{ $labels.instance }}) + description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlConfigurationChanged + expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' + for: 0m + labels: + severity: info + annotations: + summary: Postgresql configuration changed (instance {{ $labels.instance }}) + description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlSslCompressionActive + expr: "sum(pg_stat_ssl_compression) > 0" + for: 0m + labels: + severity: critical + annotations: + summary: 
Postgresql SSL compression active (instance {{ $labels.instance }}) + description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTooManyLocksAcquired + expr: "((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20" + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql too many locks acquired (instance {{ $labels.instance }}) + description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlBloatIndexHigh(>80%) + expr: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)" + for: 1h + labels: + severity: warning + annotations: + summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }}) + description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlBloatTableHigh(>80%) + expr: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)" + for: 1h + labels: + severity: warning + annotations: + summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }}) + description: "The table {{ $labels.relname }} is bloated. 
You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlInvalidIndex + expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' + for: 6h + labels: + severity: warning + annotations: + summary: Postgresql invalid index (instance {{ $labels.instance }}) + description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlReplicationLag + expr: "pg_replication_lag_seconds > 5" + for: 30s + labels: + severity: warning + annotations: + summary: Postgresql replication lag (instance {{ $labels.instance }}) + description: "The PostgreSQL replication lag is high (> 5s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/nixos/hosts/monitoring/provisioning/alerts/prometheus-alerts.yml b/nixos/hosts/monitoring/provisioning/alerts/prometheus-alerts.yml new file mode 100644 index 0000000..68952cc --- /dev/null +++ b/nixos/hosts/monitoring/provisioning/alerts/prometheus-alerts.yml @@ -0,0 +1,255 @@ +groups: + - name: Prometheus + + rules: + - alert: PrometheusJobMissing + expr: 'absent(up{job="prometheus"})' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetMissing + expr: "up == 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. 
An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAllTargetsMissing + expr: "sum by (job) (up) == 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus all targets missing (instance {{ $labels.instance }}) + description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetMissingWithWarmupTime + expr: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing with warmup time (instance {{ $labels.instance }}) + description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusConfigurationReloadFailure + expr: "prometheus_config_last_reload_successful != 1" + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) + description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTooManyRestarts + expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus too many restarts (instance {{ $labels.instance }}) + description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerJobMissing + expr: 'absent(up{job="alertmanager"})' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) + description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: "alertmanager_config_last_reload_successful != 1" + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) + description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigNotSynced + expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) + description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerE2eDeadManSwitch + expr: "vector(1)" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) + description: "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotConnectedToAlertmanager + expr: "prometheus_notifications_alertmanagers_discovered < 1" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) + description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationFailures + expr: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTemplateTextExpansionFailures + expr: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationSlow + expr: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds" + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: "Prometheus rule evaluation took more time than the scheduled interval. 
It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotificationsBacklog + expr: "min_over_time(prometheus_notifications_queue_length[10m]) > 0" + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerNotificationFailing + expr: "rate(alertmanager_notifications_failed_total[1m]) > 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) + description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetEmpty + expr: "prometheus_sd_discovered_targets == 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target empty (instance {{ $labels.instance }}) + description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapingSlow + expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05' + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. 
Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusLargeScrape + expr: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10" + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus large scrape (instance {{ $labels.instance }}) + description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapeDuplicate + expr: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0" + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) + description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCheckpointCreationFailures + expr: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCheckpointDeletionFailures + expr: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCompactionsFailed + expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB 
compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbHeadTruncationsFailed + expr: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbReloadFailures + expr: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbWalCorruptions + expr: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbWalTruncationsFailed + expr: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0" + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTimeseriesCardinality + expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus timeseries cardinality (instance {{ $labels.instance }}) + description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS 
= {{ $labels }}" diff --git a/nixos/hosts/monitoring/provisioning/alerts/promtail-alerts.yml b/nixos/hosts/monitoring/provisioning/alerts/promtail-alerts.yml new file mode 100644 index 0000000..39aee40 --- /dev/null +++ b/nixos/hosts/monitoring/provisioning/alerts/promtail-alerts.yml @@ -0,0 +1,21 @@ +groups: + - name: Promtail + + rules: + - alert: PromtailRequestErrors + expr: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10' + for: 5m + labels: + severity: critical + annotations: + summary: Promtail request errors (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PromtailRequestLatency + expr: "histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le, namespace, job, route, instance)) > 1" + for: 5m + labels: + severity: critical + annotations: + summary: Promtail request latency (instance {{ $labels.instance }}) + description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/nixos/hosts/monitoring/provisioning/alerts/traefik-alerts.yml b/nixos/hosts/monitoring/provisioning/alerts/traefik-alerts.yml new file mode 100644 index 0000000..218843c --- /dev/null +++ b/nixos/hosts/monitoring/provisioning/alerts/traefik-alerts.yml @@ -0,0 +1,30 @@ +groups: + - name: Traefik + + rules: + - alert: TraefikServiceDown + expr: "sum(traefik_service_server_up) by (service) == 0" + for: 0m + labels: + severity: critical + annotations: + summary: Traefik service down (service {{ $labels.service }}) + description: "All servers of Traefik service {{ $labels.service }} are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: 
TraefikHighHttp4xxErrorRateService + expr: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5' + for: 1m + labels: + severity: critical + annotations: + summary: Traefik high HTTP 4xx error rate service (instance {{ $labels.instance }}) + description: "Traefik service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: TraefikHighHttp5xxErrorRateService + expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5' + for: 1m + labels: + severity: critical + annotations: + summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }}) + description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/nixos/hosts/monitoring/provisioning/templates/telegram.markdown.v2.tmpl b/nixos/hosts/monitoring/provisioning/templates/telegram.markdown.v2.tmpl new file mode 100644 index 0000000..4f3d3ed --- /dev/null +++ b/nixos/hosts/monitoring/provisioning/templates/telegram.markdown.v2.tmpl @@ -0,0 +1,37 @@ +{{ define "telegram.markdown.message" }} +{{- if eq .Status "firing" -}} + {{- if eq .CommonLabels.severity "critical" -}} + 🔴 Alert: {{ .CommonLabels.alertname }} + {{- else if eq .CommonLabels.severity "warning" -}} + 🟠 Alert: {{ .CommonLabels.alertname }} + {{- else -}} + ⚪️ Alert: {{ .CommonLabels.alertname }} + {{- end }} +Status: 🔥 FIRING +Severity: {{ if eq .CommonLabels.severity "critical" }}🔴 {{ .CommonLabels.severity | title }}{{ else if eq .CommonLabels.severity "warning" }}🟠 {{ .CommonLabels.severity | title }}{{ else }}⚪️ {{ .CommonLabels.severity | title }}{{ end }} +{{- else if eq .Status "resolved" -}} + ⚪️ Alert: {{ .CommonLabels.alertname }} +Status: ✅ RESOLVED +Severity: {{ if eq .CommonLabels.severity "critical" }}🟢 {{ .CommonLabels.severity | title 
}}{{ else if eq .CommonLabels.severity "warning" }}🟒 {{ .CommonLabels.severity | title }}{{ else }}βšͺ️ {{ .CommonLabels.severity | title }}{{ end }} +{{- end }} + +{{- range .Alerts -}} + +{{- if .Labels.job }} +Job: `{{ .Labels.job }}` +{{- end }} + +{{- if .Labels.namespace }} +Namespace: `{{ .Labels.namespace }}` +{{- end }} + +{{- if .Labels.instance }} +Instance: `{{ .Labels.instance }}` +{{- end }} + +{{- if .Annotations.runbook_url }} +[RunbookURL]({{ .Annotations.runbook_url }}) + +{{- end }} +{{- end }} +{{ end }} diff --git a/nixos/hosts/monitoring/provisioning/templates/telegram.tmpl b/nixos/hosts/monitoring/provisioning/templates/telegram.tmpl new file mode 100644 index 0000000..1f814c7 --- /dev/null +++ b/nixos/hosts/monitoring/provisioning/templates/telegram.tmpl @@ -0,0 +1,28 @@ +{{ define "alert_list" }}{{ range . }} +--- +πŸͺͺ {{ .Labels.alertname }} +{{- if eq .Labels.severity "critical" }} +🚨 CRITICAL 🚨 {{ end }} +{{- if eq .Labels.severity "warning" }} +⚠️ WARNING ⚠️{{ end }} +{{- if .Annotations.summary }} +πŸ“ {{ .Annotations.summary }}{{ end }} +{{- if .Annotations.description }} +πŸ“– {{ .Annotations.description }}{{ end }} + +🏷 Labels: +{{ range .Labels.SortedPairs }} {{ .Name }}: {{ .Value }} +{{ end }}{{ end }} +πŸ›  Grafana πŸ’Š Alertmanager πŸ’Š Any other link πŸ›  +{{ end }} + +{{ define "telegram.message" }} +{{ if gt (len .Alerts.Firing) 0 }} +πŸ”₯ Alerts Firing πŸ”₯ +{{ template "alert_list" .Alerts.Firing }} +{{ end }} +{{ if gt (len .Alerts.Resolved) 0 }} +βœ… Alerts Resolved βœ… +{{ template "alert_list" .Alerts.Resolved }} +{{ end }} +{{ end }} diff --git a/nixos/hosts/monitoring/sops.nix b/nixos/hosts/monitoring/sops.nix new file mode 100644 index 0000000..2b0c9ab --- /dev/null +++ b/nixos/hosts/monitoring/sops.nix @@ -0,0 +1,7 @@ +{ config, lib, ... 
}: +{ + sops.secrets."telegram-alert-bot-token" = { + sopsFile = ../../secrets/telegram/secrets.yml; + owner = "prometheus"; + }; +} diff --git a/nixos/hosts/sandbox/networking.nix b/nixos/hosts/sandbox/networking.nix index f0397ab..e1419e9 100644 --- a/nixos/hosts/sandbox/networking.nix +++ b/nixos/hosts/sandbox/networking.nix @@ -1,20 +1,6 @@ -{ config, lib, pkgs, ... }: { - - networking = { - hostName = "sandbox"; - interfaces.eth0 = { - ipv4.addresses = [{ - address = "192.168.1.148"; - prefixLength = 24; - }]; - ipv6.addresses = [{ - address = "fe80::148"; - prefixLength = 64; - }]; - }; - defaultGateway = { - address = "192.168.1.1"; - interface = "eth0"; - }; - }; +{ config, lib, pkgs, ... }: +{ + networking.hostName = "sandbox"; + networking.interfaces.ens18.useDHCP = true; + networking.defaultGateway = "192.168.1.1"; } diff --git a/nixos/hosts/traefik/configuration/middlewares.nix b/nixos/hosts/traefik/configuration/middlewares.nix new file mode 100644 index 0000000..d3995a5 --- /dev/null +++ b/nixos/hosts/traefik/configuration/middlewares.nix @@ -0,0 +1,10 @@ +{ lib, config, ... }: + +let + internalNetwork = "192.168.1.0/24"; # LAN range allowed to reach internal-only routers +in +{ + internal-whitelist = { # name kept: routers reference this middleware by name + ipAllowList.sourceRange = [ internalNetwork ]; # ipWhiteList is deprecated in Traefik v3; ipAllowList is its replacement + }; +} diff --git a/nixos/hosts/traefik/configuration/routers.nix b/nixos/hosts/traefik/configuration/routers.nix new file mode 100644 index 0000000..71a087b --- /dev/null +++ b/nixos/hosts/traefik/configuration/routers.nix @@ -0,0 +1,140 @@ +{ lib, config, ... 
}: + +{ + traefik = { + rule = "Host(`traefik.procopius.dk`)"; + service = "traefik"; + entryPoints = [ "websecure" ]; + middlewares = [ "internal-whitelist" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + proxmox = { + rule = "Host(`proxmox.procopius.dk`)"; + service = "proxmox"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + forgejo = { + rule = "Host(`git.procopius.dk`)"; + service = "forgejo"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + prometheus = { + rule = "Host(`prometheus.procopius.dk`)"; + service = "prometheus"; + entryPoints = [ "websecure" ]; + middlewares = [ "internal-whitelist" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + grafana = { + rule = "Host(`grafana.procopius.dk`)"; + service = "grafana"; + entryPoints = [ "websecure" ]; + middlewares = [ "internal-whitelist" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + alertmanager = { + rule = "Host(`alertmanager.procopius.dk`)"; + service = "alertmanager"; + entryPoints = [ "websecure" ]; + middlewares = [ "internal-whitelist" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + jellyfin = { + rule = "Host(`jellyfin.procopius.dk`)"; + service = "jellyfin"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + sonarr = { + rule = "Host(`sonarr.procopius.dk`)"; + service = "sonarr"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + radarr = { + rule = "Host(`radarr.procopius.dk`)"; + service = "radarr"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + ente = { + rule = "Host(`ente.procopius.dk`)"; + service = "ente"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + photos = { + rule = "Host(`photos.procopius.dk`)"; + service = "photos"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + minio = { + rule = 
"Host(`minio.procopius.dk`)"; + service = "minio"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + minio-api = { + rule = "Host(`minio-api.procopius.dk`)"; + service = "minio-api"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + account = { + rule = "Host(`account.procopius.dk`)"; + service = "account"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + auth = { + rule = "Host(`auth.procopius.dk`)"; + service = "auth"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + nas = { + rule = "Host(`nas.procopius.dk`)"; + service = "nas"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + umami = { + rule = "Host(`umami.procopius.dk`)"; + service = "umami"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + mesterjakob = { + rule = "Host(`mester.jakobblum.dk`)"; + service = "mesterjakob"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; + + catchAll = { + rule = "HostRegexp(`.+`)"; + service = "nginx"; + entryPoints = [ "websecure" ]; + tls = { certResolver = "letsencrypt"; }; + }; +} diff --git a/nixos/hosts/traefik/configuration/services.nix b/nixos/hosts/traefik/configuration/services.nix new file mode 100644 index 0000000..971183d --- /dev/null +++ b/nixos/hosts/traefik/configuration/services.nix @@ -0,0 +1,38 @@ +{ lib, config, ... 
}: + +{ + proxmox.loadBalancer.servers = [ { url = "https://192.168.1.205:8006"; } ]; + proxmox.loadBalancer.serversTransport = "insecureTransport"; + + traefik.loadBalancer.servers = [ { url = "http://localhost:8080"; } ]; + + forgejo.loadBalancer.servers = [ { url = "http://forgejo.lab:3000"; } ]; + + nginx.loadBalancer.servers = [ { url = "https://192.168.1.226:4433"; } ]; + nginx.loadBalancer.serversTransport = "insecureTransport"; + + prometheus.loadBalancer.servers = [ { url = "http://monitor.lab:9090"; } ]; + grafana.loadBalancer.servers = [ { url = "http://monitor.lab:3000"; } ]; + alertmanager.loadBalancer.servers = [ { url = "http://monitor.lab:9093"; } ]; + + + # from nginx + account.loadBalancer.servers = [ { url = "http://192.168.1.226:3001"; } ]; + auth.loadBalancer.servers = [ { url = "http://192.168.1.226:3005"; } ]; + ente.loadBalancer.servers = [ { url = "http://192.168.1.226:8087"; } ]; + photos.loadBalancer.servers = [ { url = "http://192.168.1.226:3000"; } ]; + minio.loadBalancer.servers = [ { url = "http://192.168.1.226:3201"; } ]; + minio-api.loadBalancer.servers = [ { url = "http://192.168.1.226:3200"; } ]; + + nas.loadBalancer.servers = [ { url = "https://192.168.1.226:5001"; } ]; + nas.loadBalancer.serversTransport = "insecureTransport"; + + + jellyfin.loadBalancer.servers = [ { url = "http://192.168.1.226:8096"; } ]; + radarr.loadBalancer.servers = [ { url = "http://192.168.1.226:7878"; } ]; + sonarr.loadBalancer.servers = [ { url = "http://192.168.1.226:8989"; } ]; + + umami.loadBalancer.servers = [ { url = "http://192.168.1.226:3333"; } ]; + + mesterjakob.loadBalancer.servers = [ { url = "http://192.168.1.226:4200"; } ]; +} diff --git a/nixos/hosts/traefik/configuration/static.nix b/nixos/hosts/traefik/configuration/static.nix new file mode 100644 index 0000000..22bd1f4 --- /dev/null +++ b/nixos/hosts/traefik/configuration/static.nix @@ -0,0 +1,61 @@ +{ lib, config, ... 
}: + +{ + entryPoints = { + web = { + address = ":80"; + asDefault = true; + http.redirections.entrypoint = { + to = "websecure"; + scheme = "https"; + }; + }; + + websecure = { + address = ":443"; + http.tls.certResolver = "letsencrypt"; + }; + + metrics = { + address = ":8082"; + }; + }; + + api = { + dashboard = true; + insecure = true; + }; + + certificatesResolvers = { + letsencrypt = { + acme = { + email = "david.mikael@proton.me"; + storage = "/var/lib/traefik/acme.json"; + # httpChallenge = { + # entryPoint = "web"; + # }; + dnsChallenge = { + provider = "cloudflare"; + delayBeforeCheck = 10; + resolvers = [ "1.1.1.1:53" "8.8.8.8:53" ]; + }; + }; + }; + }; + + metrics = { + prometheus = { + entryPoint = "metrics"; + }; + }; + + log = { + level = "DEBUG"; + filePath = "/var/log/traefik/traefik.log"; + }; + + accessLog = { + format = "json"; + filePath = "/var/log/traefik/access.log"; + }; +} diff --git a/nixos/hosts/traefik/networking.nix b/nixos/hosts/traefik/networking.nix index 9d07344..4c09486 100644 --- a/nixos/hosts/traefik/networking.nix +++ b/nixos/hosts/traefik/networking.nix @@ -1,18 +1,13 @@ { config, lib, pkgs, ... 
}: { - networking = { - hostName = "traefik"; - interfaces.eth0 = { - ipv4.addresses = [{ - address = "192.168.1.171"; - prefixLength = 24; - }]; - }; - firewall.allowedTCPPorts = [ 80 443 8080 8082 ]; + networking.hostName = "traefik"; + networking.interfaces.eth0.ipv4.addresses = [{ + address = "192.168.1.80"; + prefixLength = 24; + }]; - defaultGateway = { - address = "192.168.1.1"; - interface = "eth0"; - }; - }; + networking.firewall.allowedTCPPorts = [ 80 443 8080 8082 ]; + + networking.nameservers = [ "192.168.1.53" ]; + networking.defaultGateway = "192.168.1.1"; } diff --git a/nixos/hosts/traefik/promtail.nix b/nixos/hosts/traefik/promtail.nix index 4a48e8d..7b4f91b 100644 --- a/nixos/hosts/traefik/promtail.nix +++ b/nixos/hosts/traefik/promtail.nix @@ -14,10 +14,10 @@ { targets = [ "localhost" ]; labels = { - job = "traefik"; + job = "/var/log/traefik/*.log"; host = config.networking.hostName; env = "proxmox"; - instance = "${config.networking.hostName}.local"; # prometheus scrape target + instance = "${config.networking.hostName}.lab"; # prometheus scrape target __path__ = "/var/log/traefik/*.log"; }; } diff --git a/nixos/hosts/traefik/traefik.nix b/nixos/hosts/traefik/traefik.nix index 36e27a9..52462bd 100644 --- a/nixos/hosts/traefik/traefik.nix +++ b/nixos/hosts/traefik/traefik.nix @@ -1,158 +1,36 @@ -{ config, lib, pkgs, ... }: { +{ config, lib, pkgs, ... 
}: - # Traefik reverse proxy setup +let + staticConfig = import ./configuration/static.nix { inherit lib config; }; + middlewaresConfig = import ./configuration/middlewares.nix { inherit lib config; }; + routersConfig = import ./configuration/routers.nix { inherit lib config; }; + servicesConfig = import ./configuration/services.nix { inherit lib config; }; +in +{ services.traefik = { enable = true; - staticConfigOptions = { - entryPoints = { - web = { - address = ":80"; - asDefault = true; - http.redirections.entrypoint = { - to = "websecure"; - scheme = "https"; - }; - }; + # ==== Static Configuration ==== + staticConfigOptions = staticConfig; - websecure = { - address = ":443"; - asDefault = true; - http.tls.certResolver = "letsencrypt"; - }; + # ==== Dynamic Configuration ==== + dynamicConfigOptions.http = { + routers = routersConfig; + services = servicesConfig; + middlewares = middlewaresConfig; - metrics = { - address = ":8082"; + serversTransports = { + insecureTransport = { + insecureSkipVerify = true; }; }; - - api.dashboard = true; - api.insecure = true; - - # Enable Let's Encrypt - certificatesResolvers = { - letsencrypt = { - acme = { - email = "david.mikael@proton.me"; # Replace with your email - storage = "/var/lib/traefik/acme.json"; # Location to store ACME certificates - httpChallenge = { - entryPoint = "web"; # Uses HTTP challenge (can also use DNS) - }; - # Uncomment the following for staging (testing) environment - # caServer = "https://acme-staging-v02.api.letsencrypt.org/directory"; - }; - }; - }; - - # Enable Prometheus metrics - metrics = { - prometheus = { - entryPoint = "metrics"; - }; - }; - log = { - level = "DEBUG"; - filePath = "/var/log/traefik/traefik.log"; - }; - - accessLog = { - format = "json"; - filePath = "/var/log/traefik/access.log"; - }; - - # Enable access logs (you can customize the log format) - # accessLog = { - # filePath = "/var/log/traefik/access.log"; # Log to a file - # format = "common"; # You can adjust this to 
`json` or `common` - # }; - # tracing = { - # enabled = true; - # provider = "jaeger"; # or zipkin, or other - # jaeger = { - # apiURL = "http://localhost:5775"; # Replace with your Jaeger instance URL - # }; - # }; - }; - - dynamicConfigOptions = { - # Add IP whitelisting middleware to restrict access to internal network only - http.middlewares = { - internal-whitelist = { - ipWhiteList = { - sourceRange = ["192.168.1.0/24"]; # Adjust to your internal network range - # Alternatively use `127.0.0.1/32` for localhost access - }; - }; - }; - - # Route to Proxmox UI - http.routers.proxmox = { - rule = "Host(`proxmox.procopius.dk`)"; - service = "proxmox"; - entryPoints = [ "web" "websecure" ]; - tls = { - certResolver = "letsencrypt"; # Use Let's Encrypt - }; - }; - # Route to Traefik Dashboard - http.routers.traefik = { - rule = "Host(`traefik.procopius.dk`)"; - service = "traefik"; - entryPoints = [ "web" "websecure" ]; - middlewares = ["internal-whitelist"]; - tls = { - certResolver = "letsencrypt"; # Use Let's Encrypt - }; - }; - - http.routers.forgejo = { - rule = "Host(`git.procopius.dk`)"; - service = "forgejo"; - entryPoints = [ "web" "websecure" ]; - tls = { - certResolver = "letsencrypt"; # Use Let's Encrypt - }; - }; - - # Route to Traefik Dashboard - http.routers.catchAll = { - # rule = "Host(`jellyfin.procopius.dk`)"; - rule = "HostRegexp(`.+`)"; - # rule = "HostRegexp(`{host:.+}`)"; - service = "nginx"; - entryPoints = [ "web" "websecure" ]; - tls = { - certResolver = "letsencrypt"; # Use Let's Encrypt - }; - }; - - - # Define the services - http.services.proxmox.loadBalancer.servers = [ - { url = "https://192.168.1.205:8006"; } # Proxmox - ]; - http.services.proxmox.loadBalancer.serversTransport = "insecureTransport"; - - - http.services.traefik.loadBalancer.servers = [ - { url = "http://traefik.local:8080"; } # Traefik Dashboard - ]; - - http.services.forgejo.loadBalancer.servers = [ - { url = "http://192.168.1.249:3000"; } # forgejo - ]; - - 
http.services.nginx.loadBalancer.servers = [ - { url = "https://192.168.1.226:4433"; } # nginx - ]; - http.services.nginx.loadBalancer.serversTransport = "insecureTransport"; - - - http.serversTransports.insecureTransport.insecureSkipVerify = true; - }; }; - # Optionally, you can add Docker support if using Docker Compose + # NOTE(review): a Cloudflare API token used to be hard-coded here; it is in git history and must be rotated. + # CLOUDFLARE_DNS_API_TOKEN / CLOUDFLARE_ZONE_API_TOKEN (read by the ACME dnsChallenge provider) are now loaded + # at service start from a root-only env file kept out of git and the Nix store (e.g. a sops-nix secret at this path). + systemd.services.traefik.serviceConfig.EnvironmentFile = [ "/run/secrets/traefik-cloudflare-env" ]; + virtualisation.docker.enable = true; } diff --git a/nixos/hosts/vpn/README.md b/nixos/hosts/vpn/README.md new file mode 100644 index 0000000..e76e6ef --- /dev/null +++ b/nixos/hosts/vpn/README.md @@ -0,0 +1,127 @@ +Great question β€” and you're absolutely right to ask. + +You **don’t need** Innernet or Headscale+patch *if* you're comfortable building a small self-hosted portal that handles: + +* βœ… OIDC login +* βœ… WireGuard peer key management +* βœ… Config generation and download + +So let’s break it down: + +--- + +## 🧩 Why *Innernet* and *Headscale+patch* Exist + +Those tools solve **user/device coordination**, dynamic routing, and access control *automatically*, **with batteries included**. They're great if you want a more managed, β€œplug-and-play” experience. 
Specifically: + +| Tool | Purpose | +| ------------------------ | ----------------------------------------------------------------- | +| **Innernet** | Full WireGuard-based mesh + OIDC login + per-device management | +| **Headscale** | Tailscale backend, enables user login + ACL + device coordination | +| **Headscale+OIDC patch** | Add OIDC login to Headscale (by default, only pre-shared keys) | + +These tools do things like: + +* Assign IPs/DNS names to users +* Regenerate keys +* Handle coordination between peers +* Gate access with identity + +--- + +## πŸ› οΈ Why You *Don’t* Need Them (Your Custom Portal Idea) + +If you're fine with: + +* Managing WireGuard server and IPs manually +* Writing a small web service to issue `.conf` files after login +* Using firewall rules for access control + +Then you absolutely **can** build a custom system around **WireGuard + OIDC** like this: + +--- + +### 🧱 Architecture: WireGuard + OIDC + Config Generator + +``` + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Your OIDC β”‚ + β”‚ provider │◄────── Google, Authentik, Keycloak + β””β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + (OIDC Login) + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Web Portal │◄────── nginx + Go, Node.js, Flask, etc. + β”‚ (self-hosted) β”‚ + β””β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”˜ + Gen Keys β”‚ β”‚ Serve config + & register β”‚ └─────> `client.conf` / QR code + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ WireGuard server β”‚ (NixOS) + β”‚ - Handles all peers β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +### πŸ” What Your Portal Needs to Do + +* Authenticate via OIDC (OAuth2 flow) +* Store a list of "authorized users" +* When a user logs in: + + * Check if they already have a WireGuard peer entry + * If not: generate a keypair, assign IP (e.g. 
`10.100.0.5`) + * Append to server config (`/etc/wireguard/wg0.conf`) or use `wg set` + * Return config file or QR code for the mobile client + +--- + +## βœ… Benefits of This DIY Approach + +* πŸ’‘ Full control: you own every step +* πŸ” OIDC is the gatekeeper (centrally managed access) +* ⚑ No extra daemon/software overhead (just WireGuard + your app) +* 🧩 Easily extensible (e.g., device limits, IP lease expiration) + +--- + +## βš™οΈ Tools You Can Use + +| Purpose | Tools | +| ---------------------- | -------------------------------------------------------------------------- | +| OIDC Auth | [Dex](https://github.com/dexidp/dex), Keycloak, Authentik, or Firebase | +| Web Portal Backend | Node.js (Express), Go, Flask, FastAPI | +| Config Template Engine | Go templates, EJS, Jinja2 | +| WireGuard Integration | `wg`, `wg-quick`, or directly edit `wg0.conf` | +| NixOS Support | Use `networking.wireguard.interfaces.wg0.peers` to declaratively add peers | + +--- + +## πŸ§ͺ Bonus Features You Could Build + +* πŸ” Per-user firewall rules +* πŸ” One-time config URLs (`vpn.mydomain.com/invite/xyz`) +* πŸ” Optional Authelia/Keycloak MFA +* πŸ“‰ Dashboard with currently active peers (use `wg show`) + +--- + +## ✨ Summary + +| Option | You Need Innernet / Headscale? | Login via OIDC? | DNS Routing? | Complexity | +| ----------------- | ------------------------------ | --------------- | ------------ | ---------- | +| Your own portal | ❌ No | βœ… Yes | ❌ Manual | 🟑 Medium | +| Innernet | βœ… Yes | βœ… Yes | βœ… Built-in | 🟠 Medium | +| Headscale + patch | βœ… Yes | βœ… Yes | βœ… Built-in | πŸ”΄ High | + +You’re on a great path by choosing the custom VPN portal idea. +Let me know if you'd like: + +* A **sample architecture repo** +* A **NixOS module to support peer configs** +* Help building the **login + config generator backend** + +I can generate a Nix flake and a working OIDC portal template to kickstart the project. 
diff --git a/nixos/modules/promtail.nix b/nixos/modules/promtail.nix index af5d478..84147a0 100644 --- a/nixos/modules/promtail.nix +++ b/nixos/modules/promtail.nix @@ -20,7 +20,7 @@ in filename = "/var/lib/promtail/positions.yaml"; }; clients = [{ - url = "http://monitor.local:3100/loki/api/v1/push"; + url = "http://monitor.lab:3100/loki/api/v1/push"; }]; scrape_configs = [{ job_name = "journal"; @@ -30,13 +30,14 @@ in job = "promtail"; host = config.networking.hostName; env = "proxmox"; - instance = "${config.networking.hostName}.local"; + instance = "${config.networking.hostName}.lab"; }; }; relabel_configs = [{ source_labels = ["__journal__systemd_unit"]; target_label = "unit"; }]; + }]; }; }; diff --git a/nixos/secrets/HOWTO.md b/nixos/secrets/HOWTO.md new file mode 100644 index 0000000..791314d --- /dev/null +++ b/nixos/secrets/HOWTO.md @@ -0,0 +1,59 @@ +### πŸ”§ Using Secrets in NixOS Configurations + +You can use decrypted SOPS secrets in your `configuration.nix`, service modules, and flake-based setups. + +#### πŸ”‘ 1. Use as environment variable (e.g. password) + +```nix +systemd.services.my-service.serviceConfig.EnvironmentFile = + config.sops.secrets."my-password".path; +``` + +> Your `secrets.yaml` should contain: +> +> ```yaml +> my-password: PASSWORD=supersecret +> ``` + +--- + +#### πŸ—‚ 2. Use as file source (e.g. private key or token) + +```nix +environment.etc."ssh/id_ed25519".source = + config.sops.secrets."ssh-private-key".path; +``` + +> This places the decrypted secret at `/etc/ssh/id_ed25519` with appropriate permissions. + +--- + +#### πŸ‘€ 3. Read a secret value directly (not recommended for sensitive data) + +```nix +# Use a secret as a string value in a setting +services.myapp.settings.apiKey = builtins.readFile config.sops.secrets."api-key".path; +``` + +--- + +#### πŸ›  4. 
Use in systemd preStart scripts + +```nix +systemd.services.my-service.preStart = '' + export PASSWORD=$(<${config.sops.secrets."my-password".path}) + ./myapp --auth $PASSWORD +''; +``` + +--- + +#### 🧠 5. Use in Forgejo user creation + +```nix +systemd.services.forgejo.preStart = '' + ${lib.getExe cfg.package} admin user create \ + --username admin \ + --password "$(tr -d '\n' < ${config.sops.secrets."admin-password".path})" +''; +``` diff --git a/nixos/secrets/README.md b/nixos/secrets/README.md index a536d14..f14a377 100644 --- a/nixos/secrets/README.md +++ b/nixos/secrets/README.md @@ -1,51 +1,104 @@ +# πŸ” Secrets Management (with SOPS + Nix) -πŸ”‘ 2. Generate an age Keypair +This directory contains encrypted secrets used across the infrastructure managed by NixOS and [sops-nix](https://github.com/Mic92/sops-nix). Secrets are stored using [SOPS](https://github.com/mozilla/sops) and encrypted with an `age` key located on each host at `/etc/sops/age.key`. -age-keygen -o secrets/age.key +--- -This will output something like: - -# created: 2025-06-02T22:00:00Z -# public key: age1abcdefghijk... - -Copy that public key somewhere β€” you’ll need it for encrypting. - -βœ… You should now have: +## πŸ“ Directory Structure +``` secrets/ -β”œβ”€β”€ age.key # keep this safe and private! +β”œβ”€β”€ forgejo/ +β”‚ └── secrets.yaml # Forgejo-specific secrets (admin password, DB password, secret key) +β”œβ”€β”€ runner/ +β”‚ └── secrets.yaml # Forgejo runner secrets (tokens, etc.) +β”œβ”€β”€ shared/ +β”‚ └── secrets.yaml # Shared secrets used across multiple VMs (SSH keys, tokens) +```` -πŸ“ 3. Create Encrypted Secrets File +--- -sops --age age1abcdefghijk... secrets/secrets.yaml +## πŸ›  SOPS Basics -This opens a YAML file in your $EDITOR. 
Add secrets like: +### βœ… Encrypt a **new secret file** -forgejo-admin-password: "my-super-secret-password" +```bash +sops --age <AGE_PUBLIC_KEY> secrets/myservice/secrets.yml +```` +Example: +```bash +sops --age "$(age-keygen -y ~/.config/sops/age/keys.txt)" secrets/forgejo/secrets.yml +``` +> This opens the new file in your `$EDITOR` (in vim, press `i` to enter insert mode); fill it using YAML format: +```yaml +admin-password: hunter2 +db-password: supersecret +``` -Save and close the file β€” it’s now encrypted using the public key. +--- -βœ… Now you should have: +### ✏️ Edit secrets in an existing file -secrets/ -β”œβ”€β”€ age.key -β”œβ”€β”€ secrets.yaml # encrypted file (safe to commit) +```bash +sops secrets/forgejo/secrets.yml +``` -You can commit secrets.yaml, but do not commit age.key unless you're OK with putting it on a VM. +--- +## 🧬 Using Secrets in Nix -πŸ§ͺ Test Decryption Locally +### 🧩 Option 1: Reference shared secrets (via `defaultSopsFile`) -export SOPS_AGE_KEY_FILE=secrets/age.key +```nix +# secrets/shared-sops.nix +{ + sops = { + age.keyFile = "/etc/sops/age.key"; + defaultSopsFile = ./shared/secrets.yml; -To test: + secrets = { + "monitoring-token".owner = "prometheus"; + }; + }; +} +``` -sops -d secrets/secrets.yaml +Then in services: -To edit: +```nix +environment.etc."monitoring/token".source = config.sops.secrets."monitoring-token".path; +``` -sops secrets/secrets.yaml +--- +### 🧩 Option 2: Reference per-service secrets with explicit `sopsFile` +```nix +# hosts/forgejo/sops.nix +{ + sops.secrets = { + "admin-password" = { + sopsFile = ../../secrets/forgejo/secrets.yml; + owner = "forgejo"; + }; + }; +} +``` -[plasmagoat@forgejo:~]$ sudo chmod 400 /etc/sops/age.key && sudo chown root:root /etc/sops/age.key +--- + +## πŸ§ͺ Testing secrets setup + +Check which secrets will be applied: + +```bash +nixos-rebuild dry-activate --flake .#my-hostname +``` + +--- + +## πŸ“š Resources + +* [sops-nix](https://github.com/Mic92/sops-nix) +* [Mozilla SOPS](https://github.com/mozilla/sops) +* [age 
encryption](https://github.com/FiloSottile/age) diff --git a/nixos/secrets/forgejo/runner-secrets.yml b/nixos/secrets/forgejo/runner-secrets.yml new file mode 100644 index 0000000..1165df5 --- /dev/null +++ b/nixos/secrets/forgejo/runner-secrets.yml @@ -0,0 +1,16 @@ +forgejo-runner-registration-token: ENC[AES256_GCM,data:ms0Ouy5GP6rlwkiLXoq31ZPSi9bpDKpNOqzEFATHLHflt+YTIjWuPAVRvKEIEQ==,iv:z2snOwdGq3e7Mxl+CmnoOh8c+ZaA+6lNDdXh2vVLULM=,tag:5ZpELR8K5JBQraMBYdXSuA==,type:str] +sops: + age: + - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSA0d3kzWXd2RElqdnViNGJG + NHd5bER1S2dVQUpxOER6Mi9TYlVuOHFhVzNNCnNWZVJzdU1LSG4yR3BNdmFEVzA2 + bFNzK2cxNG9OcTB6NC8wdDAxcCtDekkKLS0tIExQdWNJQnBmb05RMktoeXF0dDZC + M3FyUEswckYrUDdvdmdUYnBqaTZFcncK8aNh8jL8nzYv2vWwhxX4QPed1pjFr2zK + 9znxO+osZsUNIXySioLBfsA1kfqZCzaASsM2ezfWHKt1nCVQAvbXGA== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2025-06-06T21:10:54Z" + mac: ENC[AES256_GCM,data:cDk2zKgxX01y/X9eQCbLm6OW74nE9HJdtliE6iye3gsDKbM+SqCuU1JTBvEcOAeROLn4svJmlRe3DDTGhrnuNO8tL8qLXKt2oQ0CM+A/3kXBb/jG13ps57fEpD32u/QbK6smVDS0Li+TCHEtfqiLyVat42lgyy9kakgjOll//K0=,iv:K8ly08WGyHLpk07oUwaO7ygEqcriJ3Uq1Ev/FtUcfiY=,tag:8VRCalipvZv0DAOAu9tSlg==,type:str] + unencrypted_suffix: _unencrypted + version: 3.10.2 diff --git a/nixos/secrets/forgejo/secrets.yml b/nixos/secrets/forgejo/secrets.yml new file mode 100644 index 0000000..94f993a --- /dev/null +++ b/nixos/secrets/forgejo/secrets.yml @@ -0,0 +1,18 @@ +forgejo-admin-password: ENC[AES256_GCM,data:S05b/J9AK2SuIKDSWmtRf72C7V5FwMgZv/o5yxzNXRZEH2eIm18sC6+FEg==,iv:Ig/c4K9Io0S07Ywl4JQtbfxhjXJ7Rvea7+N4KhLUqjc=,tag:rx44tRuAbERBZR45QN6b9A==,type:str] +forgejo-db-password: ENC[AES256_GCM,data:5YwRl6HNa1LzJgr73ArllG9s+vWCS7m/s6QQh5YUz8I0anG7GQ==,iv:5ARq3unUy2xbDcAFkucvEhjz/QYC2rYgutEo4T2bw2E=,tag:k7eHKqeA7k6XzksLVcnXRw==,type:str] +forgejo-secret-key: 
ENC[AES256_GCM,data:iserDzOnJkM4HLP4c6rekSFANtRmEXwuCPyfMqo=,iv:3CNqN/DyS4PIl/iOO4JCpWJn3ARlb5KQSCNv5Orx2mo=,tag:q34jEpGrK2EKf0bcBznpQQ==,type:str] +sops: + age: + - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBjeTdJNVExVjB2dzF0dTRu + ZEV1RHlvd3VPNUZ1b0FsQW14bHJOUUM5Z1NjCmhudWRoUjd5a3dWSEhwK1dDd0hK + N1JUUHhlOVFGVWxwalpvbXJVMlhtcGcKLS0tIFJmRjM4bnJ0TUIyWElaUUd3Y2Zq + LzBHRWZXODVDZTE2WnVZOGNQckk4KzAKdm3xnA03JnQnc07yhVVtYkVYS6654Zm1 + 4AcLRSCcWvWrvp26XYVE2UGqU7acfxrTsk07o0nHAQpa5LjgJ4oFKw== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2025-06-06T18:38:08Z" + mac: ENC[AES256_GCM,data:BvpIz6tfVSR3m1l7g4ilUyoTKKqirt+k6tPizxCsAgjztt0IyDCio+cLTln4P1tGSy/frjvbxy1mR3tIDkWn6aDFoYz/gnsbTKHSo/K5Q77jJ3uJffoB3/Wruigojl3EBIQHALicq9xhF8rsH/RKjpWqh+TrQwO+ibbA6ff76cw=,iv:Z0ZwJ9aPpI9MtbsZnvFkW7zsFFOMj5/Gv+tF/mal+yI=,tag:knf01NC/XwgjPUHH+8RpSg==,type:str] + unencrypted_suffix: _unencrypted + version: 3.10.2 diff --git a/nixos/secrets/secrets.yaml b/nixos/secrets/secrets.yaml deleted file mode 100644 index e4bc266..0000000 --- a/nixos/secrets/secrets.yaml +++ /dev/null @@ -1,27 +0,0 @@ -forgejo-admin-password: ENC[AES256_GCM,data:cLC4JQC8PMF4/aeVBzOROupPLzd7TbYwvudr7yVx4YpLCGSmYXRwJQAoXg==,iv:tG2kL66ZshwZkJodZQ5K8SZKfG1eJYeX9eYsZ7yM7rA=,tag:0roW0M9eUmzejkH6pwN/IA==,type:str] -forgejo-db-password: ENC[AES256_GCM,data:0KZJHmNuxpO8TmLNuryipICPTjG9h56+II1Azk+v3fkE5MAb9g==,iv:zb14BvbC2OehCYATgMMoPXv742jjD4v0B12cVhNCWBw=,tag:pnrboj5IvwXYXaZJbZpxTQ==,type:str] -hello: ENC[AES256_GCM,data:XkOLnE2Mkunc0zNF1932jOuz1olAwWf56lkqL2dt+h99WoL/vNLfSQ0al8NfEA==,iv:WC2xbB9WmB/khOVjdClFerJ8kjtHjaR/p6rDYaaDZhY=,tag:tT92FNrRm74XoZxoFFXm5g==,type:str] -example_key: ENC[AES256_GCM,data:kBk87OXu+qfJjP/2EA==,iv:64WcHaVfQrVCouUCZoHk0z/4ii8U9m61/E9SqLeB3Ms=,tag:MZJ6m7m4+s6BNGhtNs+ZFQ==,type:str] 
-#ENC[AES256_GCM,data:lM4LNQNU2S66a73pUymyUA==,iv:pAHgR+ViSO3Ff2zSaZQcXNGb2r2KH+ZbRd33vpq8ncs=,tag:WTNQCjaESLXTXwcwZePU2A==,type:comment] -example_array: - - ENC[AES256_GCM,data:Sc1q0Yd3sQ6eOzSwfQA=,iv:L4YBbWWeQZAYROHpiNEtHLDCdcuW+vvEpYhGxD0b62g=,tag:82L6MlHWIMpxKb4B3+Lszg==,type:str] - - ENC[AES256_GCM,data:Ud9dpSAcHc8NOq48wQI=,iv:9ERTBUQqKHPUIG57KXbRPMXN37cx+WcxOCDxCWpbE1k=,tag:ftTGF/obIJVZSTodIGoABw==,type:str] -example_number: ENC[AES256_GCM,data:1Xvp578L4rjW6g==,iv:82z/MQM586y4WilPZgmisa2C7GTdG0vmIEkyx/aMCXw=,tag:UtNDNKbu0tuhSyu1OQiJJA==,type:float] -example_booleans: - - ENC[AES256_GCM,data:RkxG/g==,iv:RNZpV/1KRWOazIuHj+SH7r3AmwnRBIUgXgfDplrk5X0=,tag:cKv0dVJGQcluscNspIrPgg==,type:bool] - - ENC[AES256_GCM,data:PvghSeY=,iv:xPlMb1LMsg5gAWsCXT3UnMyOfQmSKDKdDrjt+n9+Nqs=,tag:B2aROAGdcupDmoOHAiXeTg==,type:bool] -sops: - age: - - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y - enc: | - -----BEGIN AGE ENCRYPTED FILE----- - YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBwVElvVXluZCsxK1BiT3c2 - Zm9kaURNdnZ2Nk9EM0dld2tjdFhrZlFiSEVnCk8zZVpWWlFXS3JYS0Q2WHExLzFU - WkFwcDFmR3VrdHFmS2JmVC95TnZIMjQKLS0tIGsyVmp1Sm1uL3FKVWlERUZHdmVw - TG9HYXdUdlZNYXJUZng2ejBwbjJoNVkK0ER6mqLdz0hEaovWME4p56tjuYbPIuhb - X1smwLmHxgcRboeFU5dyp3wZKBg7ccRPneQKsgJvYb929BesynHr6g== - -----END AGE ENCRYPTED FILE----- - lastmodified: "2025-06-03T16:03:32Z" - mac: ENC[AES256_GCM,data:mLCtH1EPm1cD7KD/fCVO0hrIfG6AOl396kcwdahyr326IRvTneT+6lr+f0XAHSkPXtRsmSCiD9WNhLYAh/kCfsP7tVPKl4X17OHkK9blUJ5JpuqnZJfOQ3PXNitYFvcSUUi1Y1/vIQmDf52oTPlcZgxmTgsQj4MEJIIni7d0SOc=,iv:MhAJ0QAdyHv8BzHIBQ/lZ7zV/MKjcsicbBOw9kwo7Nc=,tag:qrfTfCPxAMvXOm69BMWJ4g==,type:str] - unencrypted_suffix: _unencrypted - version: 3.10.2 diff --git a/nixos/secrets/shared-sops.nix b/nixos/secrets/shared-sops.nix new file mode 100644 index 0000000..dd47aec --- /dev/null +++ b/nixos/secrets/shared-sops.nix @@ -0,0 +1,11 @@ +{ + sops = { + age.keyFile = "/etc/sops/age.key"; + defaultSopsFile = ./shared/secrets.yml; + secrets = { + # 
"monitoring-token".owner = "prometheus"; + # "ssh-private-key".owner = "root"; + # "ssh-public-key".owner = "root"; + }; + }; +} diff --git a/nixos/secrets/shared/secrets.yml b/nixos/secrets/shared/secrets.yml new file mode 100644 index 0000000..53b1990 --- /dev/null +++ b/nixos/secrets/shared/secrets.yml @@ -0,0 +1,16 @@ +test: ENC[AES256_GCM,data:a+pmog==,iv:3Ledge90oTzTM8uNFWWIgLafa7/Hhx9WzXRAS3flUZo=,tag:mfWiEWxkZVihuX3S3SY12w==,type:str] +sops: + age: + - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBudWI1RXZBOGFoaGIwWWc2 + cXdpems4SmY1QjNhOWxHTVBhR1ZPYjdSZUcwClhHb1IzSTBmcFllbWl2UVpabGFo + Sm1neTlUVVUwalZUSWVLaVNBUWhUSTAKLS0tIG5yVHBiWGMwMG1OTnBXckh4eXdv + MU5UQ29lMmw0ZEJnV09IWEpWc2NYT2MKfl+3cZvgunrgGr7KhjGZOlJ0EKRAiAED + pDGr25OcjQgpsg4/LCPPOMsi9Yyi/RICJGyDDINaTLMEQDhIsoOyUQ== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2025-06-06T17:29:59Z" + mac: ENC[AES256_GCM,data:nvb3Wc3578e45ob2FyyWlsadVOdErTfJ2Ni5jb06f/WbzDkyJd3lCBRTUIAdyXijT4ErtogHImBjYXzRuCi9xP68mTtaoQb6l8bULKJLdY/yDcMzMyKGZLDxTVW80nLvDrqs5piKBYFWtyFaAEio8fVlA4RIUsyFx/mgcbI3ChA=,iv:9/DmD48MKzBNGSODUr4jqDv17r2o4xgH7TVbpQeuyCU=,tag:1uVui9sVI9SfTlgtqPCLMA==,type:str] + unencrypted_suffix: _unencrypted + version: 3.10.2 diff --git a/nixos/secrets/sops.nix b/nixos/secrets/sops.nix deleted file mode 100644 index 05c4290..0000000 --- a/nixos/secrets/sops.nix +++ /dev/null @@ -1,8 +0,0 @@ -{ config, lib, ... 
}: -{ - sops = { - defaultSopsFile = ./secrets.yaml; - age.keyFile = "/etc/sops/age.key"; - #secrets."forgejo-admin-password".owner = "forgejo"; - }; -} diff --git a/nixos/secrets/telegram/secrets.yml b/nixos/secrets/telegram/secrets.yml new file mode 100644 index 0000000..e5d637e --- /dev/null +++ b/nixos/secrets/telegram/secrets.yml @@ -0,0 +1,16 @@ +telegram-alert-bot-token: ENC[AES256_GCM,data:mM1aYhpcCecRUdwkdlBKA+dWOHZEwUvP+m4MIg4n89SzgY8GWw0z1OaIpxfR0w==,iv:tzmCjiYntDYpkO4S0a/tMQkfGQpZjLBiBu4Rs/5RHbc=,tag:5cZDEK474WzXwIW5Jc7S5w==,type:str] +sops: + age: + - recipient: age1n20y9kmdh324m3tkclvhmyuc7c8hk4w84zsal725adahwl8nzq0s04aq4y + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSA5RE82S05lbHV6aE9qZFhL + MTZ4Y1R6cXFSZUFhZHdHbzZ5OWdrOXhwdEZ3CkxzM1NtSjNzeXM5byt1VzVtRHpH + Tk9ORUtEZ05FMTgrYVNhU3dKRkFKaHMKLS0tIFlLeWJ1dmJsRWc5SkhDbjdEb0or + UHl6emN0My8wcFZWYlZEaElrb2NidjgKlZols9SJQxgaoOdJJxghqlACBcwuFs94 + IGAOoQVUSFhMCWzyXqAQ/1/VkbWqfiUmvqDa3ulEK2Ri+1F+u3mB1Q== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2025-06-06T21:21:32Z" + mac: ENC[AES256_GCM,data:YS7BLFXkQ/A5PVLVOyMaqRHGavY0YttFps3njzSiYgBUa4VfPHqMcl2fW5vMec5MwM3GKPFGtrSEZKK1NVqLxUWZrfIF6ugAZ4vhRCyWe1Kze2Zs2S0ia2C3mUdhQR2wb7M7YzohI/e7PDZo0UcrcG3YeEzS5NL7qb0hzFsrGLY=,iv:kqzD06q5X0ZkZ1sIoUQz05b6QRDWQVsPqQYxPP2OAl8=,tag:eexvJspUxpDpwJqU1zEMnA==,type:str] + unencrypted_suffix: _unencrypted + version: 3.10.2