alerts...

commit fc9971ddc9 (parent 7278922625)
5 changed files with 324 additions and 4 deletions
Forgejo service module:

@@ -3,6 +3,7 @@ let
   cfg = config.services.forgejo;
   srv = cfg.settings.server;
   domain = "git.procopius.dk";
+  ssh_domain = "gitssh.procopius.dk";
 in
 {
   users.users.plasmagoat.extraGroups = [ "forgejo" ];
@@ -19,6 +20,10 @@ in
       ROOT_URL = "https://${srv.DOMAIN}/";
       PROTOCOL = "http";
       HTTP_PORT = 3000;
+
+      START_SSH_SERVER = true;
+      SSH_PORT = 2222;
+      SSH_DOMAIN = ssh_domain;
     };
     database = {
       DB_TYPE = lib.mkForce "postgres";
@@ -61,5 +66,5 @@ in
   '';

   # Optional: firewall
-  networking.firewall.allowedTCPPorts = [ 3000 ];
+  networking.firewall.allowedTCPPorts = [ 3000 2222 ];
 }
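With `START_SSH_SERVER = true`, Forgejo runs its built-in SSH server on port 2222 rather than going through the host's sshd, and `SSH_DOMAIN` is the hostname advertised in clone URLs. A quick post-deploy smoke test, assuming DNS for gitssh.procopius.dk already resolves (owner/repo is a placeholder):

    # should reach Forgejo's SSH server, not the system sshd on 22
    ssh -p 2222 git@gitssh.procopius.dk

    # clone URL shape when SSH_PORT != 22
    git clone ssh://git@gitssh.procopius.dk:2222/owner/repo.git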
Grafana configuration:

@@ -6,7 +6,7 @@
     http_port = 3000;
     http_addr = "0.0.0.0";
     # Grafana needs to know on which domain and URL it's running
-    # domain = "monitor.local";
+    domain = "grafana.procopius.dk";
     # root_url = "https://monitor.local/grafana/"; # Not needed if it is `https://your.domain/`
     # serve_from_sub_path = true;
   };
@@ -22,11 +22,13 @@
     datasources.settings.datasources = [
       # "Built-in" datasources can be provisioned - c.f. https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources
       {
+        uid = "prometheus";
         name = "Prometheus";
         type = "prometheus";
         url = "http://127.0.0.1:${toString config.services.prometheus.port}";
       }
       {
+        uid = "loki";
         name = "Loki";
         type = "loki";
         url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}";
@@ -41,7 +43,13 @@

     # Note: removing attributes from the above `datasources.settings.datasources` is not enough for them to be deleted on `grafana`;
     # One needs to use the following option:
-    # datasources.settings.deleteDatasources = [ { name = "foo"; orgId = 1; } { name = "bar"; orgId = 1; } ];
+    # datasources.settings.deleteDatasources = [ { name = "prometheus"; orgId = 1; } { name = "loki"; orgId = 1; } ];
+
+    alerting.rules.path = "/etc/grafana/provisioning/alerting/alerts.yml";
+
+    # notifiers.settings = {
+    #   path = "/etc/grafana/provisioning/notifiers";
+    # };

     dashboards.settings.providers = [{
       name = "my dashboards";
@@ -91,4 +99,20 @@
     group = "grafana";
     mode = "0644";
   };

+  # 🔔 Alerts provisioning
+  environment.etc."grafana/provisioning/alerting/alerts.yml" = {
+    source = ./provisioning/alerting/alerts.yml;
+    user = "grafana";
+    group = "grafana";
+    mode = "0644";
+  };
+
+  # 📬 Contact point provisioning
+  environment.etc."grafana/provisioning/notifiers/contact-points.yml" = {
+    source = ./provisioning/notifiers/contact-points.yml;
+    user = "grafana";
+    group = "grafana";
+    mode = "0644";
+  };
 }
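The new `uid` fields on the datasources are load-bearing: provisioned alert rules reference their query target by `datasourceUid` rather than by display name, so `uid = "prometheus"` here must match what the rules in alerts.yml (added below) use, e.g.:

    data:
      - refId: A
        relativeTimeRange: { from: 300, to: 0 }
        datasourceUid: prometheus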
Prometheus configuration (monitoring host):

@@ -7,6 +7,9 @@ let
   forgejo_ip = "forgejo.local";

   prometheus_exporter_port = 9100;
+  postgres_exporter_port = 9187;
+  prometheus_port = 9090;
+  grafana_port = 3000;
   promtail_port = 9080;
   traefik_monitor_port = 8082;
   forgejo_monitor_port = 3000;
@@ -34,6 +37,18 @@ in {
        }
      ];
    }
+    {
+      job_name = "grafana";
+      static_configs = [
+        { targets = [ "${monitor_ip}:${toString grafana_port}" ]; }
+      ];
+    }
+    {
+      job_name = "prometheus";
+      static_configs = [
+        { targets = [ "${monitor_ip}:${toString prometheus_port}" ]; }
+      ];
+    }
    {
      job_name = "traefik";
      static_configs = [
@@ -49,7 +64,7 @@ in {
    {
      job_name = "postgres";
      static_configs = [
-        { targets = [ "${forgejo_ip}:9187" ]; }
+        { targets = [ "${forgejo_ip}:${toString postgres_exporter_port}" ]; }
      ];
    }
    {
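Grafana and Prometheus both expose their own metrics over HTTP at `/metrics`, so these two new jobs need no extra exporter. One way to confirm the new targets came up, assuming Prometheus listens on port 9090 on the monitor host:

    # list the job label of every active scrape target
    curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[].labels.job'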
nixos/hosts/monitoring/provisioning/alerting/alerts.yml (new file, 265 lines):

@@ -0,0 +1,265 @@
apiVersion: 1

groups:
  - name: node-alerts
    folder: "VM Alerts"
    interval: 1m
    rules:
      - uid: high_cpu
        title: High CPU Usage
        condition: A
        for: 5m
        annotations:
          summary: "High CPU on {{ $labels.instance }}"
          description: "CPU > 90% for 5 minutes"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) by (instance) > 0.9
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200

      - uid: high_memory
        title: High Memory Usage
        condition: A
        for: 5m
        annotations:
          summary: "High memory on {{ $labels.instance }}"
          description: "Memory > 90% for 5 minutes"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.9
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200

      - uid: low_disk
        title: Low Disk Space
        condition: A
        for: 5m
        annotations:
          summary: "Low disk on {{ $labels.instance }} {{ $labels.mountpoint }}"
          description: "< 10% space left"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.1
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200

      - uid: node_down
        title: Node is Down
        condition: A
        for: 1m
        annotations:
          summary: "{{ $labels.instance }} is DOWN"
          description: "Exporter is not responding"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: prometheus-alerts
    folder: "Prometheus Alerts"
    interval: 1m
    rules:
      - uid: prometheus_down
        title: Prometheus is Down
        condition: A
        for: 1m
        annotations:
          summary: "Prometheus is not responding"
          description: "up{job='prometheus'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="prometheus"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: traefik-alerts
    folder: "Traefik Alerts"
    interval: 1m
    rules:
      - uid: traefik_5xx
        title: Traefik 5xx Errors
        condition: A
        for: 1m
        annotations:
          summary: "Traefik has a high rate of 5xx responses"
          description: "Rate of HTTP 5xx responses is high"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: rate(traefik_service_requests_total{code=~"5.."}[1m]) > 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200

  - name: grafana-alerts
    folder: "Grafana Alerts"
    interval: 1m
    rules:
      - uid: grafana_down
        title: Grafana is Down
        condition: A
        for: 1m
        annotations:
          summary: "Grafana is not responding"
          description: "up{job='grafana'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="grafana"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: postgres-alerts
    folder: "Postgres Alerts"
    interval: 1m
    rules:
      - uid: postgres_down
        title: Postgres is Down
        condition: A
        for: 1m
        annotations:
          summary: "Postgres is not responding"
          description: "up{job='postgres'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="postgres"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: gitea-alerts
    folder: "Gitea Alerts"
    interval: 1m
    rules:
      - uid: gitea_down
        title: Gitea is Down
        condition: A
        for: 1m
        annotations:
          summary: "Gitea is not responding"
          description: "up{job='gitea'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="gitea"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: promtail-alerts
    folder: "Promtail Alerts"
    interval: 1m
    rules:
      - uid: promtail_down
        title: Promtail is Down
        condition: A
        for: 1m
        annotations:
          summary: "Promtail is not responding"
          description: "up{job='promtail'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="promtail"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: logs-alerts
    folder: "Logs Alerts"
    interval: 1m
    rules:
      - uid: failed_ssh_logins
        title: Failed SSH Logins
        condition: A
        for: 1m
        annotations:
          summary: "Too many failed SSH login attempts"
          description: "Check for brute force login attempts"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: sum(rate(failed_ssh_logins[5m])) > 5
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200
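A note on the recurring fields, since the provisioned-alert schema is terse: `relativeTimeRange: { from: 300, to: 0 }` means the query covers the last 300 seconds; `condition: A` names the `refId` whose result is evaluated; and `for: 5m` requires the condition to hold for five minutes before the rule fires. One caveat: the `failed_ssh_logins` rule queries the prometheus datasource for a `failed_ssh_logins` metric, and nothing visible in this commit exports such a metric, so that rule stays silent until some exporter or recording rule provides it.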
contact-points.yml (new file, 11 lines):

@@ -0,0 +1,11 @@
# /etc/grafana/provisioning/notifiers/contact-points.yml
apiVersion: 1

contactPoints:
  - orgId: 1
    name: telegram
    type: telegram
    settings:
      bottoken: "__YOUR_BOT_TOKEN__"
      chatid: "__YOUR_CHAT_ID__"
      disableResolveMessage: false
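Two caveats on this file. First, `__YOUR_BOT_TOKEN__` and `__YOUR_CHAT_ID__` are placeholders that still need real values, ideally injected from a secret rather than committed. Second, a contact point alone routes nothing: Grafana's notification policy tree decides where alerts go, and unified alerting reads `contactPoints:` and `policies:` from the alerting provisioning directory, not from `notifiers/` (which belongs to the legacy notification-channel provisioner), so this file may need to live under provisioning/alerting/ to take effect. A minimal routing sketch, not part of this commit (hypothetical file name):

    # e.g. provisioning/alerting/policies.yml
    apiVersion: 1
    policies:
      - orgId: 1
        receiver: telegram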