ready for runners
parent fc9971ddc9
commit 7dd5043b5d
49 changed files with 2569 additions and 1085 deletions
@@ -1,265 +0,0 @@
apiVersion: 1

groups:
  - name: node-alerts
    folder: "VM Alerts"
    interval: 1m
    rules:
      - uid: high_cpu
        title: High CPU Usage
        condition: A
        for: 5m
        annotations:
          summary: "High CPU on {{ $labels.instance }}"
          description: "CPU > 90% for 5 minutes"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) by (instance) > 0.9
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200

      - uid: high_memory
        title: High Memory Usage
        condition: A
        for: 5m
        annotations:
          summary: "High memory on {{ $labels.instance }}"
          description: "Memory > 90% for 5 minutes"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 0.9
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200

      - uid: low_disk
        title: Low Disk Space
        condition: A
        for: 5m
        annotations:
          summary: "Low disk on {{ $labels.instance }} {{ $labels.mountpoint }}"
          description: "< 10% space left"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.1
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200

      - uid: node_down
        title: Node is Down
        condition: A
        for: 1m
        annotations:
          summary: "{{ $labels.instance }} is DOWN"
          description: "Exporter is not responding"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: prometheus-alerts
    folder: "Prometheus Alerts"
    interval: 1m
    rules:
      - uid: prometheus_down
        title: Prometheus is Down
        condition: A
        for: 1m
        annotations:
          summary: "Prometheus is not responding"
          description: "up{job='prometheus'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="prometheus"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: traefik-alerts
    folder: "Traefik Alerts"
    interval: 1m
    rules:
      - uid: traefik_5xx
        title: Traefik 5xx Errors
        condition: A
        for: 1m
        annotations:
          summary: "Traefik has a high rate of 5xx responses"
          description: "Rate of HTTP 5xx responses is high"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: rate(traefik_service_requests_total{code=~"5.."}[1m]) > 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200

  - name: grafana-alerts
    folder: "Grafana Alerts"
    interval: 1m
    rules:
      - uid: grafana_down
        title: Grafana is Down
        condition: A
        for: 1m
        annotations:
          summary: "Grafana is not responding"
          description: "up{job='grafana'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="grafana"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: postgres-alerts
    folder: "Postgres Alerts"
    interval: 1m
    rules:
      - uid: postgres_down
        title: Postgres is Down
        condition: A
        for: 1m
        annotations:
          summary: "Postgres is not responding"
          description: "up{job='postgres'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="postgres"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: gitea-alerts
    folder: "Gitea Alerts"
    interval: 1m
    rules:
      - uid: gitea_down
        title: Gitea is Down
        condition: A
        for: 1m
        annotations:
          summary: "Gitea is not responding"
          description: "up{job='gitea'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="gitea"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: promtail-alerts
    folder: "Promtail Alerts"
    interval: 1m
    rules:
      - uid: promtail_down
        title: Promtail is Down
        condition: A
        for: 1m
        annotations:
          summary: "Promtail is not responding"
          description: "up{job='promtail'} == 0"
        labels:
          severity: critical
        data:
          - refId: A
            relativeTimeRange: { from: 60, to: 0 }
            datasourceUid: prometheus
            model:
              expr: up{job="promtail"} == 0
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: true
              intervalMs: 15000
              maxDataPoints: 43200

  - name: logs-alerts
    folder: "Logs Alerts"
    interval: 1m
    rules:
      - uid: failed_ssh_logins
        title: Failed SSH Logins
        condition: A
        for: 1m
        annotations:
          summary: "Too many failed SSH login attempts"
          description: "Check for brute force login attempts"
        labels:
          severity: warning
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: prometheus
            model:
              expr: sum(rate(failed_ssh_logins[5m])) > 5
              interval: ""
              datasource: { type: prometheus, uid: prometheus }
              instant: false
              intervalMs: 15000
              maxDataPoints: 43200
39  nixos/hosts/monitoring/provisioning/alerts/loki-alerts.yml  Normal file
@@ -0,0 +1,39 @@
groups:
  - name: Loki

    rules:
      - alert: LokiProcessTooManyRestarts
        expr: 'changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Loki process too many restarts (instance {{ $labels.instance }})
          description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: LokiRequestErrors
        expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: Loki request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: LokiRequestPanic
        expr: "sum(increase(loki_panic_total[10m])) by (namespace, job) > 0"
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request panic (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: LokiRequestLatency
        expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
@@ -0,0 +1,299 @@
groups:
  - name: NodeExporterV2
    rules:
      - alert: Node down
        expr: up{job="monitoring-pi"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down.

      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory (instance {{ $labels.instance }})
          description: Node memory is filling up (< 10% left)\n VALUE = {{ $value }}

      - alert: HostMemoryUnderMemoryPressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}

      - alert: HostUnusualNetworkThroughputIn
        expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}

      - alert: HostUnusualNetworkThroughputOut
        expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
          description: Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}

      - alert: HostUnusualDiskReadRate
        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
          description: Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}

      - alert: HostUnusualDiskWriteRate
        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write rate (instance {{ $labels.instance }})
          description: Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}

      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: Disk is almost full (< 10% left)\n VALUE = {{ $value }}

      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
      - alert: HostDiskWillFillIn24Hours
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
          description: Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}

      - alert: HostOutOfInodes
        expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of inodes (instance {{ $labels.instance }})
          description: Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}

      - alert: HostInodesWillFillIn24Hours
        expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
          description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}

      - alert: HostUnusualDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read latency (instance {{ $labels.instance }})
          description: Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}

      - alert: HostUnusualDiskWriteLatency
        expr: rate(node_disk_write_time_seconds_total{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write latency (instance {{ $labels.instance }})
          description: Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}

      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host high CPU load (instance {{ $labels.instance }})
          description: CPU load is > 80%\n VALUE = {{ $value }}

      - alert: HostCpuStealNoisyNeighbor
        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
          description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}

      # 1000 context switches is an arbitrary number.
      # Alert threshold depends on nature of application.
      # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
      - alert: HostContextSwitching
        expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host context switching (instance {{ $labels.instance }})
          description: Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }}

      - alert: HostSwapIsFillingUp
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host swap is filling up (instance {{ $labels.instance }})
          description: Swap is filling up (>80%)\n VALUE = {{ $value }}

      - alert: HostSystemdServiceCrashed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host SystemD service crashed (instance {{ $labels.instance }})
          description: SystemD service crashed\n VALUE = {{ $value }}

      - alert: HostPhysicalComponentTooHot
        expr: node_hwmon_temp_celsius > 75
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host physical component too hot (instance {{ $labels.instance }})
          description: Physical hardware component too hot\n VALUE = {{ $value }}

      - alert: HostNodeOvertemperatureAlarm
        expr: node_hwmon_temp_crit_alarm_celsius == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
          description: Physical node temperature alarm triggered\n VALUE = {{ $value }}

      - alert: HostRaidArrayGotInactive
        expr: node_md_state{state="inactive"} > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host RAID array got inactive (instance {{ $labels.instance }})
          description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}

      - alert: HostRaidDiskFailure
        expr: node_md_disks{state="failed"} > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host RAID disk failure (instance {{ $labels.instance }})
          description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}

      - alert: HostKernelVersionDeviations
        expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: Host kernel version deviations (instance {{ $labels.instance }})
          description: Different kernel versions are running\n VALUE = {{ $value }}

      - alert: HostOomKillDetected
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host OOM kill detected (instance {{ $labels.instance }})
          description: OOM kill detected\n VALUE = {{ $value }}

      - alert: HostEdacCorrectableErrorsDetected
        expr: increase(node_edac_correctable_errors_total[1m]) > 0
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
          description: Instance has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}

      - alert: HostEdacUncorrectableErrorsDetected
        expr: node_edac_uncorrectable_errors_total > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
          description: Instance has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}

      - alert: HostNetworkReceiveErrors
        expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }})
          description: Instance interface has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.\n VALUE = {{ $value }}

      - alert: HostNetworkTransmitErrors
        expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }})
          description: Instance has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}

      - alert: HostNetworkInterfaceSaturated
        expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.interface }})
          description: The network interface is getting overloaded.\n VALUE = {{ $value }}

      - alert: HostConntrackLimit
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host conntrack limit (instance {{ $labels.instance }})
          description: The number of conntrack is approaching limit\n VALUE = {{ $value }}

      - alert: HostClockSkew
        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock skew (instance {{ $labels.instance }})
          description: Clock skew detected. Clock is out of sync.\n VALUE = {{ $value }}

      - alert: HostClockNotSynchronising
        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock not synchronising (instance {{ $labels.instance }})
          description: Clock not synchronising.\n VALUE = {{ $value }}
@@ -0,0 +1,320 @@
groups:

  - name: NodeExporter

    rules:

      - alert: HostOutOfMemory
        expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory (instance {{ $labels.instance }})
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostMemoryUnderMemoryPressure
        expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostMemoryIsUnderutilized
        expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host Memory is underutilized (instance {{ $labels.instance }})
          description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualNetworkThroughputIn
        expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualNetworkThroughputOut
        expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
          description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskReadRate
        expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
          description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostOutOfDiskSpace
        expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostDiskMayFillIn24Hours
        expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostOutOfInodes
        expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host out of inodes (instance {{ $labels.instance }})
          description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostFilesystemDeviceError
        expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host filesystem device error (instance {{ $labels.instance }})
          description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostInodesMayFillIn24Hours
        expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskReadLatency
        expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskWriteLatency
        expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostHighCpuLoad
        expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Host high CPU load (instance {{ $labels.instance }})
          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostCpuIsUnderutilized
        expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
        for: 1w
        labels:
          severity: info
        annotations:
          summary: Host CPU is underutilized (instance {{ $labels.instance }})
          description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostCpuStealNoisyNeighbor
        expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostCpuHighIowait
        expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU high iowait (instance {{ $labels.instance }})
          description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostUnusualDiskIo
        expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk IO (instance {{ $labels.instance }})
          description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostContextSwitchingHigh
        expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host context switching high (instance {{ $labels.instance }})
          description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostSwapIsFillingUp
        expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host swap is filling up (instance {{ $labels.instance }})
          description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostSystemdServiceCrashed
        expr: '(node_systemd_unit_state{state="failed"} == 1)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host systemd service crashed (instance {{ $labels.instance }})
          description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostPhysicalComponentTooHot
        expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host physical component too hot (instance {{ $labels.instance }})
          description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostNodeOvertemperatureAlarm
        expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
          description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostSoftwareRaidInsufficientDrives
        expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
          description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostSoftwareRaidDiskFailure
        expr: '(node_md_disks{state="failed"} > 0)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host software RAID disk failure (instance {{ $labels.instance }})
          description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostKernelVersionDeviations
        expr: 'changes(node_uname_info[1h]) > 0'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host kernel version deviations (instance {{ $labels.instance }})
          description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostOomKillDetected
        expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host OOM kill detected (instance {{ $labels.instance }})
          description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostEdacCorrectableErrorsDetected
        expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostEdacUncorrectableErrorsDetected
        expr: '(node_edac_uncorrectable_errors_total > 0)'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostNetworkReceiveErrors
        expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Receive Errors (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostNetworkTransmitErrors
        expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Transmit Errors (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostNetworkBondDegraded
        expr: '((node_bonding_active - node_bonding_slaves) != 0)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Bond Degraded (instance {{ $labels.instance }})
          description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostConntrackLimit
        expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host conntrack limit (instance {{ $labels.instance }})
          description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostClockSkew
        expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Host clock skew (instance {{ $labels.instance }})
          description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: HostClockNotSynchronising
        expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock not synchronising (instance {{ $labels.instance }})
          description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
201  nixos/hosts/monitoring/provisioning/alerts/postgres-alerts.yml  Normal file
@@ -0,0 +1,201 @@
groups:
  - name: Postgres

    rules:
      - alert: PostgresqlDown
        expr: "pg_up == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql down (instance {{ $labels.instance }})
          description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlRestarted
        expr: "time() - pg_postmaster_start_time_seconds < 60"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql restarted (instance {{ $labels.instance }})
          description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlExporterError
        expr: "pg_exporter_last_scrape_error > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql exporter error (instance {{ $labels.instance }})
          description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTableNotAutoVacuumed
        expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
          description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTableNotAutoAnalyzed
        expr: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
          description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyConnections
        expr: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql too many connections (instance {{ $labels.instance }})
          description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlNotEnoughConnections
        expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Postgresql not enough connections (instance {{ $labels.instance }})
          description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlDeadLocks
        expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql dead locks (instance {{ $labels.instance }})
          description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRollbackRate
        expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql high rollback rate (instance {{ $labels.instance }})
          description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlCommitRateLow
        expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Postgresql commit rate low (instance {{ $labels.instance }})
          description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlLowXidConsumption
        expr: "rate(pg_txid_current[1m]) < 5"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql low XID consumption (instance {{ $labels.instance }})
          description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRateStatementTimeout
        expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
          description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlHighRateDeadlock
        expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
          description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlUnusedReplicationSlot
        expr: "pg_replication_slots_active == 0"
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Postgresql unused replication slot (instance {{ $labels.instance }})
          description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyDeadTuples
        expr: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1"
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
          description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlConfigurationChanged
        expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Postgresql configuration changed (instance {{ $labels.instance }})
          description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlSslCompressionActive
        expr: "sum(pg_stat_ssl_compression) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql SSL compression active (instance {{ $labels.instance }})
          description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlTooManyLocksAcquired
        expr: "((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20"
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
          description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlBloatIndexHigh(>80%)
        expr: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
          description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlBloatTableHigh(>80%)
        expr: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
          description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlInvalidIndex
        expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: Postgresql invalid index (instance {{ $labels.instance }})
          description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PostgresqlReplicationLag
        expr: "pg_replication_lag_seconds > 5"
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: Postgresql replication lag (instance {{ $labels.instance }})
          description: "The PostgreSQL replication lag is high (> 5s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
255
nixos/hosts/monitoring/provisioning/alerts/prometheus-alerts.yml
Normal file
255
nixos/hosts/monitoring/provisioning/alerts/prometheus-alerts.yml
Normal file
|
|
@ -0,0 +1,255 @@
|
|||
groups:
|
||||
- name: Prometheus
|
||||
|
||||
rules:
|
||||
- alert: PrometheusJobMissing
|
||||
expr: 'absent(up{job="prometheus"})'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus job missing (instance {{ $labels.instance }})
|
||||
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTargetMissing
|
||||
expr: "up == 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus target missing (instance {{ $labels.instance }})
|
||||
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusAllTargetsMissing
|
||||
expr: "sum by (job) (up) == 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus all targets missing (instance {{ $labels.instance }})
|
||||
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTargetMissingWithWarmupTime
|
||||
expr: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
|
||||
description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusConfigurationReloadFailure
|
||||
expr: "prometheus_config_last_reload_successful != 1"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
|
||||
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTooManyRestarts
|
||||
expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus too many restarts (instance {{ $labels.instance }})
|
||||
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusAlertmanagerJobMissing
|
||||
expr: 'absent(up{job="alertmanager"})'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
|
||||
description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusAlertmanagerConfigurationReloadFailure
|
||||
expr: "alertmanager_config_last_reload_successful != 1"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
|
||||
description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusAlertmanagerConfigNotSynced
|
||||
expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
|
||||
description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusAlertmanagerE2eDeadManSwitch
|
||||
expr: "vector(1)"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
|
||||
description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusNotConnectedToAlertmanager
|
||||
expr: "prometheus_notifications_alertmanagers_discovered < 1"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
|
||||
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusRuleEvaluationFailures
|
||||
expr: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
|
||||
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTemplateTextExpansionFailures
|
||||
expr: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
|
||||
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusRuleEvaluationSlow
|
||||
expr: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
|
||||
description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
      - alert: PrometheusNotificationsBacklog
        expr: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus notifications backlog (instance {{ $labels.instance }})
          description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerNotificationFailing
        expr: "rate(alertmanager_notifications_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
          description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetEmpty
        expr: "prometheus_sd_discovered_targets == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target empty (instance {{ $labels.instance }})
          description: "Prometheus has no targets in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: PrometheusTargetScrapingSlow
        expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
          description: "Prometheus is scraping exporters slowly because it exceeded the requested interval time. The Prometheus server may be under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusLargeScrape
        expr: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus large scrape (instance {{ $labels.instance }})
          description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetScrapeDuplicate
        expr: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
          description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: PrometheusTsdbCheckpointCreationFailures
        expr: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCheckpointDeletionFailures
        expr: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCompactionsFailed
        expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB compaction failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbHeadTruncationsFailed
        expr: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbReloadFailures
        expr: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: PrometheusTsdbWalCorruptions
        expr: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbWalTruncationsFailed
        expr: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTimeseriesCardinality
        expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
          description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
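For these rules to be evaluated, the generated file has to be listed in the Prometheus configuration. A minimal sketch of the relevant prometheus.yml fragment is shown below; the rules directory is an assumption for illustration, not a path taken from this commit.

# Sketch only: the rules path below is assumed; point it at wherever
# this repository actually installs the generated rule files.
rule_files:
  - /etc/prometheus/rules/*.yml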
@@ -0,0 +1,21 @@
groups:
  - name: Promtail

    rules:
      - alert: PromtailRequestErrors
        expr: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Promtail request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PromtailRequestLatency
        expr: "histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1"
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Promtail request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
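All of the rules above only attach a severity label; which receiver actually gets notified is decided by the Alertmanager routing tree. A minimal routing sketch, assuming receiver names that are not defined in this commit:

# Sketch only: both receiver names are placeholders.
route:
  receiver: telegram-warnings
  routes:
    - matchers:
        - severity="critical"
      receiver: telegram-critical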
@@ -0,0 +1,30 @@
groups:
  - name: Traefik

    rules:
      - alert: TraefikServiceDown
        expr: "count(traefik_service_server_up) by (service) == 0"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Traefik service down (instance {{ $labels.instance }})
          description: "All Traefik services are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: TraefikHighHttp4xxErrorRateService
        expr: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Traefik high HTTP 4xx error rate service (instance {{ $labels.instance }})
          description: "Traefik service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: TraefikHighHttp5xxErrorRateService
        expr: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
          description: "Traefik service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
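The Traefik rules above assume that Prometheus already scrapes Traefik's metrics endpoint. A minimal scrape-config sketch with a placeholder target; the real job definition lives elsewhere in this repository:

# Sketch only: job name and target address are assumed.
scrape_configs:
  - job_name: traefik
    static_configs:
      - targets: ["traefik.prod.global:8080"]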
@@ -0,0 +1,37 @@
{{ define "telegram.markdown.message" }}
|
||||
{{- if eq .Status "firing" -}}
|
||||
{{- if eq .CommonLabels.severity "critical" -}}
|
||||
🔴 Alert: {{ .CommonLabels.alertname }}
|
||||
{{- else if eq .CommonLabels.severity "warning" -}}
|
||||
🟠 Alert: {{ .CommonLabels.alertname }}
|
||||
{{- else -}}
|
||||
⚪️ Alert: {{ .CommonLabels.alertname }}
|
||||
{{- end }}
|
||||
Status: 🔥 FIRING
|
||||
Severity: {{ if eq .CommonLabels.severity "critical" }}🔴 {{ .CommonLabels.severity | title }}{{ else if eq .CommonLabels.severity "warning" }}🟠 {{ .CommonLabels.severity | title }}{{ else }}⚪️ {{ .CommonLabels.severity | title }}{{ end }}
|
||||
{{- else if eq .Status "resolved" -}}
|
||||
⚪️ Alert: {{ .CommonLabels.alertname }}
|
||||
Status: ✅ RESOLVED
|
||||
Severity: {{ if eq .CommonLabels.severity "critical" }}🟢 {{ .CommonLabels.severity | title }}{{ else if eq .CommonLabels.severity "warning" }}🟢 {{ .CommonLabels.severity | title }}{{ else }}⚪️ {{ .CommonLabels.severity | title }}{{ end }}
|
||||
{{- end }}
|
||||
|
||||
{{- range .Alerts -}}
|
||||
|
||||
{{- if .Labels.job }}
|
||||
Job: `{{ .Labels.job }}`
|
||||
{{- end }}
|
||||
|
||||
{{- if .Labels.namespace }}
|
||||
Namespace: `{{ .Labels.namespace }}`
|
||||
{{- end }}
|
||||
|
||||
{{- if .Labels.instance }}
|
||||
Instance: `{{ .Labels.instance }}`
|
||||
{{- end }}
|
||||
|
||||
{{- if .Annotations.runbook_url }}
|
||||
[RunbookURL]({{ .Annotations.runbook_url }})
|
||||
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{ end }}
|
||||
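This template only takes effect once Alertmanager loads the file and a Telegram receiver references the defined name. A minimal alertmanager.yml sketch; the template path, bot token and chat id are placeholders, not values from this commit:

# Sketch only: path and credentials are placeholders.
templates:
  - /etc/alertmanager/templates/*.tmpl
receivers:
  - name: telegram
    telegram_configs:
      - bot_token: "<telegram-bot-token>"
        chat_id: -1001234567890
        parse_mode: Markdown
        message: '{{ template "telegram.markdown.message" . }}'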
28 nixos/hosts/monitoring/provisioning/templates/telegram.tmpl Normal file
@@ -0,0 +1,28 @@
{{ define "alert_list" }}{{ range . }}
|
||||
---
|
||||
🪪 <b>{{ .Labels.alertname }}</b>
|
||||
{{- if eq .Labels.severity "critical" }}
|
||||
🚨 CRITICAL 🚨 {{ end }}
|
||||
{{- if eq .Labels.severity "warning" }}
|
||||
⚠️ WARNING ⚠️{{ end }}
|
||||
{{- if .Annotations.summary }}
|
||||
📝 {{ .Annotations.summary }}{{ end }}
|
||||
{{- if .Annotations.description }}
|
||||
📖 {{ .Annotations.description }}{{ end }}
|
||||
|
||||
🏷 Labels:
|
||||
{{ range .Labels.SortedPairs }} <i>{{ .Name }}</i>: <code>{{ .Value }}</code>
|
||||
{{ end }}{{ end }}
|
||||
🛠 <a href="https://grafana.prod.global:3000">Grafana</a> 💊 <a href="https://alertmanager.prod.global:9093">Alertmanager</a> 💊 <a href="https://">Any other link</a> 🛠
|
||||
{{ end }}
|
||||
|
||||
{{ define "telegram.message" }}
|
||||
{{ if gt (len .Alerts.Firing) 0 }}
|
||||
🔥 Alerts Firing 🔥
|
||||
{{ template "alert_list" .Alerts.Firing }}
|
||||
{{ end }}
|
||||
{{ if gt (len .Alerts.Resolved) 0 }}
|
||||
✅ Alerts Resolved ✅
|
||||
{{ template "alert_list" .Alerts.Resolved }}
|
||||
{{ end }}
|
||||
{{ end }}
|
||||
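Given its location under Grafana's provisioning directory, the telegram.message template is presumably referenced from a provisioned Telegram contact point. A rough sketch, assuming the settings keys of recent Grafana releases; uid and credentials are placeholders:

# Sketch only: uid, credentials and settings keys are assumptions;
# verify them against the Grafana version that is actually deployed.
apiVersion: 1
contactPoints:
  - orgId: 1
    name: telegram
    receivers:
      - uid: telegram
        type: telegram
        settings:
          bottoken: "<telegram-bot-token>"
          chatid: "<chat-id>"
          message: '{{ template "telegram.message" . }}'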