Alerts

Inactive (17) Pending (0) Firing (2)

/etc/prometheus/rules/mail_alerts.yml > mail

DovecotDown (0 active)

# DovecotDown: raised once node_systemd_unit_state has reported 0 for the
# "active" state of dovecot.service continuously for 2 minutes.
alert: DovecotDown
expr: >-
  node_systemd_unit_state{name="dovecot.service",state="active"} == 0
for: 2m
labels:
  severity: critical
  service: dovecot
annotations:
  summary: 'Dovecot is down on {{ $labels.instance }}'
  description: >-
    Dovecot IMAP service has been inactive for more than 2 minutes

PostfixDown (0 active)

# PostfixDown: raised once node_systemd_unit_state has reported 0 for the
# "active" state of postfix.service continuously for 2 minutes.
alert: PostfixDown
expr: >-
  node_systemd_unit_state{name="postfix.service",state="active"} == 0
for: 2m
labels:
  severity: critical
  service: postfix
annotations:
  summary: 'Postfix is down on {{ $labels.instance }}'
  description: >-
    Postfix mail service has been inactive for more than 2 minutes

PostfixMailQueueCritical (0 active)

# PostfixMailQueueCritical: raised when node_postfix_queue_size has stayed
# above 200 for 5 minutes. The description reports the sampled value.
alert: PostfixMailQueueCritical
expr: >-
  node_postfix_queue_size > 200
for: 5m
labels:
  severity: critical
  service: postfix
annotations:
  summary: 'Postfix mail queue critical on {{ $labels.instance }}'
  description: >-
    Mail queue has {{ $value }} messages — possible delivery failure

PostfixMailQueueGrowing (0 active)

# PostfixMailQueueGrowing: raised when node_postfix_queue_size has stayed
# above 50 for 15 minutes. Note the expression is a plain threshold check;
# it does not measure a rate of growth.
alert: PostfixMailQueueGrowing
expr: >-
  node_postfix_queue_size > 50
for: 15m
labels:
  severity: warning
  service: postfix
annotations:
  summary: 'Postfix mail queue growing on {{ $labels.instance }}'
  description: >-
    Mail queue has {{ $value }} messages (threshold: 50)

/etc/prometheus/rules/system_alerts.yml > system

ServerDown (1 active)

# ServerDown: raised for every series whose `up` value has been 0 for
# 1 minute. The annotations identify the series by its instance and job
# labels.
alert: ServerDown
expr: >-
  up == 0
for: 1m
labels:
  severity: critical
annotations:
  summary: 'Instance {{ $labels.instance }} is down'
  description: >-
    {{ $labels.instance }} ({{ $labels.job }}) has been unreachable
    for more than 1 minute

Labels	State	Active Since	Value
alertname="ServerDown" instance="localhost:9093" job="alertmanager" severity="critical"	firing	2026-04-28 18:46:47.865744121 +0000 UTC	0
Annotations
description localhost:9093 (alertmanager) has been unreachable for more than 1 minute summary Instance localhost:9093 is down

SystemdServiceFailed (3 active)

# SystemdServiceFailed: raised per unit when its
# node_systemd_unit_state{state="failed"} series has equalled 1 for
# 2 minutes. No unit-name filter is applied, so any unit can trigger it.
alert: SystemdServiceFailed
expr: >-
  node_systemd_unit_state{state="failed"} == 1
for: 2m
labels:
  severity: warning
annotations:
  summary: 'Systemd service failed on {{ $labels.instance }}'
  description: >-
    Service {{ $labels.name }} is in failed state

Labels	State	Active Since	Value
alertname="SystemdServiceFailed" instance="mwaponda.com" job="node" name="prometheus-alertmanager.service" severity="warning" state="failed" type="simple"	firing	2026-04-28 18:46:47.865744121 +0000 UTC	1
Annotations
description Service prometheus-alertmanager.service is in failed state summary Systemd service failed on mwaponda.com
alertname="SystemdServiceFailed" instance="mwaponda.com" job="node" name="openipmi.service" severity="warning" state="failed" type="forking"	firing	2026-04-28 18:47:17.865744121 +0000 UTC	1
Annotations
description Service openipmi.service is in failed state summary Systemd service failed on mwaponda.com
alertname="SystemdServiceFailed" instance="mwaponda.com" job="node" name="logrotate.service" severity="warning" state="failed" type="oneshot"	firing	2026-04-29 00:00:17.865744121 +0000 UTC	1
Annotations
description Service logrotate.service is in failed state summary Systemd service failed on mwaponda.com

CriticalCPULoad (0 active)

# CriticalCPULoad: computes CPU usage as 100 minus the per-instance average
# 5-minute rate of idle CPU time (scaled by 100) and raises the alert when
# that stays above 95 for 2 minutes.
alert: CriticalCPULoad
# Folded scalar keeps the leading "100 - (" together so the subtraction
# cannot be misread as a YAML list item.
expr: >-
  100 - (avg by (instance)
  (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
for: 2m
labels:
  severity: critical
annotations:
  summary: 'Critical CPU load on {{ $labels.instance }}'
  description: >-
    CPU usage is {{ printf "%.1f" $value }}% (threshold: 95%)

CriticalMemoryUsage (0 active)

# CriticalMemoryUsage: computes memory usage as
# (1 - MemAvailable / MemTotal) * 100 and raises the alert when that stays
# above 95 for 2 minutes.
alert: CriticalMemoryUsage
expr: >-
  (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
  * 100 > 95
for: 2m
labels:
  severity: critical
annotations:
  summary: 'Critical memory usage on {{ $labels.instance }}'
  # The threshold is spelled out so the message reads the same way as the
  # HighMemoryUsage, HighCPULoad and CriticalCPULoad descriptions.
  description: >-
    Memory usage is {{ printf "%.1f" $value }}% (threshold: 95%)

DiskSpaceCritical (0 active)

# DiskSpaceCritical: computes the used percentage of each filesystem as
# (1 - avail / size) * 100, skipping fstype values matching
# "tmpfs|fuse.lxcfs", and raises the alert when it stays above 90 for
# 2 minutes.
alert: DiskSpaceCritical
expr: >-
  (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"}
  / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"}))
  * 100 > 90
for: 2m
labels:
  severity: critical
annotations:
  summary: 'Critical disk space on {{ $labels.instance }}'
  description: >-
    Disk {{ $labels.mountpoint }} is {{ printf "%.1f" $value }}% full

DiskSpaceWarning (0 active)

# DiskSpaceWarning: computes the used percentage of each filesystem as
# (1 - avail / size) * 100, skipping fstype values matching
# "tmpfs|fuse.lxcfs", and raises the alert when it stays above 80 for
# 5 minutes.
alert: DiskSpaceWarning
expr: >-
  (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"}
  / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"}))
  * 100 > 80
for: 5m
labels:
  severity: warning
annotations:
  summary: 'Disk space warning on {{ $labels.instance }}'
  description: >-
    Disk {{ $labels.mountpoint }} is {{ printf "%.1f" $value }}% full

DiskWillFillIn24h (0 active)

# DiskWillFillIn24h: extrapolates node_filesystem_avail_bytes from its last
# 6 hours of samples to 24 * 3600 seconds ahead (fstype values matching
# "tmpfs|fuse.lxcfs" are skipped) and raises the alert when the projected
# value has been below 0 for 30 minutes.
alert: DiskWillFillIn24h
expr: >-
  predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"}[6h],
  24 * 3600) < 0
for: 30m
labels:
  severity: warning
annotations:
  summary: 'Disk will fill within 24h on {{ $labels.instance }}'
  description: >-
    {{ $labels.mountpoint }} is predicted to be full within 24 hours

HighCPULoad (0 active)

# HighCPULoad: computes CPU usage as 100 minus the per-instance average
# 5-minute rate of idle CPU time (scaled by 100) and raises the alert when
# that stays above 85 for 5 minutes.
alert: HighCPULoad
# Folded scalar keeps the leading "100 - (" together so the subtraction
# cannot be misread as a YAML list item.
expr: >-
  100 - (avg by (instance)
  (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
for: 5m
labels:
  severity: warning
annotations:
  summary: 'High CPU load on {{ $labels.instance }}'
  description: >-
    CPU usage is {{ printf "%.1f" $value }}% (threshold: 85%)

HighLoadAverage (0 active)

# HighLoadAverage: divides node_load15 by the per-instance count of
# node_cpu_seconds_total{mode="idle"} series and raises the alert when the
# quotient stays above 0.8 for 10 minutes.
alert: HighLoadAverage
expr: >-
  node_load15 / on (instance)
  count by (instance) (node_cpu_seconds_total{mode="idle"}) > 0.8
for: 10m
labels:
  severity: warning
annotations:
  summary: 'High load average on {{ $labels.instance }}'
  description: >-
    15-minute load average per CPU is {{ printf "%.2f" $value }}

HighMemoryUsage (0 active)

# HighMemoryUsage: computes memory usage as
# (1 - MemAvailable / MemTotal) * 100 and raises the alert when that stays
# above 85 for 5 minutes.
alert: HighMemoryUsage
expr: >-
  (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
  * 100 > 85
for: 5m
labels:
  severity: warning
annotations:
  summary: 'High memory usage on {{ $labels.instance }}'
  description: >-
    Memory usage is {{ printf "%.1f" $value }}% (threshold: 85%)

UnexpectedReboot (0 active)

# UnexpectedReboot: raised while node_time_seconds minus
# node_boot_time_seconds is below 300. This rule sets no `for:` duration.
alert: UnexpectedReboot
expr: >-
  node_time_seconds - node_boot_time_seconds < 300
labels:
  severity: warning
annotations:
  summary: 'Server rebooted: {{ $labels.instance }}'
  description: >-
    Server has been up for less than 5 minutes — possible unexpected reboot

/etc/prometheus/rules/web_alerts.yml > web

ApacheDown (0 active)

# ApacheDown: raised once node_systemd_unit_state has reported 0 for the
# "active" state of apache2.service continuously for 1 minute.
alert: ApacheDown
expr: >-
  node_systemd_unit_state{name="apache2.service",state="active"} == 0
for: 1m
labels:
  severity: critical
  service: apache2
annotations:
  summary: 'Apache is down on {{ $labels.instance }}'
  description: >-
    Apache web server has been inactive for more than 1 minute

NginxDown (0 active)

# NginxDown: raised once node_systemd_unit_state has reported 0 for the
# "active" state of nginx.service continuously for 1 minute.
alert: NginxDown
expr: >-
  node_systemd_unit_state{name="nginx.service",state="active"} == 0
for: 1m
labels:
  severity: critical
  service: nginx
annotations:
  summary: 'Nginx is down on {{ $labels.instance }}'
  description: >-
    Nginx web server has been inactive for more than 1 minute

SSLCertExpiringCritical (0 active)

# SSLCertExpiringCritical: raised when probe_ssl_earliest_cert_expiry minus
# the current time has been below 86400 * 7 (seven days of seconds) for
# 1 hour. The description renders the remaining time via humanizeDuration.
alert: SSLCertExpiringCritical
expr: >-
  probe_ssl_earliest_cert_expiry - time() < 86400 * 7
for: 1h
labels:
  severity: critical
annotations:
  # "within", not "in": the rule matches any remaining lifetime below the
  # 7-day limit, not only certificates that are exactly 7 days from expiry.
  summary: 'SSL certificate expiring within 7 days: {{ $labels.instance }}'
  description: >-
    Certificate expires in {{ $value | humanizeDuration }} — renew immediately

SSLCertExpiringWarning (0 active)

# SSLCertExpiringWarning: raised when probe_ssl_earliest_cert_expiry minus
# the current time has been below 86400 * 30 (thirty days of seconds) for
# 1 hour. The description renders the remaining time via humanizeDuration.
alert: SSLCertExpiringWarning
expr: >-
  probe_ssl_earliest_cert_expiry - time() < 86400 * 30
for: 1h
labels:
  severity: warning
annotations:
  summary: 'SSL certificate expiring soon: {{ $labels.instance }}'
  description: >-
    Certificate expires in {{ $value | humanizeDuration }}