Alerts


/etc/prometheus/rules/mail_alerts.yml > mail
DovecotDown (0 active)
# Fires when the state="active" series for the dovecot.service systemd unit
# has been equal to 0 for two minutes.
alert: DovecotDown
expr: node_systemd_unit_state{name="dovecot.service",state="active"} == 0
for: 2m
labels:
  severity: critical
  service: dovecot
annotations:
  summary: 'Dovecot is down on {{ $labels.instance }}'
  description: 'Dovecot IMAP service has been inactive for more than 2 minutes'
PostfixDown (0 active)
# Fires when the state="active" series for the postfix.service systemd unit
# has been equal to 0 for two minutes.
alert: PostfixDown
expr: node_systemd_unit_state{name="postfix.service",state="active"} == 0
for: 2m
labels:
  severity: critical
  service: postfix
annotations:
  summary: 'Postfix is down on {{ $labels.instance }}'
  description: 'Postfix mail service has been inactive for more than 2 minutes'
PostfixMailQueueCritical (0 active)
# Critical tier of the mail-queue alerts: node_postfix_queue_size has stayed
# above 200 for five minutes.
alert: PostfixMailQueueCritical
expr: node_postfix_queue_size > 200
for: 5m
labels:
  severity: critical
  service: postfix
annotations:
  summary: 'Postfix mail queue critical on {{ $labels.instance }}'
  description: 'Mail queue has {{ $value }} messages — possible delivery failure'
PostfixMailQueueGrowing (0 active)
# Warning tier of the mail-queue alerts: node_postfix_queue_size has stayed
# above 50 for fifteen minutes.
alert: PostfixMailQueueGrowing
expr: node_postfix_queue_size > 50
for: 15m
labels:
  severity: warning
  service: postfix
annotations:
  summary: 'Postfix mail queue growing on {{ $labels.instance }}'
  description: 'Mail queue has {{ $value }} messages (threshold: 50)'
/etc/prometheus/rules/system_alerts.yml > system
ServerDown (1 active)
# Fires for any scrape target whose `up` series has been 0 for one minute.
# The expression has no label filter, so every job and instance is covered.
alert: ServerDown
expr: up == 0
for: 1m
labels:
  severity: critical
annotations:
  summary: 'Instance {{ $labels.instance }} is down'
  description: '{{ $labels.instance }} ({{ $labels.job }}) has been unreachable for more than 1 minute'
Labels State Active Since Value
alertname="ServerDown" instance="localhost:9093" job="alertmanager" severity="critical" firing 2026-04-28 18:46:47.865744121 +0000 UTC 0
SystemdServiceFailed (3 active)
# Fires for every systemd unit whose state="failed" series has been 1 for two
# minutes. The unit name is not filtered, so each failed unit alerts separately.
alert: SystemdServiceFailed
expr: node_systemd_unit_state{state="failed"} == 1
for: 2m
labels:
  severity: warning
annotations:
  summary: 'Systemd service failed on {{ $labels.instance }}'
  description: 'Service {{ $labels.name }} is in failed state'
Labels State Active Since Value
alertname="SystemdServiceFailed" instance="mwaponda.com" job="node" name="prometheus-alertmanager.service" severity="warning" state="failed" type="simple" firing 2026-04-28 18:46:47.865744121 +0000 UTC 1
alertname="SystemdServiceFailed" instance="mwaponda.com" job="node" name="openipmi.service" severity="warning" state="failed" type="forking" firing 2026-04-28 18:47:17.865744121 +0000 UTC 1
alertname="SystemdServiceFailed" instance="mwaponda.com" job="node" name="logrotate.service" severity="warning" state="failed" type="oneshot" firing 2026-04-29 00:00:17.865744121 +0000 UTC 1
CriticalCPULoad (0 active)
# Critical tier of the CPU alerts. The expression is 100 minus the
# per-instance average 5m idle rate scaled to percent; it must exceed 95
# for two minutes.
alert: CriticalCPULoad
expr: >-
  100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
for: 2m
labels:
  severity: critical
annotations:
  summary: 'Critical CPU load on {{ $labels.instance }}'
  description: 'CPU usage is {{ printf "%.1f" $value }}% (threshold: 95%)'
CriticalMemoryUsage (0 active)
# Critical tier of the memory alerts. The expression is one minus the ratio
# MemAvailable / MemTotal, scaled to percent; it must exceed 95 for two
# minutes.
alert: CriticalMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
for: 2m
labels:
  severity: critical
annotations:
  summary: 'Critical memory usage on {{ $labels.instance }}'
  # The threshold is stated in the message, matching the HighMemoryUsage and
  # CPU alert descriptions, so a notification is self-explanatory.
  description: 'Memory usage is {{ printf "%.1f" $value }}% (threshold: 95%)'
DiskSpaceCritical (0 active)
# Critical tier of the disk-usage alerts. Used percentage is computed as
# one minus avail / size, per filesystem series, skipping the tmpfs and
# fuse.lxcfs filesystem types; it must exceed 90 for two minutes.
alert: DiskSpaceCritical
expr: >-
  (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"}
  / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"}))
  * 100 > 90
for: 2m
labels:
  severity: critical
annotations:
  summary: 'Critical disk space on {{ $labels.instance }}'
  description: 'Disk {{ $labels.mountpoint }} is {{ printf "%.1f" $value }}% full'
DiskSpaceWarning (0 active)
# Warning tier of the disk-usage alerts. Used percentage is computed as
# one minus avail / size, per filesystem series, skipping the tmpfs and
# fuse.lxcfs filesystem types; it must exceed 80 for five minutes.
alert: DiskSpaceWarning
expr: >-
  (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"}
  / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"}))
  * 100 > 80
for: 5m
labels:
  severity: warning
annotations:
  summary: 'Disk space warning on {{ $labels.instance }}'
  description: 'Disk {{ $labels.mountpoint }} is {{ printf "%.1f" $value }}% full'
DiskWillFillIn24h (0 active)
# Trend-based disk alert. It linearly extrapolates the last 6h of available
# bytes 24 * 3600 seconds ahead (tmpfs and fuse.lxcfs excluded) and fires
# when the projection has been below zero for thirty minutes.
alert: DiskWillFillIn24h
expr: >-
  predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"}[6h], 24 * 3600)
  < 0
for: 30m
labels:
  severity: warning
annotations:
  summary: 'Disk will fill within 24h on {{ $labels.instance }}'
  description: '{{ $labels.mountpoint }} is predicted to be full within 24 hours'
HighCPULoad (0 active)
# Warning tier of the CPU alerts. The expression is 100 minus the
# per-instance average 5m idle rate scaled to percent; it must exceed 85
# for five minutes.
alert: HighCPULoad
expr: >-
  100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
for: 5m
labels:
  severity: warning
annotations:
  summary: 'High CPU load on {{ $labels.instance }}'
  description: 'CPU usage is {{ printf "%.1f" $value }}% (threshold: 85%)'
HighLoadAverage (0 active)
# Normalises node_load15 by the number of idle-mode CPU series per instance
# (used as the CPU count) and fires when the quotient has been above 0.8
# for ten minutes.
alert: HighLoadAverage
expr: >-
  node_load15 / on (instance) count by (instance) (node_cpu_seconds_total{mode="idle"}) > 0.8
for: 10m
labels:
  severity: warning
annotations:
  summary: 'High load average on {{ $labels.instance }}'
  description: '15-minute load average per CPU is {{ printf "%.2f" $value }}'
HighMemoryUsage (0 active)
# Warning tier of the memory alerts. The expression is one minus the ratio
# MemAvailable / MemTotal, scaled to percent; it must exceed 85 for five
# minutes.
alert: HighMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
for: 5m
labels:
  severity: warning
annotations:
  summary: 'High memory usage on {{ $labels.instance }}'
  description: 'Memory usage is {{ printf "%.1f" $value }}% (threshold: 85%)'
UnexpectedReboot (0 active)
# Fires while the difference between the reported node time and the boot
# time is under 300 seconds. No `for:` clause is set, so there is no hold
# period before the alert fires.
alert: UnexpectedReboot
expr: node_time_seconds - node_boot_time_seconds < 300
labels:
  severity: warning
annotations:
  summary: 'Server rebooted: {{ $labels.instance }}'
  description: 'Server has been up for less than 5 minutes — possible unexpected reboot'
/etc/prometheus/rules/web_alerts.yml > web
ApacheDown (0 active)
# Fires when the state="active" series for the apache2.service systemd unit
# has been equal to 0 for one minute.
alert: ApacheDown
expr: node_systemd_unit_state{name="apache2.service",state="active"} == 0
for: 1m
labels:
  severity: critical
  service: apache2
annotations:
  summary: 'Apache is down on {{ $labels.instance }}'
  description: 'Apache web server has been inactive for more than 1 minute'
NginxDown (0 active)
# Fires when the state="active" series for the nginx.service systemd unit
# has been equal to 0 for one minute.
alert: NginxDown
expr: node_systemd_unit_state{name="nginx.service",state="active"} == 0
for: 1m
labels:
  severity: critical
  service: nginx
annotations:
  summary: 'Nginx is down on {{ $labels.instance }}'
  description: 'Nginx web server has been inactive for more than 1 minute'
SSLCertExpiringCritical (0 active)
# Critical tier of the certificate alerts. The expression is the remaining
# lifetime (earliest cert expiry minus the current time); it must stay below
# 86400 * 7 seconds (7 days) for one hour.
alert: SSLCertExpiringCritical
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 7
for: 1h
labels:
  severity: critical
annotations:
  summary: 'SSL certificate expiring in 7 days: {{ $labels.instance }}'
  description: 'Certificate expires in {{ $value | humanizeDuration }} — renew immediately'
SSLCertExpiringWarning (0 active)
# Warning tier of the certificate alerts. The expression is the remaining
# lifetime (earliest cert expiry minus the current time); it must stay below
# 86400 * 30 seconds (30 days) for one hour.
alert: SSLCertExpiringWarning
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
for: 1h
labels:
  severity: warning
annotations:
  summary: 'SSL certificate expiring soon: {{ $labels.instance }}'
  description: 'Certificate expires in {{ $value | humanizeDuration }}'