Alerting
Alerting
Production-ready Prometheus alerting rules for Cbox FPM Exporter metrics.
Complete Alert Rules
Save as fpm-exporter-alerts.yml:
groups:
# PHP-FPM pool rules; this group is evaluated every 30 seconds.
- name: phpfpm
  interval: 30s
  rules:
  # Critical: phpfpm_up has been 0 for a full minute.
  - alert: PHPFPMDown
    expr: phpfpm_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "PHP-FPM pool {{ $labels.pool }} is down"
      description: "Cannot scrape PHP-FPM status from {{ $labels.socket }}"

  # Warning: active processes above 85% of the configured pm.max_children
  # for five minutes.
  - alert: PHPFPMHighUtilization
    expr: phpfpm_active_processes / phpfpm_pm_max_children_config > 0.85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "PHP-FPM pool {{ $labels.pool }} at {{ $value | humanizePercentage }} capacity"
      description: "Consider increasing pm.max_children or adding more workers"

  # Critical: phpfpm_max_children_reached increased within the last
  # 5 minutes. No `for` clause is set on this rule.
  - alert: PHPFPMMaxChildrenReached
    expr: increase(phpfpm_max_children_reached[5m]) > 0
    labels:
      severity: critical
    annotations:
      summary: "PHP-FPM pool {{ $labels.pool }} hit max_children limit"
      description: "Requests are being queued. Increase pm.max_children."

  # Warning: listen queue above 10 for two minutes.
  - alert: PHPFPMListenQueueHigh
    expr: phpfpm_listen_queue > 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "PHP-FPM pool {{ $labels.pool }} has {{ $value }} queued requests"
      description: "Requests are waiting for available workers"

  # Warning: per-second rate of phpfpm_slow_requests over a 5 minute
  # window is above 0.1.
  - alert: PHPFPMSlowRequests
    expr: rate(phpfpm_slow_requests[5m]) > 0.1
    labels:
      severity: warning
    annotations:
      summary: "PHP-FPM pool {{ $labels.pool }} has slow requests"
      description: "Requests exceeding slowlog timeout"
# Opcache rules; this group is evaluated every 30 seconds.
- name: opcache
  interval: 30s
  rules:
  # Warning: hit rate below 85 for ten minutes.
  - alert: OpcacheLowHitRate
    expr: phpfpm_opcache_hit_rate < 85
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Opcache hit rate {{ $value | printf \"%.1f\" }}% on {{ $labels.pool }}"
      description: "Cold cache or undersized opcache settings"

  # Warning: wasted-memory percentage above 10 for five minutes.
  - alert: OpcacheHighWaste
    expr: phpfpm_opcache_wasted_memory_percent > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Opcache {{ $value | printf \"%.1f\" }}% wasted on {{ $labels.pool }}"
      description: "Consider restarting PHP-FPM or tuning validate_timestamps"

  # Critical: phpfpm_opcache_oom_restarts_total increased within the
  # last hour. No `for` clause is set on this rule.
  - alert: OpcacheOOMRestarts
    expr: increase(phpfpm_opcache_oom_restarts_total[1h]) > 0
    labels:
      severity: critical
    annotations:
      summary: "Opcache OOM restart on {{ $labels.pool }}"
      description: "Increase opcache.memory_consumption"

  # Warning: free opcache memory below 10000000 bytes for five minutes.
  - alert: OpcacheMemoryLow
    expr: phpfpm_opcache_free_memory_bytes < 10000000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Opcache <10MB free on {{ $labels.pool }}"
      description: "Cache may start evicting scripts"
# Laravel application rules; this group is evaluated every 30 seconds.
#
# Rules limited to production join against laravel_app_info{env="production"}.
# The join uses `and on (instance, job, site)` because a bare `and` only
# matches series whose label sets are identical; laravel_app_info carries at
# least an `env` label, so a bare `and` would silently never match gauges
# that lack it.
# NOTE(review): assumes laravel_app_info exposes the same `site` label as the
# gauges it is joined with - confirm against the exporter's /metrics output.
- name: laravel
  interval: 30s
  rules:
  # Warning: queue size above 500 for five minutes.
  - alert: LaravelQueueBacklog
    expr: laravel_queue_size > 500
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Queue backlog {{ $value }} on {{ $labels.site }}/{{ $labels.queue }}"
      description: "Jobs accumulating faster than workers can process"

  # Critical: queue size above 5000 for two minutes.
  - alert: LaravelQueueCritical
    expr: laravel_queue_size > 5000
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical queue backlog {{ $value }} on {{ $labels.site }}/{{ $labels.queue }}"
      description: "Immediate attention required"

  # Critical: debug mode is on for a site whose app info reports
  # env="production". No `for` clause is set on this rule.
  - alert: LaravelDebugEnabled
    expr: >-
      laravel_debug_mode == 1
      and on (instance, job, site)
      laravel_app_info{env="production"}
    labels:
      severity: critical
    annotations:
      summary: "Debug mode enabled on production {{ $labels.site }}"
      description: "Security risk - disable APP_DEBUG"

  # Warning: maintenance mode has been on for thirty minutes.
  - alert: LaravelMaintenanceExtended
    expr: laravel_maintenance_mode == 1
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.site }} in maintenance >30 minutes"
      description: "Is maintenance intentional?"

  # Warning: config cache is off for a production site.
  - alert: LaravelConfigNotCached
    expr: >-
      laravel_cache_config == 0
      and on (instance, job, site)
      laravel_app_info{env="production"}
    labels:
      severity: warning
    annotations:
      summary: "Config not cached on {{ $labels.site }}"
      description: "Run 'php artisan config:cache' for better performance"

  # Warning: route cache is off for a production site.
  - alert: LaravelRoutesNotCached
    expr: >-
      laravel_cache_routes == 0
      and on (instance, job, site)
      laravel_app_info{env="production"}
    labels:
      severity: warning
    annotations:
      summary: "Routes not cached on {{ $labels.site }}"
      description: "Run 'php artisan route:cache' for better performance"
Loading Rules
Add to your Prometheus configuration:
# prometheus.yml
# Rule files Prometheus loads; points at the alert rules file saved above.
rule_files:
- /etc/prometheus/rules/fpm-exporter-alerts.yml

# Alertmanager endpoint(s) that alerts are sent to.
alerting:
  alertmanagers:
  - static_configs:
    - targets: ['alertmanager:9093']
Reload Prometheus:
# POST to the reload endpoint of the Prometheus server on localhost:9090.
# NOTE: /-/reload is only served when Prometheus was started with
# --web.enable-lifecycle; otherwise send SIGHUP to the process instead.
curl -X POST http://localhost:9090/-/reload
Severity Levels
| Level | Response | Example |
|---|---|---|
| critical | Immediate (page) | FPM down, max_children reached |
| warning | Next business day | High utilization, queue backlog |
| info | Review weekly | Cache not enabled |
Alertmanager Routing
Example routing by severity:
# alertmanager.yml
# Routing tree: alerts not claimed by a child route fall through to the
# default receiver, 'slack-warnings'.
route:
  receiver: 'slack-warnings'
  routes:
  # `matchers` replaces the deprecated `match` key (Alertmanager >= 0.22).
  # Critical alerts go to PagerDuty.
  - matchers:
    - 'severity = "critical"'
    receiver: 'pagerduty'
  # Warning alerts go to Slack (same receiver as the default, kept explicit).
  - matchers:
    - 'severity = "warning"'
    receiver: 'slack-warnings'

# Receiver definitions referenced by the routes above; replace the
# placeholder key and webhook URL with real values.
receivers:
- name: 'pagerduty'
  pagerduty_configs:
  - service_key: '<key>'
- name: 'slack-warnings'
  slack_configs:
  - api_url: 'https://hooks.slack.com/...'
    channel: '#alerts'
Next Steps
- Grafana Dashboards - Visualize metrics
- Kubernetes Deployment - Production deployment