PromQL queries, alerting rules, recording rules, Grafana dashboard JSON, SLO
# Request rate (per second, averaged over a 5m window).
# No aggregation is applied, so the result has one series per label set of
# http_requests_total; wrap in sum(...) for a single total.
rate(http_requests_total[5m])
# Share of requests answered with a 5xx status, as a percentage (5m window).
# Numerator: per-second rate of 5xx responses; denominator: rate of all requests.
(
    sum(rate(http_requests_total{status=~"5.."}[5m]))
  /
    sum(rate(http_requests_total[5m]))
) * 100
# 99th-percentile request latency, estimated from histogram buckets (5m window).
# Bucket rates are summed across all series, keeping only the `le` label that
# histogram_quantile needs.
histogram_quantile(
  0.99,
  sum by (le) (rate(http_request_duration_seconds_bucket[5m]))
)
# Median (P50) request latency per handler, estimated from histogram buckets
# (5m window). `le` is kept for histogram_quantile; `handler` splits the result
# into one series per endpoint.
histogram_quantile(
  0.50,
  sum by (le, handler) (rate(http_request_duration_seconds_bucket[5m]))
)
# Saturation: CPU usage per pod, as a percentage of the pod's total CPU limit.
# cAdvisor also exports a pod-level cgroup series (container="") that already
# contains the sum of the pod's containers, plus a pause-container series
# (container="POD") on some runtimes. Both are excluded so that summing by pod
# does not double-count usage.
# Pods with no CPU limit have no denominator series and drop out of the result.
sum(rate(container_cpu_usage_seconds_total{container!="", container!="POD"}[5m])) by (pod)
/ sum(kube_pod_container_resource_limits{resource="cpu"}) by (pod) * 100
# SLO: 99.9% availability over 30 days
# Error budget = 0.1% = 43.2 minutes/month
# Current burn rate (how fast the error budget is being consumed):
#   burn rate = observed error ratio / allowed error ratio
#   1 means the budget lasts exactly the SLO window; >1 means it runs out early.
# The outer parentheses are required: `/` binds tighter than `-` in PromQL, so
# without them the expression would evaluate as 1 - (success_ratio / 0.001).
(
  1 - (
    sum(rate(http_requests_total{status!~"5.."}[1h]))
    / sum(rate(http_requests_total[1h]))
  )
) / (1 - 0.999)
# Remaining error budget (percentage of the 30-day budget still unspent).
#   allowed errors = total requests * 0.001   (99.9% SLO)
#   remaining      = (1 - actual errors / allowed errors) * 100
# 100 means no budget has been used; 0 means it is exhausted; negative values
# mean the SLO has been violated.
(
  1 - (
    sum(increase(http_requests_total{status=~"5.."}[30d]))
    / (sum(increase(http_requests_total[30d])) * 0.001)
  )
) * 100