Cultivates system resilience through controlled chaos experiments on Kubernetes clusters and distributed services
Chaos Gardener automates controlled chaos engineering experiments to validate system resilience. It introduces calibrated failures into production-like environments to uncover weaknesses before they cause outages.
chaos-gardener inject pod-kill --namespace <ns> --label-selector <selector> --percentage <0-100> --duration <time>
chaos-gardener inject network-latency --target <service> --latency-ms <ms> --jitter <ms> --duration <time>
chaos-gardener inject network-loss --target <pod> --loss-percent <0-100> --duration <time>
chaos-gardener inject io-throttle --device <path> --read-bps <bytes> --write-bps <bytes> --duration <time>
chaos-gardener inject cpu-stress --target <pod> --load <0-100> --duration <time>
chaos-gardener inject memory-fill --target <pod> --fill-percent <0-100> --duration <time>
chaos-gardener inject time-shift --target <pod> --offset <seconds> --duration <time>
chaos-gardener inject dns-chaos --domain <domain> --failure-rate <0-100> --duration <time>
chaos-gardener experiment create <name> --scenario <scenario-file.yaml> --target-env <environment>
chaos-gardener experiment run <experiment-id> --dry-run --verify
chaos-gardener experiment status <experiment-id>
chaos-gardener experiment abort <experiment-id>
chaos-gardener verify scaling --deployment <name> --expected-replicas <count> --timeout <duration>
chaos-gardener verify recovery --service <name> --success-rate <percent> --window <duration>
chaos-gardener verify metrics --prometheus-url <url> --alert <alert-name> --threshold <value>
chaos-gardener report generate <experiment-id> --output <format> --include-metrics --slack-notify <channel>
chaos-gardener schedule create <cron> --experiment <id> --approval-required
chaos-gardener schedule list --namespace <ns>
chaos-gardener safety check --experiment <scenario> --blocklist <services>
CHAOS_GARDENER_NAMESPACE: Default namespace for experiments (default: chaos-testing)
CHAOS_GARDENER_PROMETHEUS_URL: Prometheus endpoint for metric verification
CHAOS_GARDENER_SLACK_WEBHOOK: Webhook for experiment notifications
CHAOS_GARDENER_MAX_CONCURRENT_EXPERIMENTS: Limit parallel experiments (default: 3)
CHAOS_GARDENER_AUTO_ABORT_ON_CRITICAL_ALERT: Auto-abort if critical Prometheus alert fires (default: true)
# Clone experiment repository
git clone https://github.com/org/chaos-scenarios.git /tmp/chaos-scenarios
cd /tmp/chaos-scenarios
# Select environment
export CHAOS_GARDENER_NAMESPACE=staging
export CHAOS_GARDENER_PROMETHEUS_URL=http://prometheus.monitoring.svc:9090
# Verify cluster access and safety
chaos-gardener safety check --experiment scenarios/ecommerce/checkout-failure.yaml \
--blocklist "payment-gateway,auth-service"
Expected output: ✓ Safety check passed: 0 blocked services found
# Create experiment from scenario file
chaos-gardener experiment create checkout-resilience \
--scenario scenarios/ecommerce/checkout-failure.yaml \
--target-env staging
# Preview injected faults (dry-run)
chaos-gardener experiment run checkout-resilience-001 \
--dry-run --verify
# Expected output includes:
# - Target pods: 12
# - Faults to inject: Pod kill (30%), Network latency (500ms), DB connection drop (5s)
# - Verification checks: 5
# - Estimated blast radius: Low (non-payment services)
# Run experiment with real-time monitoring
chaos-gardener experiment run checkout-resilience-001
# In separate terminal, monitor metrics:
watch -n 5 'curl -sG http://prometheus.monitoring.svc:9090/api/v1/query --data-urlencode "query=rate(http_requests_total{service=\"checkout\"}[5m])" | jq'
# Monitor experiment status
chaos-gardener experiment status checkout-resilience-001
Expected status progression: INITIALIZING → INJECTING → STEADY_STATE → VERIFYING → COMPLETED
# Wait for experiment completion, verify metrics automatically:
chaos-gardener verify recovery --service checkout-service \
--success-rate 99.5 --window 2m
# Expected output: `✓ Recovery verified: 99.8% success rate over 120s window`
# Verify auto-scaling behavior
chaos-gardener verify scaling --deployment checkout-api \
--expected-replicas 5 --timeout 90s
Expected output: ✓ Scaling verified: reached 5 replicas in 87s
chaos-gardener report generate checkout-resilience-001 \
--output html --include-metrics \
--slack-notify "#chaos-engineering"
# Report includes:
# - Timeline of injected faults
# - Metric graphs (latency, error rate, throughput)
# - Service impact matrix
# - Resilience score (calculated from recovery time and success rate)
# Experiment auto-cleans injected faults. Verify cleanup:
kubectl get chaosengines -n staging
# Should show: No resources found
# Archive experiment data
chaos-gardener report archive checkout-resilience-001 \
--s3-bucket chaos-reports --retention 90d
Production experiments require the --approval-required flag and 48-hour notice to stakeholders.
Use --percentage to limit affected pods; never target payment/auth services without an explicit waiver.
Keep CHAOS_GARDENER_AUTO_ABORT_ON_CRITICAL_ALERT=true to ensure an immediate stop if SLA breaches exceed 5%.
Every experiment should include the --verify flag specifying success metrics.
Always set a --duration (max 1 hour for production).
Scenario: Test deployment recovery from random pod deletions
chaos-gardener inject pod-kill \
--namespace checkout \
--label-selector "app=checkout-api" \
--percentage 30 \
--duration 10m \
--interval 60s
Expected Output:
✓ Injection prepared: Target 6/20 pods (30%)
✓ Chaos experiment ID: podkill-20240315-001
→ Injecting pod-kill every 60s for 600s total
Monitoring: `chaos-gardener experiment status podkill-20240315-001`
Verification:
chaos-gardener verify recovery \
--service checkout-api \
--success-rate 99.0 \
--window 5m
Actual Result: ✓ Recovery verified: 99.3% success rate, max latency 2.1s
Scenario: Simulate network degradation between checkout and inventory services
chaos-gardener inject network-latency \
--target inventory-service \
--latency-ms 500 \
--jitter 100 \
--duration 5m
Expected Output:
✓ Iptables rules created on 3 inventory pods
✓ Experiment ID: netlat-20240315-002
→ 500ms ±100ms latency injected for 300s
Metrics dashboard: http://grafana.team/d/chaos/network-latency
Verification:
# Check that checkout service circuit breaker triggered
kubectl logs deployment/checkout -c checkout --tail=50 | grep circuit-breaker
# Expected: "Circuit breaker OPENED for inventory-service, fallback active"
chaos-gardener verify metrics \
--prometheus-url http://prometheus:9090 \
--alert "HighCheckoutLatency" \
--threshold "95th_percentile_latency_seconds > 1.0"
Actual Result: ✓ Metric verification passed: latency stayed below 1.0s threshold
Scenario File: scenarios/payment/region-failure.yaml
apiVersion: chaos-mesh.org/v1alpha1